From 767e374dced69b45db0afb30ca2ccf0bbbeef672 Mon Sep 17 00:00:00 2001 From: jfons Date: Thu, 20 May 2021 12:49:33 +0200 Subject: Upgrade Embree to the latest official release. Since Embree v3.13.0 supports AARCH64, switch back to the official repo instead of using Embree-aarch64. `thirdparty/embree/patches/godot-changes.patch` should now contain an accurate diff of the changes done to the library. --- .../common/algorithms/parallel_any_of.h | 55 - .../common/algorithms/parallel_filter.cpp | 56 - .../common/algorithms/parallel_filter.h | 93 - .../common/algorithms/parallel_for.cpp | 48 - .../common/algorithms/parallel_for.h | 229 --- .../common/algorithms/parallel_for_for.cpp | 63 - .../common/algorithms/parallel_for_for.h | 149 -- .../algorithms/parallel_for_for_prefix_sum.cpp | 85 - .../algorithms/parallel_for_for_prefix_sum.h | 112 -- .../common/algorithms/parallel_map.cpp | 47 - .../common/algorithms/parallel_map.h | 85 - .../common/algorithms/parallel_partition.cpp | 53 - .../common/algorithms/parallel_partition.h | 283 --- .../common/algorithms/parallel_prefix_sum.cpp | 48 - .../common/algorithms/parallel_prefix_sum.h | 85 - .../common/algorithms/parallel_reduce.cpp | 49 - .../common/algorithms/parallel_reduce.h | 150 -- .../common/algorithms/parallel_set.cpp | 43 - .../common/algorithms/parallel_set.h | 52 - .../common/algorithms/parallel_sort.cpp | 50 - .../common/algorithms/parallel_sort.h | 457 ----- .../embree-aarch64/common/lexers/parsestream.h | 101 -- thirdparty/embree-aarch64/common/lexers/stream.h | 215 --- .../embree-aarch64/common/lexers/streamfilters.h | 39 - .../embree-aarch64/common/lexers/stringstream.cpp | 51 - .../embree-aarch64/common/lexers/stringstream.h | 29 - .../embree-aarch64/common/lexers/tokenstream.cpp | 181 -- .../embree-aarch64/common/lexers/tokenstream.h | 164 -- thirdparty/embree-aarch64/common/math/AVX2NEON.h | 986 ----------- thirdparty/embree-aarch64/common/math/SSE2NEON.h | 1753 ------------------- .../embree-aarch64/common/math/affinespace.h | 361 ---- thirdparty/embree-aarch64/common/math/bbox.h | 331 ---- thirdparty/embree-aarch64/common/math/col3.h | 47 - thirdparty/embree-aarch64/common/math/col4.h | 47 - thirdparty/embree-aarch64/common/math/color.h | 257 --- .../embree-aarch64/common/math/constants.cpp | 61 - thirdparty/embree-aarch64/common/math/constants.h | 239 --- thirdparty/embree-aarch64/common/math/interval.h | 161 -- thirdparty/embree-aarch64/common/math/lbbox.h | 289 ---- .../embree-aarch64/common/math/linearspace2.h | 148 -- .../embree-aarch64/common/math/linearspace3.h | 213 --- thirdparty/embree-aarch64/common/math/math.h | 451 ----- thirdparty/embree-aarch64/common/math/obbox.h | 39 - thirdparty/embree-aarch64/common/math/quaternion.h | 254 --- thirdparty/embree-aarch64/common/math/range.h | 137 -- .../embree-aarch64/common/math/transcendental.h | 525 ------ thirdparty/embree-aarch64/common/math/vec2.h | 235 --- thirdparty/embree-aarch64/common/math/vec2fa.h | 317 ---- thirdparty/embree-aarch64/common/math/vec3.h | 349 ---- thirdparty/embree-aarch64/common/math/vec3ba.h | 120 -- thirdparty/embree-aarch64/common/math/vec3fa.h | 810 --------- thirdparty/embree-aarch64/common/math/vec3ia.h | 210 --- thirdparty/embree-aarch64/common/math/vec4.h | 258 --- thirdparty/embree-aarch64/common/simd/avx.h | 34 - thirdparty/embree-aarch64/common/simd/avx512.h | 41 - thirdparty/embree-aarch64/common/simd/simd.h | 110 -- thirdparty/embree-aarch64/common/simd/sse.cpp | 34 - thirdparty/embree-aarch64/common/simd/sse.h | 35 - thirdparty/embree-aarch64/common/simd/varying.h | 132 -- .../embree-aarch64/common/simd/vboold4_avx.h | 160 -- .../embree-aarch64/common/simd/vboold4_avx512.h | 140 -- .../embree-aarch64/common/simd/vboold8_avx512.h | 148 -- .../embree-aarch64/common/simd/vboolf16_avx512.h | 150 -- .../embree-aarch64/common/simd/vboolf4_avx512.h | 143 -- .../embree-aarch64/common/simd/vboolf4_sse2.h | 198 --- .../embree-aarch64/common/simd/vboolf8_avx.h | 189 -- .../embree-aarch64/common/simd/vboolf8_avx512.h | 143 -- .../embree-aarch64/common/simd/vdouble4_avx.h | 324 ---- .../embree-aarch64/common/simd/vdouble8_avx512.h | 356 ---- .../embree-aarch64/common/simd/vfloat16_avx512.h | 771 --------- .../embree-aarch64/common/simd/vfloat4_sse2.h | 925 ---------- .../embree-aarch64/common/simd/vfloat8_avx.h | 847 --------- .../embree-aarch64/common/simd/vint16_avx512.h | 490 ------ thirdparty/embree-aarch64/common/simd/vint4_sse2.h | 681 -------- thirdparty/embree-aarch64/common/simd/vint8_avx.h | 464 ----- thirdparty/embree-aarch64/common/simd/vint8_avx2.h | 512 ------ .../embree-aarch64/common/simd/vllong4_avx2.h | 358 ---- .../embree-aarch64/common/simd/vllong8_avx512.h | 381 ----- .../embree-aarch64/common/simd/vuint16_avx512.h | 443 ----- .../embree-aarch64/common/simd/vuint4_sse2.h | 499 ------ thirdparty/embree-aarch64/common/simd/vuint8_avx.h | 379 ----- .../embree-aarch64/common/simd/vuint8_avx2.h | 439 ----- thirdparty/embree-aarch64/common/sys/alloc.cpp | 327 ---- thirdparty/embree-aarch64/common/sys/alloc.h | 164 -- thirdparty/embree-aarch64/common/sys/array.h | 222 --- thirdparty/embree-aarch64/common/sys/atomic.h | 59 - thirdparty/embree-aarch64/common/sys/barrier.cpp | 289 ---- thirdparty/embree-aarch64/common/sys/barrier.h | 112 -- thirdparty/embree-aarch64/common/sys/condition.cpp | 81 - thirdparty/embree-aarch64/common/sys/condition.h | 31 - thirdparty/embree-aarch64/common/sys/filename.cpp | 138 -- thirdparty/embree-aarch64/common/sys/filename.h | 81 - thirdparty/embree-aarch64/common/sys/intrinsics.h | 559 ------ thirdparty/embree-aarch64/common/sys/library.cpp | 83 - thirdparty/embree-aarch64/common/sys/library.h | 21 - thirdparty/embree-aarch64/common/sys/mutex.cpp | 58 - thirdparty/embree-aarch64/common/sys/mutex.h | 98 -- thirdparty/embree-aarch64/common/sys/platform.h | 387 ----- thirdparty/embree-aarch64/common/sys/ref.h | 122 -- .../embree-aarch64/common/sys/regression.cpp | 30 - thirdparty/embree-aarch64/common/sys/regression.h | 25 - thirdparty/embree-aarch64/common/sys/string.cpp | 42 - thirdparty/embree-aarch64/common/sys/string.h | 37 - thirdparty/embree-aarch64/common/sys/sysinfo.cpp | 676 -------- thirdparty/embree-aarch64/common/sys/sysinfo.h | 192 --- thirdparty/embree-aarch64/common/sys/thread.cpp | 429 ----- thirdparty/embree-aarch64/common/sys/thread.h | 46 - thirdparty/embree-aarch64/common/sys/vector.h | 242 --- .../embree-aarch64/common/tasking/taskscheduler.h | 17 - .../common/tasking/taskschedulergcd.h | 49 - .../common/tasking/taskschedulerinternal.cpp | 426 ----- .../common/tasking/taskschedulerinternal.h | 386 ----- .../common/tasking/taskschedulerppl.h | 46 - .../common/tasking/taskschedulertbb.h | 67 - thirdparty/embree-aarch64/include/embree3/rtcore.h | 14 - .../embree-aarch64/include/embree3/rtcore_buffer.h | 51 - .../include/embree3/rtcore_builder.h | 125 -- .../embree-aarch64/include/embree3/rtcore_common.h | 326 ---- .../embree-aarch64/include/embree3/rtcore_config.h | 57 - .../embree-aarch64/include/embree3/rtcore_device.h | 87 - .../include/embree3/rtcore_geometry.h | 383 ----- .../include/embree3/rtcore_quaternion.h | 101 -- .../embree-aarch64/include/embree3/rtcore_ray.h | 378 ---- .../embree-aarch64/include/embree3/rtcore_scene.h | 160 -- .../kernels/builders/bvh_builder_hair.h | 411 ----- .../kernels/builders/bvh_builder_morton.h | 501 ------ .../kernels/builders/bvh_builder_msmblur.h | 692 -------- .../kernels/builders/bvh_builder_msmblur_hair.h | 526 ------ .../kernels/builders/bvh_builder_sah.h | 669 -------- .../kernels/builders/heuristic_binning.h | 972 ----------- .../builders/heuristic_binning_array_aligned.h | 205 --- .../builders/heuristic_binning_array_unaligned.h | 302 ---- .../kernels/builders/heuristic_openmerge_array.h | 443 ----- .../kernels/builders/heuristic_spatial.h | 414 ----- .../kernels/builders/heuristic_spatial_array.h | 552 ------ .../kernels/builders/heuristic_strand_array.h | 188 -- .../kernels/builders/heuristic_timesplit_array.h | 237 --- .../embree-aarch64/kernels/builders/priminfo.h | 362 ---- .../embree-aarch64/kernels/builders/primrefgen.cpp | 244 --- .../embree-aarch64/kernels/builders/primrefgen.h | 28 - .../kernels/builders/primrefgen_presplit.h | 371 ---- .../embree-aarch64/kernels/builders/splitter.h | 169 -- thirdparty/embree-aarch64/kernels/bvh/bvh.cpp | 190 --- thirdparty/embree-aarch64/kernels/bvh/bvh.h | 235 --- .../embree-aarch64/kernels/bvh/bvh4_factory.cpp | 1325 -------------- .../embree-aarch64/kernels/bvh/bvh4_factory.h | 316 ---- .../embree-aarch64/kernels/bvh/bvh8_factory.cpp | 1165 ------------- .../embree-aarch64/kernels/bvh/bvh8_factory.h | 280 --- .../embree-aarch64/kernels/bvh/bvh_builder.cpp | 60 - .../embree-aarch64/kernels/bvh/bvh_builder.h | 114 -- .../kernels/bvh/bvh_builder_morton.cpp | 531 ------ .../embree-aarch64/kernels/bvh/bvh_builder_sah.cpp | 640 ------- .../kernels/bvh/bvh_builder_sah_mb.cpp | 705 -------- .../kernels/bvh/bvh_builder_sah_spatial.cpp | 201 --- .../kernels/bvh/bvh_builder_twolevel.cpp | 377 ---- .../kernels/bvh/bvh_builder_twolevel.h | 263 --- .../kernels/bvh/bvh_builder_twolevel_internal.h | 267 --- .../embree-aarch64/kernels/bvh/bvh_collider.cpp | 375 ---- .../embree-aarch64/kernels/bvh/bvh_collider.h | 72 - .../embree-aarch64/kernels/bvh/bvh_factory.h | 21 - .../kernels/bvh/bvh_intersector1.cpp | 330 ---- .../embree-aarch64/kernels/bvh/bvh_intersector1.h | 37 - .../kernels/bvh/bvh_intersector1_bvh4.cpp | 61 - .../kernels/bvh/bvh_intersector_hybrid.h | 61 - .../kernels/bvh/bvh_intersector_stream.h | 295 ---- .../kernels/bvh/bvh_intersector_stream_filters.h | 41 - .../embree-aarch64/kernels/bvh/bvh_node_aabb.h | 213 --- .../embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h | 247 --- .../kernels/bvh/bvh_node_aabb_mb4d.h | 107 -- .../embree-aarch64/kernels/bvh/bvh_node_base.h | 43 - .../embree-aarch64/kernels/bvh/bvh_node_obb.h | 98 -- .../embree-aarch64/kernels/bvh/bvh_node_obb_mb.h | 90 - .../embree-aarch64/kernels/bvh/bvh_node_qaabb.h | 265 --- .../embree-aarch64/kernels/bvh/bvh_node_ref.h | 242 --- .../embree-aarch64/kernels/bvh/bvh_refit.cpp | 247 --- thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h | 95 -- .../embree-aarch64/kernels/bvh/bvh_rotate.cpp | 127 -- thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h | 37 - .../embree-aarch64/kernels/bvh/bvh_statistics.cpp | 168 -- .../embree-aarch64/kernels/bvh/bvh_statistics.h | 285 ---- .../embree-aarch64/kernels/bvh/bvh_traverser1.h | 676 -------- .../kernels/bvh/bvh_traverser_stream.h | 154 -- .../embree-aarch64/kernels/bvh/node_intersector.h | 31 - .../embree-aarch64/kernels/bvh/node_intersector1.h | 1788 ------------------- .../kernels/bvh/node_intersector_frustum.h | 269 --- .../kernels/bvh/node_intersector_packet.h | 843 --------- .../kernels/bvh/node_intersector_packet_stream.h | 215 --- thirdparty/embree-aarch64/kernels/common/accel.h | 556 ------ .../embree-aarch64/kernels/common/accelinstance.h | 41 - .../embree-aarch64/kernels/common/acceln.cpp | 232 --- thirdparty/embree-aarch64/kernels/common/acceln.h | 49 - .../embree-aarch64/kernels/common/accelset.cpp | 17 - .../embree-aarch64/kernels/common/accelset.h | 248 --- thirdparty/embree-aarch64/kernels/common/alloc.cpp | 82 - thirdparty/embree-aarch64/kernels/common/alloc.h | 1006 ----------- thirdparty/embree-aarch64/kernels/common/buffer.h | 263 --- thirdparty/embree-aarch64/kernels/common/builder.h | 60 - thirdparty/embree-aarch64/kernels/common/context.h | 131 -- thirdparty/embree-aarch64/kernels/common/default.h | 273 --- .../embree-aarch64/kernels/common/device.cpp | 567 ------ thirdparty/embree-aarch64/kernels/common/device.h | 85 - .../embree-aarch64/kernels/common/geometry.cpp | 259 --- .../embree-aarch64/kernels/common/geometry.h | 582 ------- thirdparty/embree-aarch64/kernels/common/hit.h | 114 -- .../embree-aarch64/kernels/common/instance_stack.h | 199 --- thirdparty/embree-aarch64/kernels/common/isa.h | 271 --- .../kernels/common/motion_derivative.h | 325 ---- .../embree-aarch64/kernels/common/point_query.h | 136 -- thirdparty/embree-aarch64/kernels/common/primref.h | 138 -- .../embree-aarch64/kernels/common/primref_mb.h | 262 --- thirdparty/embree-aarch64/kernels/common/profile.h | 159 -- thirdparty/embree-aarch64/kernels/common/ray.h | 1517 ----------------- .../embree-aarch64/kernels/common/rtcore.cpp | 1799 -------------------- thirdparty/embree-aarch64/kernels/common/rtcore.h | 142 -- .../kernels/common/rtcore_builder.cpp | 442 ----- thirdparty/embree-aarch64/kernels/common/scene.cpp | 976 ----------- thirdparty/embree-aarch64/kernels/common/scene.h | 390 ----- .../embree-aarch64/kernels/common/scene_curves.h | 341 ---- .../kernels/common/scene_grid_mesh.h | 215 --- .../embree-aarch64/kernels/common/scene_instance.h | 272 --- .../kernels/common/scene_line_segments.h | 307 ---- .../embree-aarch64/kernels/common/scene_points.h | 282 --- .../kernels/common/scene_quad_mesh.h | 277 --- .../kernels/common/scene_subdiv_mesh.h | 326 ---- .../kernels/common/scene_triangle_mesh.cpp | 243 --- .../kernels/common/scene_triangle_mesh.h | 264 --- .../kernels/common/scene_user_geometry.h | 77 - .../embree-aarch64/kernels/common/stack_item.h | 125 -- thirdparty/embree-aarch64/kernels/common/stat.cpp | 128 -- thirdparty/embree-aarch64/kernels/common/stat.h | 116 -- thirdparty/embree-aarch64/kernels/common/state.cpp | 543 ------ thirdparty/embree-aarch64/kernels/common/state.h | 197 --- thirdparty/embree-aarch64/kernels/common/vector.h | 76 - thirdparty/embree-aarch64/kernels/config.h | 76 - thirdparty/embree-aarch64/kernels/geometry/cone.h | 321 ---- .../kernels/geometry/coneline_intersector.h | 209 --- .../kernels/geometry/conelinei_intersector.h | 141 -- .../embree-aarch64/kernels/geometry/curveNi.h | 222 --- .../kernels/geometry/curveNi_intersector.h | 569 ------- .../embree-aarch64/kernels/geometry/curveNi_mb.h | 278 --- .../kernels/geometry/curveNi_mb_intersector.h | 516 ------ .../embree-aarch64/kernels/geometry/curveNv.h | 101 -- .../kernels/geometry/curveNv_intersector.h | 181 -- .../kernels/geometry/curve_intersector.h | 98 -- .../kernels/geometry/curve_intersector_distance.h | 129 -- .../kernels/geometry/curve_intersector_oriented.h | 417 ----- .../geometry/curve_intersector_precalculations.h | 49 - .../kernels/geometry/curve_intersector_ribbon.h | 214 --- .../kernels/geometry/curve_intersector_sweep.h | 362 ---- .../kernels/geometry/curve_intersector_virtual.h | 671 -------- .../curve_intersector_virtual_bezier_curve.h | 21 - .../curve_intersector_virtual_bspline_curve.h | 21 - .../curve_intersector_virtual_catmullrom_curve.h | 21 - .../curve_intersector_virtual_hermite_curve.h | 21 - .../curve_intersector_virtual_linear_curve.h | 21 - .../geometry/curve_intersector_virtual_point.h | 22 - .../embree-aarch64/kernels/geometry/cylinder.h | 223 --- .../kernels/geometry/disc_intersector.h | 216 --- .../kernels/geometry/disci_intersector.h | 277 --- .../embree-aarch64/kernels/geometry/filter.h | 204 --- .../kernels/geometry/grid_intersector.h | 99 -- .../embree-aarch64/kernels/geometry/grid_soa.h | 275 --- .../kernels/geometry/grid_soa_intersector1.h | 207 --- .../kernels/geometry/grid_soa_intersector_packet.h | 445 ----- .../embree-aarch64/kernels/geometry/instance.h | 78 - .../kernels/geometry/instance_intersector.h | 84 - .../kernels/geometry/intersector_epilog.h | 1074 ------------ .../kernels/geometry/intersector_iterators.h | 172 -- .../kernels/geometry/line_intersector.h | 141 -- thirdparty/embree-aarch64/kernels/geometry/linei.h | 709 -------- .../kernels/geometry/linei_intersector.h | 124 -- .../embree-aarch64/kernels/geometry/object.h | 84 - .../kernels/geometry/object_intersector.h | 127 -- thirdparty/embree-aarch64/kernels/geometry/plane.h | 57 - .../embree-aarch64/kernels/geometry/pointi.h | 417 ----- .../embree-aarch64/kernels/geometry/primitive.h | 49 - .../embree-aarch64/kernels/geometry/primitive4.cpp | 379 ----- .../kernels/geometry/quad_intersector.h | 76 - .../kernels/geometry/quad_intersector_moeller.h | 566 ------ .../kernels/geometry/quad_intersector_pluecker.h | 529 ------ thirdparty/embree-aarch64/kernels/geometry/quadi.h | 483 ------ .../kernels/geometry/quadi_intersector.h | 350 ---- thirdparty/embree-aarch64/kernels/geometry/quadv.h | 165 -- .../kernels/geometry/quadv_intersector.h | 181 -- .../kernels/geometry/roundline_intersector.h | 710 -------- .../kernels/geometry/roundlinei_intersector.h | 136 -- .../kernels/geometry/sphere_intersector.h | 183 -- .../kernels/geometry/spherei_intersector.h | 156 -- .../embree-aarch64/kernels/geometry/subdivpatch1.h | 38 - .../kernels/geometry/subdivpatch1_intersector.h | 237 --- .../embree-aarch64/kernels/geometry/subgrid.h | 517 ------ .../kernels/geometry/subgrid_intersector.h | 518 ------ .../kernels/geometry/subgrid_intersector_moeller.h | 493 ------ .../geometry/subgrid_intersector_pluecker.h | 508 ------ .../kernels/geometry/subgrid_mb_intersector.h | 236 --- .../embree-aarch64/kernels/geometry/triangle.h | 162 -- .../kernels/geometry/triangle_intersector.h | 96 -- .../geometry/triangle_intersector_moeller.h | 403 ----- .../geometry/triangle_intersector_pluecker.h | 247 --- .../kernels/geometry/triangle_intersector_woop.h | 418 ----- .../geometry/triangle_triangle_intersector.h | 132 -- .../embree-aarch64/kernels/geometry/trianglei.h | 442 ----- .../kernels/geometry/trianglei_intersector.h | 336 ---- .../embree-aarch64/kernels/geometry/trianglev.h | 157 -- .../kernels/geometry/trianglev_intersector.h | 206 --- .../embree-aarch64/kernels/geometry/trianglev_mb.h | 201 --- .../kernels/geometry/trianglev_mb_intersector.h | 211 --- thirdparty/embree-aarch64/kernels/hash.h | 5 - .../embree-aarch64/kernels/subdiv/bezier_curve.h | 669 -------- .../embree-aarch64/kernels/subdiv/bezier_patch.h | 372 ---- .../embree-aarch64/kernels/subdiv/bilinear_patch.h | 191 --- .../embree-aarch64/kernels/subdiv/bspline_curve.h | 319 ---- .../embree-aarch64/kernels/subdiv/bspline_patch.h | 449 ----- .../kernels/subdiv/catmullclark_coefficients.h | 85 - .../kernels/subdiv/catmullclark_patch.h | 562 ------ .../kernels/subdiv/catmullclark_ring.h | 826 --------- .../kernels/subdiv/catmullrom_curve.h | 296 ---- .../kernels/subdiv/feature_adaptive_eval.h | 226 --- .../kernels/subdiv/feature_adaptive_eval_grid.h | 359 ---- .../kernels/subdiv/feature_adaptive_eval_simd.h | 186 -- .../embree-aarch64/kernels/subdiv/gregory_patch.h | 893 ---------- .../kernels/subdiv/gregory_patch_dense.h | 113 -- .../embree-aarch64/kernels/subdiv/gridrange.h | 96 -- .../embree-aarch64/kernels/subdiv/half_edge.h | 371 ---- .../embree-aarch64/kernels/subdiv/hermite_curve.h | 38 - .../kernels/subdiv/linear_bezier_patch.h | 403 ----- thirdparty/embree-aarch64/kernels/subdiv/patch.h | 371 ---- .../embree-aarch64/kernels/subdiv/patch_eval.h | 129 -- .../kernels/subdiv/patch_eval_grid.h | 245 --- .../kernels/subdiv/patch_eval_simd.h | 127 -- .../kernels/subdiv/subdivpatch1base.h | 156 -- .../embree-aarch64/kernels/subdiv/tessellation.h | 161 -- .../kernels/subdiv/tessellation_cache.h | 325 ---- .../embree-aarch64/patches/godot-changes.patch | 630 ------- 334 files changed, 92836 deletions(-) delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_filter.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_map.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_partition.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_set.h delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp delete mode 100644 thirdparty/embree-aarch64/common/algorithms/parallel_sort.h delete mode 100644 thirdparty/embree-aarch64/common/lexers/parsestream.h delete mode 100644 thirdparty/embree-aarch64/common/lexers/stream.h delete mode 100644 thirdparty/embree-aarch64/common/lexers/streamfilters.h delete mode 100644 thirdparty/embree-aarch64/common/lexers/stringstream.cpp delete mode 100644 thirdparty/embree-aarch64/common/lexers/stringstream.h delete mode 100644 thirdparty/embree-aarch64/common/lexers/tokenstream.cpp delete mode 100644 thirdparty/embree-aarch64/common/lexers/tokenstream.h delete mode 100644 thirdparty/embree-aarch64/common/math/AVX2NEON.h delete mode 100644 thirdparty/embree-aarch64/common/math/SSE2NEON.h delete mode 100644 thirdparty/embree-aarch64/common/math/affinespace.h delete mode 100644 thirdparty/embree-aarch64/common/math/bbox.h delete mode 100644 thirdparty/embree-aarch64/common/math/col3.h delete mode 100644 thirdparty/embree-aarch64/common/math/col4.h delete mode 100644 thirdparty/embree-aarch64/common/math/color.h delete mode 100644 thirdparty/embree-aarch64/common/math/constants.cpp delete mode 100644 thirdparty/embree-aarch64/common/math/constants.h delete mode 100644 thirdparty/embree-aarch64/common/math/interval.h delete mode 100644 thirdparty/embree-aarch64/common/math/lbbox.h delete mode 100644 thirdparty/embree-aarch64/common/math/linearspace2.h delete mode 100644 thirdparty/embree-aarch64/common/math/linearspace3.h delete mode 100644 thirdparty/embree-aarch64/common/math/math.h delete mode 100644 thirdparty/embree-aarch64/common/math/obbox.h delete mode 100644 thirdparty/embree-aarch64/common/math/quaternion.h delete mode 100644 thirdparty/embree-aarch64/common/math/range.h delete mode 100644 thirdparty/embree-aarch64/common/math/transcendental.h delete mode 100644 thirdparty/embree-aarch64/common/math/vec2.h delete mode 100644 thirdparty/embree-aarch64/common/math/vec2fa.h delete mode 100644 thirdparty/embree-aarch64/common/math/vec3.h delete mode 100644 thirdparty/embree-aarch64/common/math/vec3ba.h delete mode 100644 thirdparty/embree-aarch64/common/math/vec3fa.h delete mode 100644 thirdparty/embree-aarch64/common/math/vec3ia.h delete mode 100644 thirdparty/embree-aarch64/common/math/vec4.h delete mode 100644 thirdparty/embree-aarch64/common/simd/avx.h delete mode 100644 thirdparty/embree-aarch64/common/simd/avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/simd.h delete mode 100644 thirdparty/embree-aarch64/common/simd/sse.cpp delete mode 100644 thirdparty/embree-aarch64/common/simd/sse.h delete mode 100644 thirdparty/embree-aarch64/common/simd/varying.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vboold4_avx.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vboold4_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vboold8_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vboolf8_avx.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vdouble4_avx.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vfloat8_avx.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vint16_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vint4_sse2.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vint8_avx.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vint8_avx2.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vllong4_avx2.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vllong8_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vuint16_avx512.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vuint4_sse2.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vuint8_avx.h delete mode 100644 thirdparty/embree-aarch64/common/simd/vuint8_avx2.h delete mode 100644 thirdparty/embree-aarch64/common/sys/alloc.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/alloc.h delete mode 100644 thirdparty/embree-aarch64/common/sys/array.h delete mode 100644 thirdparty/embree-aarch64/common/sys/atomic.h delete mode 100644 thirdparty/embree-aarch64/common/sys/barrier.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/barrier.h delete mode 100644 thirdparty/embree-aarch64/common/sys/condition.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/condition.h delete mode 100644 thirdparty/embree-aarch64/common/sys/filename.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/filename.h delete mode 100644 thirdparty/embree-aarch64/common/sys/intrinsics.h delete mode 100644 thirdparty/embree-aarch64/common/sys/library.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/library.h delete mode 100644 thirdparty/embree-aarch64/common/sys/mutex.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/mutex.h delete mode 100644 thirdparty/embree-aarch64/common/sys/platform.h delete mode 100644 thirdparty/embree-aarch64/common/sys/ref.h delete mode 100644 thirdparty/embree-aarch64/common/sys/regression.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/regression.h delete mode 100644 thirdparty/embree-aarch64/common/sys/string.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/string.h delete mode 100644 thirdparty/embree-aarch64/common/sys/sysinfo.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/sysinfo.h delete mode 100644 thirdparty/embree-aarch64/common/sys/thread.cpp delete mode 100644 thirdparty/embree-aarch64/common/sys/thread.h delete mode 100644 thirdparty/embree-aarch64/common/sys/vector.h delete mode 100644 thirdparty/embree-aarch64/common/tasking/taskscheduler.h delete mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h delete mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp delete mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h delete mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h delete mode 100644 thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore_builder.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore_common.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore_config.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore_device.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore_ray.h delete mode 100644 thirdparty/embree-aarch64/include/embree3/rtcore_scene.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/priminfo.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/builders/primrefgen.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h delete mode 100644 thirdparty/embree-aarch64/kernels/builders/splitter.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/node_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h delete mode 100644 thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/accel.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/accelinstance.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/acceln.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/acceln.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/accelset.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/accelset.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/alloc.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/alloc.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/buffer.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/builder.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/context.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/default.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/device.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/device.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/geometry.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/geometry.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/hit.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/instance_stack.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/isa.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/motion_derivative.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/point_query.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/primref.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/primref_mb.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/profile.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/ray.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/rtcore.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/rtcore.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_curves.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_instance.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_line_segments.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_points.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/stack_item.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/stat.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/stat.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/state.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/common/state.h delete mode 100644 thirdparty/embree-aarch64/kernels/common/vector.h delete mode 100644 thirdparty/embree-aarch64/kernels/config.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/cone.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curveNi.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curveNv.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/cylinder.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/filter.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/grid_soa.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/instance.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/line_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/linei.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/object.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/object_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/plane.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/pointi.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/primitive.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/quadi.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/quadv.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/subgrid.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/triangle.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/trianglei.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/trianglev.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h delete mode 100644 thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h delete mode 100644 thirdparty/embree-aarch64/kernels/hash.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/gridrange.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/half_edge.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/patch.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/tessellation.h delete mode 100644 thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h delete mode 100644 thirdparty/embree-aarch64/patches/godot-changes.patch (limited to 'thirdparty/embree-aarch64') diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h deleted file mode 100644 index 01f1f80f6c..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include "parallel_reduce.h" - -namespace embree -{ - - template - __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred) - { - bool ret = false; - -#if defined(TASKING_TBB) -#if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(tbb::blocked_range{first, last}, [&ret,pred,&context](const tbb::blocked_range& r) { - if (context.is_group_execution_cancelled()) return; - for (size_t i = r.begin(); i != r.end(); ++i) { - if (pred(i)) { - ret = true; - context.cancel_group_execution(); - } - } - }); -#else - tbb::parallel_for(tbb::blocked_range{first, last}, [&ret,pred](const tbb::blocked_range& r) { - if (tbb::task::self().is_cancelled()) return; - for (size_t i = r.begin(); i != r.end(); ++i) { - if (pred(i)) { - ret = true; - tbb::task::self().cancel_group_execution(); - } - } - }); -#endif -#else - ret = parallel_reduce (first, last, false, [pred](const range& r)->bool { - bool localret = false; - for (auto i=r.begin(); i() - ); -#endif - - return ret; - } - -} // end namespace diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp deleted file mode 100644 index acddc0ff81..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_filter.h" -#include "../sys/regression.h" -#include - -namespace embree -{ - struct parallel_filter_regression_test : public RegressionTest - { - parallel_filter_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; }; - - for (size_t N=10; N<1000000; N=size_t(2.1*N)) - { - size_t N0 = rand() % N; - - /* initialize array with random numbers */ - std::vector src(N); - std::map m; - for (size_t i=0; i - inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate) - { - Index j = first; - for (Index i=first; i - inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate) - { - /* sequential fallback */ - if (end-begin <= minStepSize) - return sequential_filter(data,begin,end,predicate); - - /* calculate number of tasks to use */ - enum { MAX_TASKS = 64 }; - const Index numThreads = TaskScheduler::threadCount(); - const Index numBlocks = (end-begin+minStepSize-1)/minStepSize; - const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS); - - /* filter blocks */ - Index nused[MAX_TASKS]; - Index nfree[MAX_TASKS]; - parallel_for(taskCount, [&](const Index taskIndex) - { - const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount; - const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount; - const Index i2 = sequential_filter(data,i0,i1,predicate); - nused[taskIndex] = i2-i0; - nfree[taskIndex] = i1-i2; - }); - - /* calculate offsets */ - Index sused=0; - Index sfree=0; - Index pfree[MAX_TASKS]; - for (Index i=0; i0; i--) - { - if (k0 > r1) break; - Index k1 = k0+nused[i]; - Index src = begin+(i+0)*(end-begin)/taskCount+nused[i]; - for (Index i=max(r0,k0); i= begin && dst < end); - assert(isrc >= begin && isrc < end); - data[dst++] = data[isrc]; - } - k0 = k1; - } - }); - - return begin+sused; - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp deleted file mode 100644 index ef070ebc4d..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_for.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_for_regression_test : public RegressionTest - { - parallel_for_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - const size_t M = 10; - for (size_t N=10; N<10000000; N=size_t(2.1*N)) - { - /* sequentially calculate sum of squares */ - size_t sum0 = 0; - for (size_t i=0; i sum1(0); - parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range& r) - { - size_t s = 0; - for (size_t i=r.begin(); i -#include -#include -#endif - -namespace embree -{ - /* parallel_for without range */ - template - __forceinline void parallel_for( const Index N, const Func& func) - { -#if defined(TASKING_INTERNAL) - if (N) { - TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range& r) { - assert(r.size() == 1); - func(r.begin()); - }); - if (!TaskScheduler::wait()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - } -#elif defined(TASKING_GCD) && defined(BUILD_IOS) - - const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1; - const size_t length = N; - const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks; - const size_t numBlocks = (length + blockSize-1) / blockSize; - - dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { - - const size_t start = (currentBlock * blockSize); - const size_t blockLength = std::min(length - start, blockSize); - const size_t end = start + blockLength; - - for(size_t i=start; i < end; i++) - { - func(i); - } - }); - -#elif defined(TASKING_TBB) - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - }); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - -#elif defined(TASKING_PPL) - concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - }); -#else -# error "no tasking system enabled" -#endif - } - - /* parallel for with range and granulatity */ - template - __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func) - { - assert(first <= last); -#if defined(TASKING_INTERNAL) - TaskScheduler::spawn(first,last,minStepSize,func); - if (!TaskScheduler::wait()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - -#elif defined(TASKING_GCD) && defined(BUILD_IOS) - - const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1; - const size_t length = last - first; - const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks; - size_t blockSize = std::max(minStepSize,blockSizeByThreads); - blockSize += blockSize % 4; - - const size_t numBlocks = (length + blockSize-1) / blockSize; - - dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { - - const size_t start = first + (currentBlock * blockSize); - const size_t end = std::min(last, start + blockSize); - - func( embree::range(start,end) ); - }); - - -#elif defined(TASKING_TBB) - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { - func(range(r.begin(),r.end())); - },context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { - func(range(r.begin(),r.end())); - }); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - -#elif defined(TASKING_PPL) - concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { - func(range(i,i+1)); - }); - -#else -# error "no tasking system enabled" -#endif - } - - /* parallel for with range */ - template - __forceinline void parallel_for( const Index first, const Index last, const Func& func) - { - assert(first <= last); - parallel_for(first,last,(Index)1,func); - } - -#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001) - - template - __forceinline void parallel_for_static( const Index N, const Func& func) - { - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },tbb::simple_partitioner(),context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },tbb::simple_partitioner()); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - } - - typedef tbb::affinity_partitioner affinity_partitioner; - - template - __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap) - { - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },ap,context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },ap); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - } - -#else - - template - __forceinline void parallel_for_static( const Index N, const Func& func) - { - parallel_for(N,func); - } - - struct affinity_partitioner { - }; - - template - __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) - { - parallel_for(N,func); - } - -#endif -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp deleted file mode 100644 index 0337611b35..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_for_for.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_for_for_regression_test : public RegressionTest - { - parallel_for_for_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create vector with random numbers */ - size_t sum0 = 0; - size_t K = 0; - const size_t M = 1000; - std::vector* > array2(M); - for (size_t i=0; i(N); - for (size_t j=0; j> verify_k(K); - for (size_t i=0; i sum1(0); - parallel_for_for( array2, size_t(1), [&](std::vector* v, const range& r, size_t k) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i - __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) - { - size_t k=0; - for (size_t i=0; i!=array2.size(); ++i) { - const size_t N = array2[i]->size(); - if (N) func(array2[i],range(0,N),k); - k+=N; - } - } - - class ParallelForForState - { - public: - - enum { MAX_TASKS = 64 }; - - __forceinline ParallelForForState () - : taskCount(0) {} - - template - __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { - init(array2,minStepSize); - } - - template - __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) - { - /* first calculate total number of elements */ - size_t N = 0; - for (size_t i=0; isize() : 0; - } - this->N = N; - - /* calculate number of tasks to use */ - const size_t numThreads = TaskScheduler::threadCount(); - const size_t numBlocks = (N+minStepSize-1)/minStepSize; - taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); - - /* calculate start (i,j) for each task */ - size_t taskIndex = 0; - i0[taskIndex] = 0; - j0[taskIndex] = 0; - size_t k0 = (++taskIndex)*N/taskCount; - for (size_t i=0, k=0; taskIndex < taskCount; i++) - { - assert(isize() : 0; - while (j= k0 && taskIndex < taskCount) { - assert(taskIndex - __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) - { - ParallelForForState state(array2,minStepSize); - - parallel_for(state.taskCount, [&](const size_t taskIndex) - { - /* calculate range */ - const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; - const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - for (size_t i=i0; ksize() : 0; - const size_t r0 = j0, r1 = min(N,r0+k1-k); - if (r1 > r0) func(array2[i],range(r0,r1),k); - k+=r1-r0; j0 = 0; - } - }); - } - - template - __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) - { - parallel_for_for(array2,1,func); - } - - template - __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { - ParallelForForState state(array2,minStepSize); - Value temp[ParallelForForState::MAX_TASKS]; - - for (size_t i=0; isize() : 0; - const size_t r0 = j0, r1 = min(N,r0+k1-k); - if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range(r0,r1),k)); - k+=r1-r0; j0 = 0; - } - }); - - Value ret = identity; - for (size_t i=0; i - __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) - { - return parallel_for_for_reduce(array2,1,identity,func,reduction); - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp deleted file mode 100644 index 0169d8e481..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_for_for_prefix_sum.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_for_for_prefix_sum_regression_test : public RegressionTest - { - parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create vector with random numbers */ - const size_t M = 10; - std::vector> flattened; - typedef std::vector* > ArrayArray; - ArrayArray array2(M); - size_t K = 0; - for (size_t i=0; i(N); - for (size_t j=0; j> verify_k(K); - for (size_t i=0; i state(array2,size_t(1)); - - /* dry run only counts */ - size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector* v, const range& r, size_t k, size_t i) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i* v, const range& r, size_t k, size_t i, const size_t base) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i - struct ParallelForForPrefixSumState : public ParallelForForState - { - __forceinline ParallelForForPrefixSumState () {} - - template - __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) - : ParallelForForState(array2,minStepSize) {} - - ParallelPrefixSumState prefix_state; - }; - - template - __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState& state, ArrayArray& array2, Index minStepSize, - const Value& identity, const Func& func, const Reduction& reduction) - { - /* calculate number of tasks to use */ - const size_t taskCount = state.taskCount; - /* perform parallel prefix sum */ - parallel_for(taskCount, [&](const size_t taskIndex) - { - const size_t k0 = (taskIndex+0)*state.size()/taskCount; - const size_t k1 = (taskIndex+1)*state.size()/taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - Value N=identity; - for (size_t i=i0; ksize() : 0; - const size_t r0 = j0, r1 = min(size,r0+k1-k); - if (r1 > r0) N = reduction(N, func(array2[i],range((Index)r0,(Index)r1),(Index)k,(Index)i)); - k+=r1-r0; j0 = 0; - } - state.prefix_state.counts[taskIndex] = N; - }); - - /* calculate prefix sum */ - Value sum=identity; - for (size_t i=0; i - __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState& state, ArrayArray& array2, Index minStepSize, - const Value& identity, const Func& func, const Reduction& reduction) - { - /* calculate number of tasks to use */ - const size_t taskCount = state.taskCount; - /* perform parallel prefix sum */ - parallel_for(taskCount, [&](const size_t taskIndex) - { - const size_t k0 = (taskIndex+0)*state.size()/taskCount; - const size_t k1 = (taskIndex+1)*state.size()/taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - Value N=identity; - for (size_t i=i0; ksize() : 0; - const size_t r0 = j0, r1 = min(size,r0+k1-k); - if (r1 > r0) N = reduction(N, func(array2[i],range((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); - k+=r1-r0; j0 = 0; - } - state.prefix_state.counts[taskIndex] = N; - }); - - /* calculate prefix sum */ - Value sum=identity; - for (size_t i=0; i - __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState& state, ArrayArray& array2, - const Value& identity, const Func& func, const Reduction& reduction) - { - return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); - } - - template - __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState& state, ArrayArray& array2, - const Value& identity, const Func& func, const Reduction& reduction) - { - return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp deleted file mode 100644 index 09dc303f81..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_map.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_map_regression_test : public RegressionTest - { - parallel_map_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create key/value vectors with random numbers */ - const size_t N = 10000; - std::vector keys(N); - std::vector vals(N); - for (size_t i=0; i map; - map.init(keys,vals); - - /* check that all keys are properly mapped */ - for (size_t i=0; i - class parallel_map - { - /* key/value pair to build the map */ - struct KeyValue - { - __forceinline KeyValue () {} - - __forceinline KeyValue (const Key key, const Val val) - : key(key), val(val) {} - - __forceinline operator Key() const { - return key; - } - - public: - Key key; - Val val; - }; - - public: - - /*! parallel map constructors */ - parallel_map () {} - - /*! construction from pair of vectors */ - template - parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); } - - /*! initialized the parallel map from a vector with keys and values */ - template - void init(const KeyVector& keys, const ValVector& values) - { - /* reserve sufficient space for all data */ - assert(keys.size() == values.size()); - vec.resize(keys.size()); - - /* generate key/value pairs */ - parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range& r) { - for (size_t i=r.begin(); i temp(keys.size()); - radix_sort(vec.data(),temp.data(),keys.size()); - } - - /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */ - __forceinline const Val* lookup(const Key& key) const - { - typename std::vector::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); - if (i == vec.end()) return nullptr; - if (i->key != key) return nullptr; - return &i->val; - } - - /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */ - __forceinline Val lookup(const Key& key, const Val& def) const - { - typename std::vector::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); - if (i == vec.end()) return def; - if (i->key != key) return def; - return i->val; - } - - /*! clears all state */ - void clear() { - vec.clear(); - } - - private: - std::vector vec; //!< vector containing sorted elements - }; -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp deleted file mode 100644 index eb20c4465d..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_partition.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_partition_regression_test : public RegressionTest - { - parallel_partition_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - for (size_t i=0; i<100; i++) - { - /* create random permutation */ - size_t N = std::rand() % 1000000; - std::vector array(N); - for (unsigned i=0; i= split; - } - - return passed; - } - }; - - parallel_partition_regression_test parallel_partition_regression("parallel_partition_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h deleted file mode 100644 index 3b3ad7c854..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h +++ /dev/null @@ -1,283 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" -#include "../math/range.h" - -namespace embree -{ - /* serial partitioning */ - template - __forceinline size_t serial_partitioning(T* array, - const size_t begin, - const size_t end, - V& leftReduction, - V& rightReduction, - const IsLeft& is_left, - const Reduction_T& reduction_t) - { - T* l = array + begin; - T* r = array + end - 1; - - while(1) - { - /* *l < pivot */ - while (likely(l <= r && is_left(*l) )) - { - //prefetchw(l+4); // FIXME: enable? - reduction_t(leftReduction,*l); - ++l; - } - /* *r >= pivot) */ - while (likely(l <= r && !is_left(*r))) - { - //prefetchw(r-4); FIXME: enable? - reduction_t(rightReduction,*r); - --r; - } - if (r - class __aligned(64) parallel_partition_task - { - ALIGNED_CLASS_(64); - private: - - static const size_t MAX_TASKS = 64; - - T* array; - size_t N; - const IsLeft& is_left; - const Reduction_T& reduction_t; - const Reduction_V& reduction_v; - const Vi& identity; - - size_t numTasks; - __aligned(64) size_t counter_start[MAX_TASKS+1]; - __aligned(64) size_t counter_left[MAX_TASKS+1]; - __aligned(64) range leftMisplacedRanges[MAX_TASKS]; - __aligned(64) range rightMisplacedRanges[MAX_TASKS]; - __aligned(64) V leftReductions[MAX_TASKS]; - __aligned(64) V rightReductions[MAX_TASKS]; - - public: - - __forceinline parallel_partition_task(T* array, - const size_t N, - const Vi& identity, - const IsLeft& is_left, - const Reduction_T& reduction_t, - const Reduction_V& reduction_v, - const size_t BLOCK_SIZE) - - : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), - numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {} - - __forceinline const range* findStartRange(size_t& index, const range* const r, const size_t numRanges) - { - size_t i = 0; - while(index >= (size_t)r[i].size()) - { - assert(i < numRanges); - index -= (size_t)r[i].size(); - i++; - } - return &r[i]; - } - - __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, - const size_t numRightMisplacedRanges, - const size_t startID, - const size_t endID) - { - size_t leftLocalIndex = startID; - size_t rightLocalIndex = startID; - const range* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges); - const range* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges); - - size_t l_left = l_range->size() - leftLocalIndex; - size_t r_left = r_range->size() - rightLocalIndex; - T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; - T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; - size_t size = endID - startID; - size_t items = min(size,min(l_left,r_left)); - - while (size) - { - if (unlikely(l_left == 0)) - { - l_range++; - l_left = l_range->size(); - l = &array[l_range->begin()]; - items = min(size,min(l_left,r_left)); - } - - if (unlikely(r_left == 0)) - { - r_range++; - r_left = r_range->size(); - r = &array[r_range->begin()]; - items = min(size,min(l_left,r_left)); - } - - size -= items; - l_left -= items; - r_left -= items; - - while(items) { - items--; - xchg(*l++,*r++); - } - } - } - - __forceinline size_t partition(V& leftReduction, V& rightReduction) - { - /* partition the individual ranges for each task */ - parallel_for(numTasks,[&] (const size_t taskID) { - const size_t startID = (taskID+0)*N/numTasks; - const size_t endID = (taskID+1)*N/numTasks; - V local_left(identity); - V local_right(identity); - const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); - counter_start[taskID] = startID; - counter_left [taskID] = mid-startID; - leftReductions[taskID] = local_left; - rightReductions[taskID] = local_right; - }); - counter_start[numTasks] = N; - counter_left[numTasks] = 0; - - /* finalize the reductions */ - for (size_t i=0; i globalLeft (0,mid); - const range globalRight(mid,N); - - /* calculate all left and right ranges that are on the wrong global side */ - size_t numMisplacedRangesLeft = 0; - size_t numMisplacedRangesRight = 0; - size_t numMisplacedItemsLeft = 0; - size_t numMisplacedItemsRight = 0; - - for (size_t i=0; i left_range (counter_start[i], counter_start[i] + counter_left[i]); - const range right_range(counter_start[i] + counter_left[i], counter_start[i+1]); - const range left_misplaced = globalLeft. intersect(right_range); - const range right_misplaced = globalRight.intersect(left_range); - - if (!left_misplaced.empty()) - { - numMisplacedItemsLeft += left_misplaced.size(); - leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; - } - - if (!right_misplaced.empty()) - { - numMisplacedItemsRight += right_misplaced.size(); - rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; - } - } - assert( numMisplacedItemsLeft == numMisplacedItemsRight ); - - /* if no items are misplaced we are done */ - if (numMisplacedItemsLeft == 0) - return mid; - - /* otherwise we copy the items to the right place in parallel */ - parallel_for(numTasks,[&] (const size_t taskID) { - const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; - const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; - swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID); - }); - - return mid; - } - }; - - template - __noinline size_t parallel_partitioning(T* array, - const size_t begin, - const size_t end, - const Vi &identity, - V &leftReduction, - V &rightReduction, - const IsLeft& is_left, - const Reduction_T& reduction_t, - const Reduction_V& reduction_v, - size_t BLOCK_SIZE = 128) - { - /* fall back to single threaded partitioning for small N */ - if (unlikely(end-begin < BLOCK_SIZE)) - return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); - - /* otherwise use parallel code */ - else { - typedef parallel_partition_task partition_task; - std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); - return begin+p->partition(leftReduction,rightReduction); - } - } - - template - __noinline size_t parallel_partitioning(T* array, - const size_t begin, - const size_t end, - const Vi &identity, - V &leftReduction, - V &rightReduction, - const IsLeft& is_left, - const Reduction_T& reduction_t, - const Reduction_V& reduction_v, - size_t BLOCK_SIZE, - size_t PARALLEL_THRESHOLD) - { - /* fall back to single threaded partitioning for small N */ - if (unlikely(end-begin < PARALLEL_THRESHOLD)) - return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); - - /* otherwise use parallel code */ - else { - typedef parallel_partition_task partition_task; - std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); - return begin+p->partition(leftReduction,rightReduction); - } - } - - - template - inline size_t parallel_partitioning(T* array, - const size_t begin, - const size_t end, - const IsLeft& is_left, - size_t BLOCK_SIZE = 128) - { - size_t leftReduction = 0; - size_t rightReduction = 0; - return parallel_partitioning( - array,begin,end,0,leftReduction,rightReduction,is_left, - [] (size_t& t,const T& ref) { }, - [] (size_t& t0,size_t& t1) { }, - BLOCK_SIZE); - } - -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp deleted file mode 100644 index 685952c3dc..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_prefix_sum.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_prefix_sum_regression_test : public RegressionTest - { - parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - const size_t M = 10; - - for (size_t N=10; N<10000000; N=size_t(2.1*N)) - { - /* initialize array with random numbers */ - uint32_t sum0 = 0; - std::vector src(N); - for (size_t i=0; i dst(N); - for (auto& v : dst) v = 0; - - for (size_t i=0; i()); - passed &= (sum0 == sum1); - } - - /* check if prefix sum is correct */ - for (size_t i=0, sum=0; i - struct ParallelPrefixSumState - { - enum { MAX_TASKS = 64 }; - Value counts[MAX_TASKS]; - Value sums [MAX_TASKS]; - }; - - template - __forceinline Value parallel_prefix_sum( ParallelPrefixSumState& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) - { - /* calculate number of tasks to use */ - const size_t numThreads = TaskScheduler::threadCount(); - const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; - const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState::MAX_TASKS)); - - /* perform parallel prefix sum */ - parallel_for(taskCount, [&](const size_t taskIndex) - { - const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; - const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; - state.counts[taskIndex] = func(range(i0,i1),state.sums[taskIndex]); - }); - - /* calculate prefix sum */ - Value sum=identity; - for (size_t i=0; i - __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) - { - /* perform single threaded prefix operation for small N */ - if (N < SINGLE_THREAD_THRESHOLD) - { - Value sum=identity; - for (size_t i=0; i state; - - /* initial run just sets up start values for subtasks */ - parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range& r, const Value& sum) -> Value { - - Value s = identity; - for (size_t i=r.begin(); i& r, const Value& sum) -> Value { - - Value s = identity; - for (size_t i=r.begin(); i& r) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i - __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) - { - return func(range(first,last)); - } - - template - __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { - return func(range(first,last)); - } - - template - __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { - const Index maxTasks = 512; - const Index threadCount = (Index) TaskScheduler::threadCount(); - taskCount = min(taskCount,threadCount,maxTasks); - - /* parallel invokation of all tasks */ - dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack - parallel_for(taskCount, [&](const Index taskIndex) { - const Index k0 = first+(taskIndex+0)*(last-first)/taskCount; - const Index k1 = first+(taskIndex+1)*(last-first)/taskCount; - values[taskIndex] = func(range(k0,k1)); - }); - - /* perform reduction over all tasks */ - Value v = identity; - for (Index i=0; i - __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { -#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS)) - - /* fast path for small number of iterations */ - Index taskCount = (last-first+minStepSize-1)/minStepSize; - if (likely(taskCount == 1)) { - return func(range(first,last)); - } - return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction); - -#elif defined(TASKING_TBB) - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, - [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, - reduction,context); - // -- GODOT start -- - // if (context.is_group_execution_cancelled()) - // throw std::runtime_error("task cancelled"); - // -- GODOT end -- - return v; - #else - const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, - [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, - reduction); - // -- GODOT start -- - // if (tbb::task::self().is_cancelled()) - // throw std::runtime_error("task cancelled"); - // -- GODOT end -- - return v; - #endif -#else // TASKING_PPL - struct AlignedValue - { - char storage[__alignof(Value)+sizeof(Value)]; - static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; - Value* getValuePtr() { return reinterpret_cast(alignUp(uintptr_t(storage), __alignof(Value))); } - const Value* getValuePtr() const { return reinterpret_cast(alignUp(uintptr_t(storage), __alignof(Value))); } - AlignedValue(const Value& v) { new(getValuePtr()) Value(v); } - AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); } - AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; - AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; - AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; - operator Value() const { return *getValuePtr(); } - }; - - struct Iterator_Index - { - Index v; - typedef std::forward_iterator_tag iterator_category; - typedef AlignedValue value_type; - typedef Index difference_type; - typedef Index distance_type; - typedef AlignedValue* pointer; - typedef AlignedValue& reference; - __forceinline Iterator_Index() {} - __forceinline Iterator_Index(Index v) : v(v) {} - __forceinline bool operator== (Iterator_Index other) { return v == other.v; } - __forceinline bool operator!= (Iterator_Index other) { return v != other.v; } - __forceinline Iterator_Index operator++() { return Iterator_Index(++v); } - __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); } - }; - - auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) { - assert(begin.v < end.v); - return reduction(start, func(range(begin.v, end.v))); - }; - const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction); - return v; -#endif - } - - template - __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) - { - if (likely(last-first < parallel_threshold)) { - return func(range(first,last)); - } else { - return parallel_reduce(first,last,minStepSize,identity,func,reduction); - } - } - - template - __forceinline Value parallel_reduce( const range range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) - { - return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction); - } - - template - __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) - { - auto funcr = [&] ( const range r ) { - Value v = identity; - for (Index i=r.begin(); i - __forceinline Value parallel_reduce( const range range, const Value& identity, const Func& func, const Reduction& reduction ) - { - return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction); - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp deleted file mode 100644 index 20b639c1c9..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_set.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_set_regression_test : public RegressionTest - { - parallel_set_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create vector with random numbers */ - const size_t N = 10000; - std::vector unsorted(N); - for (size_t i=0; i sorted; - sorted.init(unsorted); - - /* check that all elements are in the set */ - for (size_t i=0; i - class parallel_set - { - public: - - /*! default constructor for the parallel set */ - parallel_set () {} - - /*! construction from vector */ - template - parallel_set (const Vector& in) { init(in); } - - /*! initialized the parallel set from a vector */ - template - void init(const Vector& in) - { - /* copy data to internal vector */ - vec.resize(in.size()); - parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range& r) { - for (size_t i=r.begin(); i temp(in.size()); - radix_sort(vec.data(),temp.data(),vec.size()); - } - - /*! tests if some element is in the set */ - __forceinline bool lookup(const T& elt) const { - return std::binary_search(vec.begin(), vec.end(), elt); - } - - /*! clears all state */ - void clear() { - vec.clear(); - } - - private: - std::vector vec; //!< vector containing sorted elements - }; -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp deleted file mode 100644 index 5e7ec79ac1..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_sort.h" -#include "../sys/regression.h" - -namespace embree -{ - template - struct RadixSortRegressionTest : public RegressionTest - { - RadixSortRegressionTest(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - const size_t M = 10; - - for (size_t N=10; N<1000000; N=size_t(2.1*N)) - { - std::vector src(N); memset(src.data(),0,N*sizeof(Key)); - std::vector tmp(N); memset(tmp.data(),0,N*sizeof(Key)); - for (size_t i=0; i(src.data(),tmp.data(),N); - } - - /* calculate checksum */ - Key sum1 = 0; for (size_t i=0; i test_u32("RadixSortRegressionTestU32"); - RadixSortRegressionTest test_u64("RadixSortRegressionTestU64"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h deleted file mode 100644 index a758227c1b..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h +++ /dev/null @@ -1,457 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../simd/simd.h" -#include "parallel_for.h" -#if defined(TASKING_GCD) && defined(BUILD_IOS) -#include "../sys/alloc.h" -#endif -#include - -namespace embree -{ - template - __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) - { - for(size_t i = 1;i 0 && v < array[j-1]) - { - array[j] = array[j-1]; - --j; - } - array[j] = v; - } - } - - template - __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) - { - for(size_t i = 1;i 0 && v > array[j-1]) - { - array[j] = array[j-1]; - --j; - } - array[j] = v; - } - } - - template - void quicksort_ascending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] > pivotvalue); - while (t[++left] < pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const int pivot = right; - quicksort_ascending(t, begin, pivot); - quicksort_ascending(t, pivot + 1, end); - } - } - - template - void quicksort_decending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] < pivotvalue); - while (t[++left] > pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const int pivot = right; - quicksort_decending(t, begin, pivot); - quicksort_decending(t, pivot + 1, end); - } - } - - - template - void quicksort_insertionsort_ascending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const ssize_t size = end-begin+1; - if (likely(size <= THRESHOLD)) - { - insertionsort_ascending(&t[begin],size); - } - else - { - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] > pivotvalue); - while (t[++left] < pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const ssize_t pivot = right; - quicksort_insertionsort_ascending(t, begin, pivot); - quicksort_insertionsort_ascending(t, pivot + 1, end); - } - } - } - - - template - void quicksort_insertionsort_decending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const ssize_t size = end-begin+1; - if (likely(size <= THRESHOLD)) - { - insertionsort_decending(&t[begin],size); - } - else - { - - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] < pivotvalue); - while (t[++left] > pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const ssize_t pivot = right; - quicksort_insertionsort_decending(t, begin, pivot); - quicksort_insertionsort_decending(t, pivot + 1, end); - } - } - } - - template - static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) - { - static const unsigned int BITS = 8; - static const unsigned int BUCKETS = (1 << BITS); - static const unsigned int CMP_SORT_THRESHOLD = 16; - - __aligned(64) unsigned int count[BUCKETS]; - - /* clear buckets */ - for (size_t i=0;i> shift) & (BUCKETS-1)]++; - - /* prefix sums */ - __aligned(64) unsigned int head[BUCKETS]; - __aligned(64) unsigned int tail[BUCKETS]; - - head[0] = 0; - for (size_t i=1; i> shift) & (BUCKETS-1); - if (b == i) break; - std::swap(v,morton[head[b]++]); - } - assert((unsigned(v) >> shift & (BUCKETS-1)) == i); - morton[head[i]++] = v; - } - } - if (shift == 0) return; - - size_t offset = 0; - for (size_t i=0;i> shift) & (BUCKETS-1)) == i); - - if (unlikely(count[i] < CMP_SORT_THRESHOLD)) - insertionsort_ascending(morton + offset, count[i]); - else - radixsort32(morton + offset, count[i], shift-BITS); - - for (size_t j=offset;j - class ParallelRadixSort - { - static const size_t MAX_TASKS = 64; - static const size_t BITS = 8; - static const size_t BUCKETS = (1 << BITS); - typedef unsigned int TyRadixCount[BUCKETS]; - - template - static bool compare(const T& v0, const T& v1) { - return (Key)v0 < (Key)v1; - } - - private: - ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement - ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement - - - public: - ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) - : radixCount(nullptr), src(src), tmp(tmp), N(N) {} - - void sort(const size_t blockSize) - { - assert(blockSize > 0); - - /* perform single threaded sort for small N */ - if (N<=blockSize) // handles also special case of 0! - { - /* do inplace sort inside destination array */ - std::sort(src,src+N,compare); - } - - /* perform parallel sort for large N */ - else - { - const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS)); - tbbRadixSort(numThreads); - } - } - - ~ParallelRadixSort() - { - alignedFree(radixCount); - radixCount = nullptr; - } - - private: - - void tbbRadixIteration0(const Key shift, - const Ty* __restrict const src, - Ty* __restrict const dst, - const size_t threadIndex, const size_t threadCount) - { - const size_t startID = (threadIndex+0)*N/threadCount; - const size_t endID = (threadIndex+1)*N/threadCount; - - /* mask to extract some number of bits */ - const Key mask = BUCKETS-1; - - /* count how many items go into the buckets */ - for (size_t i=0; i> (size_t)shift) & (size_t)mask; -#else - const Key index = ((Key)src[i] >> shift) & mask; -#endif - count[index]++; - } - } - - void tbbRadixIteration1(const Key shift, - const Ty* __restrict const src, - Ty* __restrict const dst, - const size_t threadIndex, const size_t threadCount) - { - const size_t startID = (threadIndex+0)*N/threadCount; - const size_t endID = (threadIndex+1)*N/threadCount; - - /* mask to extract some number of bits */ - const Key mask = BUCKETS-1; - - /* calculate total number of items for each bucket */ - __aligned(64) unsigned int total[BUCKETS]; - /* - for (size_t i=0; i> (size_t)shift) & (size_t)mask; -#else - const size_t index = ((Key)src[i] >> shift) & mask; -#endif - dst[offset[index]++] = elt; - } - } - - void tbbRadixIteration(const Key shift, const bool last, - const Ty* __restrict src, Ty* __restrict dst, - const size_t numTasks) - { - affinity_partitioner ap; - parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap); - parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap); - } - - void tbbRadixSort(const size_t numTasks) - { - radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); - - if (sizeof(Key) == sizeof(uint32_t)) { - tbbRadixIteration(0*BITS,0,src,tmp,numTasks); - tbbRadixIteration(1*BITS,0,tmp,src,numTasks); - tbbRadixIteration(2*BITS,0,src,tmp,numTasks); - tbbRadixIteration(3*BITS,1,tmp,src,numTasks); - } - else if (sizeof(Key) == sizeof(uint64_t)) - { - tbbRadixIteration(0*BITS,0,src,tmp,numTasks); - tbbRadixIteration(1*BITS,0,tmp,src,numTasks); - tbbRadixIteration(2*BITS,0,src,tmp,numTasks); - tbbRadixIteration(3*BITS,0,tmp,src,numTasks); - tbbRadixIteration(4*BITS,0,src,tmp,numTasks); - tbbRadixIteration(5*BITS,0,tmp,src,numTasks); - tbbRadixIteration(6*BITS,0,src,tmp,numTasks); - tbbRadixIteration(7*BITS,1,tmp,src,numTasks); - } - } - - private: - TyRadixCount* radixCount; - Ty* const src; - Ty* const tmp; - const size_t N; - }; - - template - void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) - { - ParallelRadixSort(src,tmp,N).sort(blockSize); - } - - template - void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) - { - ParallelRadixSort(src,tmp,N).sort(blockSize); - } - - template - void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { - radix_sort(src,tmp,N,blockSize); - } - - template - void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { - radix_sort(src,tmp,N,blockSize); - } -} diff --git a/thirdparty/embree-aarch64/common/lexers/parsestream.h b/thirdparty/embree-aarch64/common/lexers/parsestream.h deleted file mode 100644 index db46dc114f..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/parsestream.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "stringstream.h" -#include "../sys/filename.h" -#include "../math/vec2.h" -#include "../math/vec3.h" -#include "../math/col3.h" -#include "../math/color.h" - -namespace embree -{ - /*! helper class for simple command line parsing */ - class ParseStream : public Stream - { - public: - ParseStream (const Ref >& cin) : cin(cin) {} - - ParseStream (const Ref >& cin, const std::string& seps = "\n\t\r ", - const std::string& endl = "", bool multiLine = false) - : cin(new StringStream(cin,seps,endl,multiLine)) {} - - public: - ParseLocation location() { return cin->loc(); } - std::string next() { return cin->get(); } - - void force(const std::string& next) { - std::string token = getString(); - if (token != next) - THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found"); - } - - std::string getString() { - return get(); - } - - FileName getFileName() { - return FileName(get()); - } - - int getInt () { - return atoi(get().c_str()); - } - - Vec2i getVec2i() { - int x = atoi(get().c_str()); - int y = atoi(get().c_str()); - return Vec2i(x,y); - } - - Vec3ia getVec3ia() { - int x = atoi(get().c_str()); - int y = atoi(get().c_str()); - int z = atoi(get().c_str()); - return Vec3ia(x,y,z); - } - - float getFloat() { - return (float)atof(get().c_str()); - } - - Vec2f getVec2f() { - float x = (float)atof(get().c_str()); - float y = (float)atof(get().c_str()); - return Vec2f(x,y); - } - - Vec3f getVec3f() { - float x = (float)atof(get().c_str()); - float y = (float)atof(get().c_str()); - float z = (float)atof(get().c_str()); - return Vec3f(x,y,z); - } - - Vec3fa getVec3fa() { - float x = (float)atof(get().c_str()); - float y = (float)atof(get().c_str()); - float z = (float)atof(get().c_str()); - return Vec3fa(x,y,z); - } - - Col3f getCol3f() { - float x = (float)atof(get().c_str()); - float y = (float)atof(get().c_str()); - float z = (float)atof(get().c_str()); - return Col3f(x,y,z); - } - - Color getColor() { - float r = (float)atof(get().c_str()); - float g = (float)atof(get().c_str()); - float b = (float)atof(get().c_str()); - return Color(r,g,b); - } - - private: - Ref > cin; - }; -} diff --git a/thirdparty/embree-aarch64/common/lexers/stream.h b/thirdparty/embree-aarch64/common/lexers/stream.h deleted file mode 100644 index 3f75677e68..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/stream.h +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/ref.h" -#include "../sys/filename.h" -#include "../sys/string.h" - -#include -#include -#include -#include - -namespace embree -{ - /*! stores the location of a stream element in the source */ - class ParseLocation - { - public: - ParseLocation () : lineNumber(-1), colNumber(-1) {} - ParseLocation (std::shared_ptr fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/) - : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {} - - std::string str() const - { - std::string str = "unknown"; - if (fileName) str = *fileName; - if (lineNumber >= 0) str += " line " + toString(lineNumber); - if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber); - return str; - } - - private: - std::shared_ptr fileName; /// name of the file (or stream) the token is from - ssize_t lineNumber; /// the line number the token is from - ssize_t colNumber; /// the character number in the current line - }; - - /*! a stream class templated over the stream elements */ - template class Stream : public RefCount - { - enum { BUF_SIZE = 1024 }; - - private: - virtual T next() = 0; - virtual ParseLocation location() = 0; - __forceinline std::pair nextHelper() { - ParseLocation l = location(); - T v = next(); - return std::pair(v,l); - } - __forceinline void push_back(const std::pair& v) { - if (past+future == BUF_SIZE) pop_front(); - size_t end = (start+past+future++)%BUF_SIZE; - buffer[end] = v; - } - __forceinline void pop_front() { - if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty"); - start = (start+1)%BUF_SIZE; past--; - } - public: - Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {} - virtual ~Stream() {} - - public: - - const ParseLocation& loc() { - if (future == 0) push_back(nextHelper()); - return buffer[(start+past)%BUF_SIZE].second; - } - T get() { - if (future == 0) push_back(nextHelper()); - T t = buffer[(start+past)%BUF_SIZE].first; - past++; future--; - return t; - } - const T& peek() { - if (future == 0) push_back(nextHelper()); - return buffer[(start+past)%BUF_SIZE].first; - } - const T& unget(size_t n = 1) { - if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items"); - past -= n; future += n; - return peek(); - } - void drop() { - if (future == 0) push_back(nextHelper()); - past++; future--; - } - private: - size_t start,past,future; - std::vector > buffer; - }; - - /*! warps an iostream stream */ - class StdStream : public Stream - { - public: - StdStream (std::istream& cin, const std::string& name = "std::stream") - : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr(new std::string(name))) {} - ~StdStream() {} - ParseLocation location() { - return ParseLocation(name,lineNumber,colNumber,charNumber); - } - int next() { - int c = cin.get(); - if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; - charNumber++; - return c; - } - private: - std::istream& cin; - ssize_t lineNumber; /// the line number the token is from - ssize_t colNumber; /// the character number in the current line - ssize_t charNumber; /// the character in the file - std::shared_ptr name; /// name of buffer - }; - - /*! creates a stream from a file */ - class FileStream : public Stream - { - public: - - FileStream (FILE* file, const std::string& name = "file") - : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr(new std::string(name))) {} - - FileStream (const FileName& fileName) - : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr(new std::string(fileName.str()))) - { - file = fopen(fileName.c_str(),"r"); - if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str()); - } - ~FileStream() { if (file) fclose(file); } - - public: - ParseLocation location() { - return ParseLocation(name,lineNumber,colNumber,charNumber); - } - - int next() { - int c = fgetc(file); - if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; - charNumber++; - return c; - } - - private: - FILE* file; - ssize_t lineNumber; /// the line number the token is from - ssize_t colNumber; /// the character number in the current line - ssize_t charNumber; /// the character in the file - std::shared_ptr name; /// name of buffer - }; - - /*! creates a stream from a string */ - class StrStream : public Stream - { - public: - - StrStream (const char* str) - : str(str), lineNumber(1), colNumber(0), charNumber(0) {} - - public: - ParseLocation location() { - return ParseLocation(std::shared_ptr(),lineNumber,colNumber,charNumber); - } - - int next() { - int c = str[charNumber]; - if (c == 0) return EOF; - if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; - charNumber++; - return c; - } - - private: - const char* str; - ssize_t lineNumber; /// the line number the token is from - ssize_t colNumber; /// the character number in the current line - ssize_t charNumber; /// the character in the file - }; - - /*! creates a character stream from a command line */ - class CommandLineStream : public Stream - { - public: - CommandLineStream (int argc, char** argv, const std::string& name = "command line") - : i(0), j(0), charNumber(0), name(std::shared_ptr(new std::string(name))) - { - if (argc > 0) { - for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++; - charNumber++; - } - for (ssize_t k=1; k args; - ssize_t charNumber; /// the character in the file - std::shared_ptr name; /// name of buffer - }; -} diff --git a/thirdparty/embree-aarch64/common/lexers/streamfilters.h b/thirdparty/embree-aarch64/common/lexers/streamfilters.h deleted file mode 100644 index 25580a77b8..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/streamfilters.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "stream.h" - -namespace embree -{ - /* removes all line comments from a stream */ - class LineCommentFilter : public Stream - { - public: - LineCommentFilter (const FileName& fileName, const std::string& lineComment) - : cin(new FileStream(fileName)), lineComment(lineComment) {} - LineCommentFilter (Ref > cin, const std::string& lineComment) - : cin(cin), lineComment(lineComment) {} - - ParseLocation location() { return cin->loc(); } - - int next() - { - /* look if the line comment starts here */ - for (size_t j=0; jpeek() != lineComment[j]) { cin->unget(j); goto not_found; } - cin->get(); - } - /* eat all characters until the end of the line (or file) */ - while (cin->peek() != '\n' && cin->peek() != EOF) cin->get(); - - not_found: - return cin->get(); - } - - private: - Ref > cin; - std::string lineComment; - }; -} diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp deleted file mode 100644 index 98dc80ad59..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "stringstream.h" - -namespace embree -{ - static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; - - /* creates map for fast categorization of characters */ - static void createCharMap(bool map[256], const std::string& chrs) { - for (size_t i=0; i<256; i++) map[i] = false; - for (size_t i=0; i >& cin, const std::string& seps, const std::string& endl, bool multiLine) - : cin(cin), endl(endl), multiLine(multiLine) - { - createCharMap(isSepMap,seps); - createCharMap(isValidCharMap,stringChars); - } - - std::string StringStream::next() - { - /* skip separators */ - while (cin->peek() != EOF) { - if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; } - if (multiLine && cin->peek() == '\\') { - cin->drop(); - if (cin->peek() == '\n') { cin->drop(); continue; } - cin->unget(); - } - if (!isSeparator(cin->peek())) break; - cin->drop(); - } - - /* parse everything until the next separator */ - std::vector str; str.reserve(64); - while (cin->peek() != EOF && !isSeparator(cin->peek())) { - int c = cin->get(); - // -- GODOT start -- - // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); - if (!isValidChar(c)) abort(); - // -- GODOT end -- - str.push_back((char)c); - } - str.push_back(0); - return std::string(str.data()); - } -} diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.h b/thirdparty/embree-aarch64/common/lexers/stringstream.h deleted file mode 100644 index e6dbd4aecc..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/stringstream.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "stream.h" - -namespace embree -{ - /*! simple tokenizer that produces a string stream */ - class StringStream : public Stream - { - public: - StringStream(const Ref >& cin, const std::string& seps = "\n\t\r ", - const std::string& endl = "", bool multiLine = false); - public: - ParseLocation location() { return cin->loc(); } - std::string next(); - private: - __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } - __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; } - private: - Ref > cin; /*! source character stream */ - bool isSepMap[256]; /*! map for fast classification of separators */ - bool isValidCharMap[256]; /*! map for valid characters */ - std::string endl; /*! the token of the end of line */ - bool multiLine; /*! whether to parse lines wrapped with \ */ - }; -} diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp deleted file mode 100644 index d05be65862..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "tokenstream.h" -#include "../math/math.h" - -namespace embree -{ - /* shorthands for common sets of characters */ - const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz"; - const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - const std::string TokenStream::numbers = "0123456789"; - const std::string TokenStream::separators = "\n\t\r "; - const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; - - /* creates map for fast categorization of characters */ - static void createCharMap(bool map[256], const std::string& chrs) { - for (size_t i=0; i<256; i++) map[i] = false; - for (size_t i=0; i >& cin, //< stream to read from - const std::string& alpha, //< valid characters for identifiers - const std::string& seps, //< characters that act as separators - const std::vector& symbols) //< symbols - : cin(cin), symbols(symbols) - { - createCharMap(isAlphaMap,alpha); - createCharMap(isSepMap,seps); - createCharMap(isStringCharMap,stringChars); - } - - bool TokenStream::decDigits(std::string& str_o) - { - bool ok = false; - std::string str; - if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get(); - while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } - if (ok) str_o += str; - else cin->unget(str.size()); - return ok; - } - - bool TokenStream::decDigits1(std::string& str_o) - { - bool ok = false; - std::string str; - while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } - if (ok) str_o += str; else cin->unget(str.size()); - return ok; - } - - bool TokenStream::trySymbol(const std::string& symbol) - { - size_t pos = 0; - while (pos < symbol.size()) { - if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; } - cin->drop(); pos++; - } - return true; - } - - bool TokenStream::trySymbols(Token& token, const ParseLocation& loc) - { - for (size_t i=0; ipeek() == '.') { - str += (char)cin->get(); - decDigits(str); - if (cin->peek() == 'e' || cin->peek() == 'E') { - str += (char)cin->get(); - if (decDigits(str)) ok = true; // 1.[2]E2 - } - else ok = true; // 1.[2] - } - else if (cin->peek() == 'e' || cin->peek() == 'E') { - str += (char)cin->get(); - if (decDigits(str)) ok = true; // 1E2 - } - } - else - { - if (cin->peek() == '.') { - str += (char)cin->get(); - if (decDigits(str)) { - if (cin->peek() == 'e' || cin->peek() == 'E') { - str += (char)cin->get(); - if (decDigits(str)) ok = true; // .3E2 - } - else ok = true; // .3 - } - } - } - if (ok) { - token = Token((float)atof(str.c_str()),loc); - } - else cin->unget(str.size()); - return ok; - } - - bool TokenStream::tryInt(Token& token, const ParseLocation& loc) { - std::string str; - if (decDigits(str)) { - token = Token(atoi(str.c_str()),loc); - return true; - } - return false; - } - - bool TokenStream::tryString(Token& token, const ParseLocation& loc) - { - std::string str; - if (cin->peek() != '\"') return false; - cin->drop(); - while (cin->peek() != '\"') { - const int c = cin->get(); - if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str()); - str += (char)c; - } - cin->drop(); - token = Token(str,Token::TY_STRING,loc); - return true; - } - - bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc) - { - std::string str; - if (!isAlpha(cin->peek())) return false; - str += (char)cin->get(); - while (isAlphaNum(cin->peek())) str += (char)cin->get(); - token = Token(str,Token::TY_IDENTIFIER,loc); - return true; - } - - void TokenStream::skipSeparators() - { - /* skip separators */ - while (cin->peek() != EOF && isSeparator(cin->peek())) - cin->drop(); - } - - Token TokenStream::next() - { - Token token; - skipSeparators(); - ParseLocation loc = cin->loc(); - if (trySymbols (token,loc)) return token; /**< try to parse a symbol */ - if (tryFloat (token,loc)) return token; /**< try to parse float */ - if (tryInt (token,loc)) return token; /**< try to parse integer */ - if (tryString (token,loc)) return token; /**< try to parse string */ - if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */ - if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */ - return Token((char)cin->get(),loc); /**< return invalid character token */ - } -} diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.h b/thirdparty/embree-aarch64/common/lexers/tokenstream.h deleted file mode 100644 index 72a7b4f2f3..0000000000 --- a/thirdparty/embree-aarch64/common/lexers/tokenstream.h +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "stream.h" -#include -#include - -namespace embree -{ - /*! token class */ - class Token - { - public: - - enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL }; - - Token ( const ParseLocation& loc = ParseLocation()) : ty(TY_EOF ), loc(loc) {} - Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {} - Token (int i, const ParseLocation& loc = ParseLocation()) : ty(TY_INT ), i(i), loc(loc) {} - Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {} - Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty), str(str), loc(loc) {} - - static Token Eof() { return Token(); } - static Token Sym(std::string str) { return Token(str,TY_SYMBOL); } - static Token Str(std::string str) { return Token(str,TY_STRING); } - static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); } - - char Char() const { - if (ty == TY_CHAR) return c; - THROW_RUNTIME_ERROR(loc.str()+": character expected"); - } - - int Int() const { - if (ty == TY_INT) return i; - THROW_RUNTIME_ERROR(loc.str()+": integer expected"); - } - - float Float(bool cast = true) const { - if (ty == TY_FLOAT) return f; - if (ty == TY_INT && cast) return (float)i; - THROW_RUNTIME_ERROR(loc.str()+": float expected"); - } - - std::string Identifier() const { - if (ty == TY_IDENTIFIER) return str; - THROW_RUNTIME_ERROR(loc.str()+": identifier expected"); - } - - std::string String() const { - if (ty == TY_STRING) return str; - THROW_RUNTIME_ERROR(loc.str()+": string expected"); - } - - std::string Symbol() const { - if (ty == TY_SYMBOL) return str; - THROW_RUNTIME_ERROR(loc.str()+": symbol expected"); - } - - const ParseLocation& Location() const { return loc; } - - friend bool operator==(const Token& a, const Token& b) - { - if (a.ty != b.ty) return false; - if (a.ty == TY_CHAR) return a.c == b.c; - if (a.ty == TY_INT) return a.i == b.i; - if (a.ty == TY_FLOAT) return a.f == b.f; - if (a.ty == TY_IDENTIFIER) return a.str == b.str; - if (a.ty == TY_STRING) return a.str == b.str; - if (a.ty == TY_SYMBOL) return a.str == b.str; - return true; - } - - friend bool operator!=(const Token& a, const Token& b) { - return !(a == b); - } - - friend bool operator <( const Token& a, const Token& b ) { - if (a.ty != b.ty) return (int)a.ty < (int)b.ty; - if (a.ty == TY_CHAR) return a.c < b.c; - if (a.ty == TY_INT) return a.i < b.i; - if (a.ty == TY_FLOAT) return a.f < b.f; - if (a.ty == TY_IDENTIFIER) return a.str < b.str; - if (a.ty == TY_STRING) return a.str < b.str; - if (a.ty == TY_SYMBOL) return a.str < b.str; - return false; - } - - friend std::ostream& operator<<(std::ostream& cout, const Token& t) - { - if (t.ty == TY_EOF) return cout << "eof"; - if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")"; - if (t.ty == TY_INT) return cout << "Int(" << t.i << ")"; - if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")"; - if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")"; - if (t.ty == TY_STRING) return cout << "String(" << t.str << ")"; - if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")"; - return cout << "unknown"; - } - - private: - Type ty; //< the type of the token - union { - char c; //< data for char tokens - int i; //< data for int tokens - float f; //< data for float tokens - }; - std::string str; //< data for string and identifier tokens - ParseLocation loc; //< the location the token is from - }; - - /*! build full tokenizer that takes list of valid characters and keywords */ - class TokenStream : public Stream - { - public: - - /*! shorthands for common sets of characters */ - static const std::string alpha; - static const std::string ALPHA; - static const std::string numbers; - static const std::string separators; - static const std::string stringChars; - - public: - TokenStream(const Ref >& cin, - const std::string& alpha, //< valid characters for identifiers - const std::string& seps, //< characters that act as separators - const std::vector& symbols = std::vector()); //< symbols - public: - ParseLocation location() { return cin->loc(); } - Token next(); - bool trySymbol(const std::string& symbol); - - private: - void skipSeparators(); - bool decDigits(std::string& str); - bool decDigits1(std::string& str); - bool trySymbols(Token& token, const ParseLocation& loc); - bool tryFloat(Token& token, const ParseLocation& loc); - bool tryInt(Token& token, const ParseLocation& loc); - bool tryString(Token& token, const ParseLocation& loc); - bool tryIdentifier(Token& token, const ParseLocation& loc); - - Ref > cin; - bool isSepMap[256]; - bool isAlphaMap[256]; - bool isStringCharMap[256]; - std::vector symbols; - - /*! checks if a character is a separator */ - __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } - - /*! checks if a character is a number */ - __forceinline bool isDigit(unsigned int c) const { return c >= '0' && c <= '9'; } - - /*! checks if a character is valid inside a string */ - __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; } - - /*! checks if a character is legal for an identifier */ - __forceinline bool isAlpha(unsigned int c) const { return c<256 && isAlphaMap[c]; } - __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); } - }; -} diff --git a/thirdparty/embree-aarch64/common/math/AVX2NEON.h b/thirdparty/embree-aarch64/common/math/AVX2NEON.h deleted file mode 100644 index e8698ac56d..0000000000 --- a/thirdparty/embree-aarch64/common/math/AVX2NEON.h +++ /dev/null @@ -1,986 +0,0 @@ -#pragma once - -#include "SSE2NEON.h" - - -#define AVX2NEON_ABI static inline __attribute__((always_inline)) - - -struct __m256d; - -struct __m256 { - __m128 lo,hi; - __m256() {} -}; - - - - -struct __m256i { - __m128i lo,hi; - explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {} - operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;} - __m256i() {} -}; - - - - -struct __m256d { - float64x2_t lo,hi; - __m256d() {} - __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} - __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} -}; - -#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;} - - -#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;} -#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;} - -#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;} - - -#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;} - - - -#define _mm_stream_load_si128 _mm_load_si128 -#define _mm256_stream_load_si256 _mm256_load_si256 - - -AVX2NEON_ABI -__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8) -{ - __m128 res; - for (int i=0;i<4;i++) - { - if (imm8 & (1< -AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b) -{ - float v; - v = 0; - v += (code & 0x10) ? a[0]*b[0] : 0; - v += (code & 0x20) ? a[1]*b[1] : 0; - v += (code & 0x40) ? a[2]*b[2] : 0; - v += (code & 0x80) ? a[3]*b[3] : 0; - float32x4_t res; - res[0] = (code & 0x1) ? v : 0; - res[1] = (code & 0x2) ? v : 0; - res[2] = (code & 0x4) ? v : 0; - res[3] = (code & 0x8) ? v : 0; - return res; -} - -template<> -inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b) -{ - float v; - float32x4_t m = _mm_mul_ps(a,b); - m[3] = 0; - v = vaddvq_f32(m); - return _mm_set1_ps(v); -} - -template<> -inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b) -{ - float v; - float32x4_t m = _mm_mul_ps(a,b); - v = vaddvq_f32(m); - return _mm_set1_ps(v); -} - -#define _mm_dp_ps(a,b,c) dpps_neon((a),(b)) - - - -AVX2NEON_ABI -__m128 _mm_cmpnge_ps (__m128 a, __m128 b) -{ - return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b)))); -} - - -AVX2NEON_ABI -__m128 _mm_permutevar_ps (__m128 a, __m128i b) -{ - __m128 x; - for (int i=0;i<4;i++) - { - x[i] = a[b[i&3]]; - } - return x; -} - -AVX2NEON_ABI -__m256i _mm256_setzero_si256() -{ - __m256i res; - res.lo = res.hi = vdupq_n_s32(0); - return res; -} - -AVX2NEON_ABI -__m256 _mm256_setzero_ps() -{ - __m256 res; - res.lo = res.hi = vdupq_n_f32(0.0f); - return res; -} - -AVX2NEON_ABI -__m256i _mm256_undefined_si256() -{ - return _mm256_setzero_si256(); -} - -AVX2NEON_ABI -__m256 _mm256_undefined_ps() -{ - return _mm256_setzero_ps(); -} - -CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t) -CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i) -CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128) -CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128) -CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t) -CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i) - - - - -AVX2NEON_ABI -__m128 _mm256_castps256_ps128 (__m256 a) -{ - return a.lo; -} - -AVX2NEON_ABI -__m256i _mm256_castsi128_si256 (__m128i a) -{ - __m256i res; - res.lo = a ; - res.hi = vdupq_n_s32(0); - return res; -} - -AVX2NEON_ABI -__m128i _mm256_castsi256_si128 (__m256i a) -{ - return a.lo; -} - -AVX2NEON_ABI -__m256 _mm256_castps128_ps256 (__m128 a) -{ - __m256 res; - res.lo = a; - res.hi = vdupq_n_f32(0); - return res; -} - - -AVX2NEON_ABI -__m256 _mm256_broadcast_ss (float const * mem_addr) -{ - __m256 res; - res.lo = res.hi = vdupq_n_f32(*mem_addr); - return res; -} - - - -AVX2NEON_ABI -__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) -{ - __m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7}; - __m256i res; - res.lo = lo; res.hi = hi; - return res; - -} - -AVX2NEON_ABI -__m256i _mm256_set1_epi32 (int a) -{ - __m256i res; - res.lo = res.hi = vdupq_n_s32(a); - return res; -} - - - - -AVX2NEON_ABI -int _mm256_movemask_ps(const __m256& v) -{ - return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo); -} - -template -AVX2NEON_ABI -__m256 __mm256_permute_ps (const __m256& a) -{ - __m256 res; - res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8); - res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8); - return res; - -} - -#define _mm256_permute_ps(a,c) __mm256_permute_ps(a) - - -template -AVX2NEON_ABI -__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b) -{ - __m256 res; - res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8); - res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8); - return res; - -} - -#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps(a,b) - -AVX2NEON_ABI -__m256i _mm256_set1_epi64x (long long a) -{ - __m256i res; - int64x2_t t = vdupq_n_s64(a); - res.lo = res.hi = __m128i(t); - return res; -} - - -AVX2NEON_ABI -__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8) -{ - __m256 res; - __m128 tmp; - switch (imm8 & 0x7) - { - case 0: tmp = a.lo; break; - case 1: tmp = a.hi; break; - case 2: tmp = b.lo; break; - case 3: tmp = b.hi; break; - } - if (imm8 & 0x8) - tmp = _mm_setzero_ps(); - - - - res.lo = tmp; - imm8 >>= 4; - - switch (imm8 & 0x7) - { - case 0: tmp = a.lo; break; - case 1: tmp = a.hi; break; - case 2: tmp = b.lo; break; - case 3: tmp = b.hi; break; - } - if (imm8 & 0x8) - tmp = _mm_setzero_ps(); - - res.hi = tmp; - - return res; -} - -AVX2NEON_ABI -__m256 _mm256_moveldup_ps (__m256 a) -{ - __m256 res; - res.lo[0] = res.lo[1] = a.lo[0]; - res.lo[2] = res.lo[3] = a.lo[2]; - res.hi[0] = res.hi[1] = a.hi[0]; - res.hi[2] = res.hi[3] = a.hi[2]; - return res; - -} - -AVX2NEON_ABI -__m256 _mm256_movehdup_ps (__m256 a) -{ - __m256 res; - res.lo[0] = res.lo[1] = a.lo[1]; - res.lo[2] = res.lo[3] = a.lo[3]; - res.hi[0] = res.hi[1] = a.hi[1]; - res.hi[2] = res.hi[3] = a.hi[3]; - return res; -} - -AVX2NEON_ABI -__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8) -{ - __m256 res = a; - if (imm8 & 1) res.hi = b; - else res.lo = b; - return res; -} - - -AVX2NEON_ABI -__m128 _mm256_extractf128_ps (__m256 a, const int imm8) -{ - if (imm8 & 1) return a.hi; - return a.lo; -} - - -AVX2NEON_ABI -__m256d _mm256_movedup_pd (__m256d a) -{ - __m256d res; - res.hi = a.hi; - res.lo[0] = res.lo[1] = a.lo[0]; - return res; -} - -AVX2NEON_ABI -__m256i _mm256_abs_epi32(__m256i a) -{ - __m256i res; - res.lo = vabsq_s32(a.lo); - res.hi = vabsq_s32(a.hi); - return res; -} - -UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps) -UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps) -UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps) -UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32) -UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32) - - -BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32) -BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32) -BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32) - -BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32) -BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32) -BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t) -BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t) - -BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps) -BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps) - -BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps) -BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps) -BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps) -BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps) - -BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps) -BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps) -BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps) -BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps) - -BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t) -BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t) -BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t) - - - -BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128) -BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128) -BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128) - - -BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps) -BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps) -TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps) - - -TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps) -TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps) -TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps) -TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps) - - -BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32) -BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32) - - -BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32) -BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32) -BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps) -BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps) -BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps) -BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps) -BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps) -BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps) -BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps) -BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps) -BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps) -BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps) - - -AVX2NEON_ABI -__m256i _mm256_cvtps_epi32 (__m256 a) -{ - __m256i res; - res.lo = _mm_cvtps_epi32(a.lo); - res.hi = _mm_cvtps_epi32(a.hi); - return res; - -} - -AVX2NEON_ABI -__m256i _mm256_cvttps_epi32 (__m256 a) -{ - __m256i res; - res.lo = _mm_cvttps_epi32(a.lo); - res.hi = _mm_cvttps_epi32(a.hi); - return res; - -} - -AVX2NEON_ABI -__m256 _mm256_loadu_ps (float const * mem_addr) -{ - __m256 res; - res.lo = *(__m128 *)(mem_addr + 0); - res.hi = *(__m128 *)(mem_addr + 4); - return res; -} -#define _mm256_load_ps _mm256_loadu_ps - - -AVX2NEON_ABI -int _mm256_testz_ps (const __m256& a, const __m256& b) -{ - __m256 t = a; - if (&a != &b) - t = _mm256_and_ps(a,b); - - __m128i l = vshrq_n_s32(__m128i(t.lo),31); - __m128i h = vshrq_n_s32(__m128i(t.hi),31); - return vaddvq_s32(vaddq_s32(l,h)) == 0; -} - - -AVX2NEON_ABI -__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) -{ - __m256i res; - int64x2_t t0 = {e0,e1}; - int64x2_t t1 = {e2,e3}; - res.lo = __m128i(t0); - res.hi = __m128i(t1); - return res; -} - -AVX2NEON_ABI -__m256d _mm256_setzero_pd () -{ - __m256d res; - res.lo = res.hi = vdupq_n_f64(0); - return res; -} - -AVX2NEON_ABI -int _mm256_movemask_pd (__m256d a) -{ - int res = 0; - uint64x2_t x; - x = uint64x2_t(a.lo); - res |= (x[0] >> 63) ? 1 : 0; - res |= (x[0] >> 63) ? 2 : 0; - x = uint64x2_t(a.hi); - res |= (x[0] >> 63) ? 4 : 0; - res |= (x[0] >> 63) ? 8 : 0; - return res; -} - -AVX2NEON_ABI -__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) -{ - __m256i res; - res.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo))); - res.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi))); - return res; -} - -AVX2NEON_ABI -__m256i _mm256_cmpeq_pd (__m256d a, __m256d b) -{ - __m256i res; - res.lo = __m128i(vceqq_f64(a.lo,b.lo)); - res.hi = __m128i(vceqq_f64(a.hi,b.hi)); - return res; -} - - -AVX2NEON_ABI -int _mm256_testz_pd (const __m256d& a, const __m256d& b) -{ - __m256d t = a; - - if (&a != &b) - t = _mm256_and_pd(a,b); - - return _mm256_movemask_pd(t) == 0; -} - -AVX2NEON_ABI -__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) -{ - __m256d res; - uint64x2_t t = uint64x2_t(mask.lo); - res.lo[0] = (t[0] >> 63) ? b.lo[0] : a.lo[0]; - res.lo[1] = (t[1] >> 63) ? b.lo[1] : a.lo[1]; - t = uint64x2_t(mask.hi); - res.hi[0] = (t[0] >> 63) ? b.hi[0] : a.hi[0]; - res.hi[1] = (t[1] >> 63) ? b.hi[1] : a.hi[1]; - return res; -} - -template -__m256 __mm256_dp_ps (__m256 a, __m256 b) -{ - __m256 res; - res.lo = _mm_dp_ps(a.lo,b.lo,imm8); - res.hi = _mm_dp_ps(a.hi,b.hi,imm8); - return res; -} - -#define _mm256_dp_ps(a,b,c) __mm256_dp_ps(a,b) - -AVX2NEON_ABI -double _mm256_permute4x64_pd_select(__m256d a, const int imm8) -{ - switch (imm8 & 3) { - case 0: - return a.lo[0]; - case 1: - return a.lo[1]; - case 2: - return a.hi[0]; - case 3: - return a.hi[1]; - } - __builtin_unreachable(); - return 0; -} - -AVX2NEON_ABI -__m256d _mm256_permute4x64_pd (__m256d a, const int imm8) -{ - __m256d res; - res.lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0); - res.lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2); - res.hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4); - res.hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6); - - return res; -} - -AVX2NEON_ABI -__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8) -{ - return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8)); -} - - -AVX2NEON_ABI -__m256i _mm256_loadu_si256 (__m256i const * mem_addr) -{ - __m256i res; - res.lo = *(__m128i *)((int32_t *)mem_addr + 0); - res.hi = *(__m128i *)((int32_t *)mem_addr + 4); - return res; -} - -#define _mm256_load_si256 _mm256_loadu_si256 - -AVX2NEON_ABI -void _mm256_storeu_ps (float * mem_addr, __m256 a) -{ - *(__m128 *)(mem_addr + 0) = a.lo; - *(__m128 *)(mem_addr + 4) = a.hi; - -} - -#define _mm256_store_ps _mm256_storeu_ps -#define _mm256_stream_ps _mm256_storeu_ps - - -AVX2NEON_ABI -void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a) -{ - *(__m128i *)((int *)mem_addr + 0) = a.lo; - *(__m128i *)((int *)mem_addr + 4) = a.hi; - -} - -#define _mm256_store_si256 _mm256_storeu_si256 - - - -AVX2NEON_ABI -__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask) -{ - __m256 res; - res.lo = _mm_maskload_ps(mem_addr,mask.lo); - res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi); - return res; - -} - - -AVX2NEON_ABI -__m256i _mm256_cvtepu8_epi32 (__m128i a) -{ - __m256i res; - uint8x16_t x = uint8x16_t(a); - for (int i=0;i<4;i++) - { - res.lo[i] = x[i]; - res.hi[i] = x[i+4]; - } - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_cvtepi8_epi32 (__m128i a) -{ - __m256i res; - int8x16_t x = int8x16_t(a); - for (int i=0;i<4;i++) - { - res.lo[i] = x[i]; - res.hi[i] = x[i+4]; - } - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_cvtepu16_epi32 (__m128i a) -{ - __m256i res; - uint16x8_t x = uint16x8_t(a); - for (int i=0;i<4;i++) - { - res.lo[i] = x[i]; - res.hi[i] = x[i+4]; - } - return res; -} - -AVX2NEON_ABI -__m256i _mm256_cvtepi16_epi32 (__m128i a) -{ - __m256i res; - int16x8_t x = int16x8_t(a); - for (int i=0;i<4;i++) - { - res.lo[i] = x[i]; - res.hi[i] = x[i+4]; - } - return res; -} - - - -AVX2NEON_ABI -void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a) -{ - _mm_maskstore_epi32(mem_addr,mask.lo,a.lo); - _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi); -} - -AVX2NEON_ABI -__m256i _mm256_slli_epi32 (__m256i a, int imm8) -{ - __m256i res; - res.lo = _mm_slli_epi32(a.lo,imm8); - res.hi = _mm_slli_epi32(a.hi,imm8); - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_srli_epi32 (__m256i a, int imm8) -{ - __m256i res; - res.lo = _mm_srli_epi32(a.lo,imm8); - res.hi = _mm_srli_epi32(a.hi,imm8); - return res; -} - -AVX2NEON_ABI -__m256i _mm256_srai_epi32 (__m256i a, int imm8) -{ - __m256i res; - res.lo = _mm_srai_epi32(a.lo,imm8); - res.hi = _mm_srai_epi32(a.hi,imm8); - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_sllv_epi32 (__m256i a, __m256i count) -{ - __m256i res; - res.lo = vshlq_s32(a.lo,count.lo); - res.hi = vshlq_s32(a.hi,count.hi); - return res; - -} - - -AVX2NEON_ABI -__m256i _mm256_srav_epi32 (__m256i a, __m256i count) -{ - __m256i res; - res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo)); - res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi)); - return res; - -} - -AVX2NEON_ABI -__m256i _mm256_srlv_epi32 (__m256i a, __m256i count) -{ - __m256i res; - res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo))); - res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi))); - return res; - -} - - -AVX2NEON_ABI -__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8) -{ - return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8)); -} - - -AVX2NEON_ABI -__m128i _mm256_extractf128_si256 (__m256i a, const int imm8) -{ - if (imm8 & 1) return a.hi; - return a.lo; -} - -AVX2NEON_ABI -__m256 _mm256_set1_ps(float x) -{ - __m256 res; - res.lo = res.hi = vdupq_n_f32(x); - return res; -} - -AVX2NEON_ABI -__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) -{ - __m256 res; - res.lo = _mm_set_ps(e3,e2,e1,e0); - res.hi = _mm_set_ps(e7,e6,e5,e4); - return res; -} - -AVX2NEON_ABI -__m256 _mm256_broadcast_ps (__m128 const * mem_addr) -{ - __m256 res; - res.lo = res.hi = *mem_addr; - return res; -} - -AVX2NEON_ABI -__m256 _mm256_cvtepi32_ps (__m256i a) -{ - __m256 res; - res.lo = _mm_cvtepi32_ps(a.lo); - res.hi = _mm_cvtepi32_ps(a.hi); - return res; -} -AVX2NEON_ABI -void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) -{ - for (int i=0;i<4;i++) { - if (mask.lo[i] & 0x80000000) mem_addr[i] = a.lo[i]; - if (mask.hi[i] & 0x80000000) mem_addr[i+4] = a.hi[i]; - } -} - -AVX2NEON_ABI -__m256d _mm256_andnot_pd (__m256d a, __m256d b) -{ - __m256d res; - res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo))); - res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi))); - return res; -} - -AVX2NEON_ABI -__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8) -{ - __m256 res; - res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf); - res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4); - return res; - -} - - -AVX2NEON_ABI -__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8) -{ - __m256i res; - res.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf); - res.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4); - return res; - -} - -AVX2NEON_ABI -__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) -{ - __m256i res; - for (int i=0;i<4;i++) - { - res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); - res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale)); - } - return res; -} - - -AVX2NEON_ABI -__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) -{ - __m256i res = _mm256_setzero_si256(); - for (int i=0;i<4;i++) - { - if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); - if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale)); - } - - return res; - -} - - diff --git a/thirdparty/embree-aarch64/common/math/SSE2NEON.h b/thirdparty/embree-aarch64/common/math/SSE2NEON.h deleted file mode 100644 index 2013151d31..0000000000 --- a/thirdparty/embree-aarch64/common/math/SSE2NEON.h +++ /dev/null @@ -1,1753 +0,0 @@ -#ifndef SSE2NEON_H -#define SSE2NEON_H - -// This header file provides a simple API translation layer -// between SSE intrinsics to their corresponding ARM NEON versions -// -// This header file does not (yet) translate *all* of the SSE intrinsics. -// Since this is in support of a specific porting effort, I have only -// included the intrinsics I needed to get my port to work. -// -// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com -// -// If you want to improve or add to this project, send me an -// email and I will probably approve your access to the depot. -// -// Project is located here: -// -// https://github.com/jratcliff63367/sse2neon -// -// Show your appreciation for open source by sending me a bitcoin tip to the following -// address. -// -// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p : -// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p -// -// -// Contributors to this project are: -// -// John W. Ratcliff : jratcliffscarab@gmail.com -// Brandon Rowlett : browlett@nvidia.com -// Ken Fast : kfast@gdeb.com -// Eric van Beurden : evanbeurden@nvidia.com -// -// -// ********************************************************************************************************************* -// Release notes for January 20, 2017 version: -// -// The unit tests have been refactored. They no longer assert on an error, instead they return a pass/fail condition -// The unit-tests now test 10,000 random float and int values against each intrinsic. -// -// SSE2NEON now supports 95 SSE intrinsics. 39 of them have formal unit tests which have been implemented and -// fully tested on NEON/ARM. The remaining 56 still need unit tests implemented. -// -// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which -// attempt to access the contents of an _m128 struct directly. It is important to note that accessing the __m128 -// struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx -// -// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer -// can use the SIMDVec as an alias for it. Any casting must be done manually by the developer, as you cannot -// cast or otherwise alias the base NEON data type for intrinsic operations. -// -// A bug was found with the _mm_shuffle_ps intrinsic. If the shuffle permutation was not one of the ones with -// a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing -// to return the correct value. This is now fixed. -// -// A bug was found with the _mm_cvtps_epi32 intrinsic. This converts floating point values to integers. -// It was not honoring the correct rounding mode. In SSE the default rounding mode when converting from float to int -// is to use 'round to even' otherwise known as 'bankers rounding'. ARMv7 did not support this feature but ARMv8 does. -// As it stands today, this header file assumes ARMv8. If you are trying to target really old ARM devices, you may get -// a build error. -// -// Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are -// producing the correct results on NEON. These unit tests will be added as soon as possible. -// -// Here is the list of new instrinsics which have been added: -// -// _mm_cvtss_f32 : extracts the lower order floating point value from the parameter -// _mm_add_ss : adds the scalar single - precision floating point values of a and b -// _mm_div_ps : Divides the four single - precision, floating - point values of a and b. -// _mm_div_ss : Divides the scalar single - precision floating point value of a by b. -// _mm_sqrt_ss : Computes the approximation of the square root of the scalar single - precision floating point value of in. -// _mm_rsqrt_ps : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in. -// _mm_comilt_ss : Compares the lower single - precision floating point scalar values of a and b using a less than operation -// _mm_comigt_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than operation. -// _mm_comile_ss : Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation. -// _mm_comige_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation. -// _mm_comieq_ss : Compares the lower single - precision floating point scalar values of a and b using an equality operation. -// _mm_comineq_s : Compares the lower single - precision floating point scalar values of a and b using an inequality operation -// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b. -// _mm_unpackhi_epi16: Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b. -// -// ********************************************************************************************************************* -/* -** The MIT license: -** -** Permission is hereby granted, free of charge, to any person obtaining a copy -** of this software and associated documentation files (the "Software"), to deal -** in the Software without restriction, including without limitation the rights -** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -** copies of the Software, and to permit persons to whom the Software is furnished -** to do so, subject to the following conditions: -** -** The above copyright notice and this permission notice shall be included in all -** copies or substantial portions of the Software. - -** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#pragma once - -#define GCC 1 -#define ENABLE_CPP_VERSION 0 - -// enable precise emulation of _mm_min_ps and _mm_max_ps? -// This would slow down the computation a bit, but gives consistent result with x86 SSE2. -// (e.g. would solve a hole or NaN pixel in the rendering result) -#define USE_PRECISE_MINMAX_IMPLEMENTATION (1) - -#if GCC -#define FORCE_INLINE inline __attribute__((always_inline)) -#define ALIGN_STRUCT(x) __attribute__((aligned(x))) -#else -#define FORCE_INLINE inline -#define ALIGN_STRUCT(x) __declspec(align(x)) -#endif - -#include -#include "arm_neon.h" -#if defined(__aarch64__) -#include "constants.h" -#endif - - -#if !defined(__has_builtin) -#define __has_builtin(x) (0) -#endif - -/*******************************************************/ -/* MACRO for shuffle parameter for _mm_shuffle_ps(). */ -/* Argument fp3 is a digit[0123] that represents the fp*/ -/* from argument "b" of mm_shuffle_ps that will be */ -/* placed in fp3 of result. fp2 is the same for fp2 in */ -/* result. fp1 is a digit[0123] that represents the fp */ -/* from argument "a" of mm_shuffle_ps that will be */ -/* places in fp1 of result. fp0 is the same for fp0 of */ -/* result */ -/*******************************************************/ -#if defined(__aarch64__) -#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } ) -#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } ) -#endif - -#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \ - ((fp1) << 2) | ((fp0))) - -typedef float32x4_t __m128; -typedef int32x4_t __m128i; - -// union intended to allow direct access to an __m128 variable using the names that the MSVC -// compiler provides. This union should really only be used when trying to access the members -// of the vector as integer values. GCC/clang allow native access to the float members through -// a simple array access operator (in C since 4.6, in C++ since 4.8). -// -// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance -// hit. If it really is needed however, the original __m128 variable can be aliased with a -// pointer to this union and used to access individual components. The use of this union should -// be hidden behind a macro that is used throughout the codebase to access the members instead -// of always declaring this type of variable. -typedef union ALIGN_STRUCT(16) SIMDVec -{ - float m128_f32[4]; // as floats - do not to use this. Added for convenience. - int8_t m128_i8[16]; // as signed 8-bit integers. - int16_t m128_i16[8]; // as signed 16-bit integers. - int32_t m128_i32[4]; // as signed 32-bit integers. - int64_t m128_i64[2]; // as signed 64-bit integers. - uint8_t m128_u8[16]; // as unsigned 8-bit integers. - uint16_t m128_u16[8]; // as unsigned 16-bit integers. - uint32_t m128_u32[4]; // as unsigned 32-bit integers. - uint64_t m128_u64[2]; // as unsigned 64-bit integers. - double m128_f64[2]; // as signed double -} SIMDVec; - -// ****************************************** -// CPU stuff -// ****************************************** - -typedef SIMDVec __m128d; - -#include - -#ifndef _MM_MASK_MASK -#define _MM_MASK_MASK 0x1f80 -#define _MM_MASK_DIV_ZERO 0x200 -#define _MM_FLUSH_ZERO_ON 0x8000 -#define _MM_DENORMALS_ZERO_ON 0x40 -#define _MM_MASK_DENORM 0x100 -#endif -#define _MM_SET_EXCEPTION_MASK(x) -#define _MM_SET_FLUSH_ZERO_MODE(x) -#define _MM_SET_DENORMALS_ZERO_MODE(x) - -FORCE_INLINE void _mm_pause() -{ -} - -FORCE_INLINE void _mm_mfence() -{ - __sync_synchronize(); -} - -#define _MM_HINT_T0 3 -#define _MM_HINT_T1 2 -#define _MM_HINT_T2 1 -#define _MM_HINT_NTA 0 - -FORCE_INLINE void _mm_prefetch(const void* ptr, unsigned int level) -{ - __builtin_prefetch(ptr); - -} - -FORCE_INLINE void* _mm_malloc(int size, int align) -{ - void *ptr; - // align must be multiple of sizeof(void *) for posix_memalign. - if (align < sizeof(void *)) { - align = sizeof(void *); - } - - if ((align % sizeof(void *)) != 0) { - // fallback to malloc - ptr = malloc(size); - } else { - if (posix_memalign(&ptr, align, size)) { - return 0; - } - } - - return ptr; -} - -FORCE_INLINE void _mm_free(void* ptr) -{ - free(ptr); -} - -FORCE_INLINE int _mm_getcsr() -{ - return 0; -} - -FORCE_INLINE void _mm_setcsr(int val) -{ - return; -} - -// ****************************************** -// Set/get methods -// ****************************************** - -// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396 -#if defined(__aarch64__) -FORCE_INLINE float _mm_cvtss_f32(const __m128& x) -{ - return x[0]; -} -#else -FORCE_INLINE float _mm_cvtss_f32(__m128 a) -{ - return vgetq_lane_f32(a, 0); -} -#endif - -// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx -FORCE_INLINE __m128i _mm_setzero_si128() -{ - return vdupq_n_s32(0); -} - -// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setzero_ps(void) -{ - return vdupq_n_f32(0); -} - -// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set1_ps(float _w) -{ - return vdupq_n_f32(_w); -} - -// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set_ps1(float _w) -{ - return vdupq_n_f32(_w); -} - -// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx -#if defined(__aarch64__) -FORCE_INLINE __m128 _mm_set_ps(const float w, const float z, const float y, const float x) -{ - float32x4_t t = { x, y, z, w }; - return t; -} - -// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setr_ps(const float w, const float z , const float y , const float x ) -{ - float32x4_t t = { w, z, y, x }; - return t; -} -#else -FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) -{ - float __attribute__((aligned(16))) data[4] = { x, y, z, w }; - return vld1q_f32(data); -} - -// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x ) -{ - float __attribute__ ((aligned (16))) data[4] = { w, z, y, x }; - return vld1q_f32(data); -} -#endif - -// Sets the 4 signed 32-bit integer values to i. https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set1_epi32(int _i) -{ - return vdupq_n_s32(_i); -} - -//Set the first lane to of 4 signed single-position, floating-point number to w -#if defined(__aarch64__) -FORCE_INLINE __m128 _mm_set_ss(float _w) -{ - float32x4_t res = {_w, 0, 0, 0}; - return res; -} - -// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) -{ - int32x4_t t = {i0,i1,i2,i3}; - return t; -} -#else -FORCE_INLINE __m128 _mm_set_ss(float _w) -{ - __m128 val = _mm_setzero_ps(); - return vsetq_lane_f32(_w, val, 0); -} - -// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) -{ - int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 }; - return vld1q_s32(data); -} -#endif - -// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx -FORCE_INLINE void _mm_store_ps(float *p, __m128 a) -{ - vst1q_f32(p, a); -} - -// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx -FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) -{ - vst1q_f32(p, a); -} - -FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) -{ - vst1q_s32((int32_t*) p,a); -} - -// Stores four 32-bit integer values as (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx -FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a ) -{ - vst1q_s32((int32_t*) p,a); -} - -// Stores the lower single - precision, floating - point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx -FORCE_INLINE void _mm_store_ss(float *p, __m128 a) -{ - vst1q_lane_f32(p, a, 0); -} - -// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx -FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b) -{ - *a = (__m128i)vsetq_lane_s64((int64_t)vget_low_s32(b), *(int64x2_t*)a, 0); -} - -// Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load1_ps(const float * p) -{ - return vld1q_dup_f32(p); -} - -// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load_ps(const float * p) -{ - return vld1q_f32(p); -} - -// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_loadu_ps(const float * p) -{ - // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon - return vld1q_f32(p); -} - -// Loads an single - precision, floating - point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_load_ss(const float * p) -{ - __m128 result = vdupq_n_f32(0); - return vsetq_lane_f32(*p, result, 0); -} - -FORCE_INLINE __m128i _mm_loadu_si128(__m128i *p) -{ - return (__m128i)vld1q_s32((const int32_t*) p); -} - - -// ****************************************** -// Logic/Binary operations -// ****************************************** - -// Compares for inequality. https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) -{ - return (__m128)vmvnq_s32((__m128i)vceqq_f32(a, b)); -} - -// Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx -FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) -{ - return (__m128)vbicq_s32((__m128i)b, (__m128i)a); // *NOTE* argument swap -} - -// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) -{ - return (__m128i)vbicq_s32(b, a); // *NOTE* argument swap -} - -// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) -{ - return (__m128i)vandq_s32(a, b); -} - -// Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) -{ - return (__m128)vandq_s32((__m128i)a, (__m128i)b); -} - -// Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx -FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) -{ - return (__m128)vorrq_s32((__m128i)a, (__m128i)b); -} - -// Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) -{ - return (__m128)veorq_s32((__m128i)a, (__m128i)b); -} - -// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx -FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) -{ - return (__m128i)vorrq_s32(a, b); -} - -// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx -FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) -{ - return veorq_s32(a, b); -} - -// NEON does not provide this method -// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_ps(__m128 a) -{ -#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this - uint32x4_t &ia = *(uint32x4_t *)&a; - return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8); -#else - -#if defined(__aarch64__) - uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); - return vaddvq_u32(t2); -#else - static const uint32x4_t movemask = { 1, 2, 4, 8 }; - static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; - uint32x4_t t0 = vreinterpretq_u32_f32(a); - uint32x4_t t1 = vtstq_u32(t0, highbit); - uint32x4_t t2 = vandq_u32(t1, movemask); - uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); - return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); -#endif - -#endif -} - -#if defined(__aarch64__) -FORCE_INLINE int _mm_movemask_popcnt_ps(__m128 a) -{ - uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); - t2 = vreinterpretq_u32_u8(vcntq_u8(vreinterpretq_u8_u32(t2))); - return vaddvq_u32(t2); - -} -#endif - -// Takes the upper 64 bits of a and places it in the low end of the result -// Takes the lower 64 bits of b and places it into the high end of the result. -FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) -{ - return vcombine_f32(vget_high_f32(a), vget_low_f32(b)); -} - -// takes the lower two 32-bit values from a and swaps them and places in high end of result -// takes the higher two 32 bit values from b and swaps them and places in low end of result. -FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) -{ - return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_high_f32(b))); -} - -// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the high -FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) -{ - return vcombine_f32(vget_low_f32(a), vget_high_f32(b)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) -{ - return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 1)), vdup_n_f32(vgetq_lane_f32(b, 0))); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) -{ - return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 2)), vdup_n_f32(vgetq_lane_f32(b, 0))); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) -{ - return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 0)), vdup_n_f32(vgetq_lane_f32(b, 2))); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) -{ - float32_t a0 = vgetq_lane_f32(a, 0); - float32_t a2 = vgetq_lane_f32(a, 2); - float32x2_t aVal = vdup_n_f32(a2); - aVal = vset_lane_f32(a0, aVal, 1); - return vcombine_f32(aVal, vget_high_f32(b)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) -{ - return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 3)), vdup_n_f32(vgetq_lane_f32(b, 1))); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) -{ - float32_t b0 = vgetq_lane_f32(b, 0); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t bVal = vdup_n_f32(b0); - bVal = vset_lane_f32(b2, bVal, 1); - return vcombine_f32(vget_low_f32(a), bVal); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) -{ - float32_t b0 = vgetq_lane_f32(b, 0); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t bVal = vdup_n_f32(b0); - bVal = vset_lane_f32(b2, bVal, 1); - return vcombine_f32(vrev64_f32(vget_low_f32(a)), bVal); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) -{ - float32_t b0 = vgetq_lane_f32(b, 0); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t bVal = vdup_n_f32(b0); - bVal = vset_lane_f32(b2, bVal, 1); - return vcombine_f32(vget_high_f32(a), bVal); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) -{ - float32x2_t a21 = vget_high_f32(vextq_f32(a, a, 3)); - float32x2_t b03 = vget_low_f32(vextq_f32(b, b, 3)); - return vcombine_f32(a21, b03); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) -{ - float32x2_t a03 = vget_low_f32(vextq_f32(a, a, 3)); - float32x2_t b21 = vget_high_f32(vextq_f32(b, b, 3)); - return vcombine_f32(a03, b21); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) -{ - float32x2_t a10 = vget_low_f32(a); - float32x2_t b10 = vget_low_f32(b); - return vcombine_f32(a10, b10); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(a)); - float32x2_t b10 = vget_low_f32(b); - return vcombine_f32(a01, b10); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(a)); - float32x2_t b01 = vrev64_f32(vget_low_f32(b)); - return vcombine_f32(a01, b01); -} - -// NEON does not support a general purpose permute intrinsic -// Currently I am not sure whether the C implementation is faster or slower than the NEON version. -// Note, this has to be expanded as a template because the shuffle value must be an immediate value. -// The same is true on SSE as well. -// Selects four specific single-precision, floating-point values from a and b, based on the mask i. https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx -template -FORCE_INLINE __m128 _mm_shuffle_ps_default(const __m128& a, const __m128& b) -{ -#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet. - __m128 ret; - ret[0] = a[i & 0x3]; - ret[1] = a[(i >> 2) & 0x3]; - ret[2] = b[(i >> 4) & 0x03]; - ret[3] = b[(i >> 6) & 0x03]; - return ret; -#else -# if __has_builtin(__builtin_shufflevector) - return __builtin_shufflevector( \ - a, b, (i) & (0x3), ((i) >> 2) & 0x3, - (((i) >> 4) & 0x3) + 4, (((i) >> 6) & 0x3) + 4); -# else - const int i0 = (i >> 0)&0x3; - const int i1 = (i >> 2)&0x3; - const int i2 = (i >> 4)&0x3; - const int i3 = (i >> 6)&0x3; - - if (&a == &b) - { - if (i0 == i1 && i0 == i2 && i0 == i3) - { - return (float32x4_t)vdupq_laneq_f32(a,i0); - } - static const uint8_t tbl[16] = { - (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, - (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, - (i2*4) + 0,(i2*4) + 1,(i2*4) + 2,(i2*4) + 3, - (i3*4) + 0,(i3*4) + 1,(i3*4) + 2,(i3*4) + 3 - }; - - return (float32x4_t)vqtbl1q_s8(int8x16_t(b),*(uint8x16_t *)tbl); - - } - else - { - - static const uint8_t tbl[16] = { - (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, - (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, - (i2*4) + 0 + 16,(i2*4) + 1 + 16,(i2*4) + 2 + 16,(i2*4) + 3 + 16, - (i3*4) + 0 + 16,(i3*4) + 1 + 16,(i3*4) + 2 + 16,(i3*4) + 3 + 16 - }; - - return float32x4_t(vqtbl2q_s8((int8x16x2_t){int8x16_t(a),int8x16_t(b)},*(uint8x16_t *)tbl)); - } -# endif //builtin(shufflevector) -#endif -} - -template -FORCE_INLINE __m128 _mm_shuffle_ps_function(const __m128& a, const __m128& b) -{ - switch (i) - { - case _MM_SHUFFLE(1, 0, 3, 2): - return _mm_shuffle_ps_1032(a, b); - break; - case _MM_SHUFFLE(2, 3, 0, 1): - return _mm_shuffle_ps_2301(a, b); - break; - case _MM_SHUFFLE(3, 2, 1, 0): - return _mm_shuffle_ps_3210(a, b); - break; - case _MM_SHUFFLE(0, 0, 1, 1): - return _mm_shuffle_ps_0011(a, b); - break; - case _MM_SHUFFLE(0, 0, 2, 2): - return _mm_shuffle_ps_0022(a, b); - break; - case _MM_SHUFFLE(2, 2, 0, 0): - return _mm_shuffle_ps_2200(a, b); - break; - case _MM_SHUFFLE(3, 2, 0, 2): - return _mm_shuffle_ps_3202(a, b); - break; - case _MM_SHUFFLE(1, 1, 3, 3): - return _mm_shuffle_ps_1133(a, b); - break; - case _MM_SHUFFLE(2, 0, 1, 0): - return _mm_shuffle_ps_2010(a, b); - break; - case _MM_SHUFFLE(2, 0, 0, 1): - return _mm_shuffle_ps_2001(a, b); - break; - case _MM_SHUFFLE(2, 0, 3, 2): - return _mm_shuffle_ps_2032(a, b); - break; - case _MM_SHUFFLE(0, 3, 2, 1): - return _mm_shuffle_ps_0321(a, b); - break; - case _MM_SHUFFLE(2, 1, 0, 3): - return _mm_shuffle_ps_2103(a, b); - break; - case _MM_SHUFFLE(1, 0, 1, 0): - return _mm_shuffle_ps_1010(a, b); - break; - case _MM_SHUFFLE(1, 0, 0, 1): - return _mm_shuffle_ps_1001(a, b); - break; - case _MM_SHUFFLE(0, 1, 0, 1): - return _mm_shuffle_ps_0101(a, b); - break; - } - return _mm_shuffle_ps_default(a, b); -} - -# if __has_builtin(__builtin_shufflevector) -#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_default(a,b) -# else -#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_function(a,b) -#endif - -// Takes the upper 64 bits of a and places it in the low end of the result -// Takes the lower 64 bits of b and places it into the high end of the result. -FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a, __m128i b) -{ - return vcombine_s32(vget_high_s32(a), vget_low_s32(b)); -} - -// takes the lower two 32-bit values from a and swaps them and places in low end of result -// takes the higher two 32 bit values from b and swaps them and places in high end of result. -FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a, __m128i b) -{ - return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_high_s32(b))); -} - -// shift a right by 32 bits, and put the lower 32 bits of a into the upper 32 bits of b -// when a and b are the same, rotates the least significant 32 bits into the most signficant 32 bits, and shifts the rest down -FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a, __m128i b) -{ - return vextq_s32(a, b, 1); -} - -// shift a left by 32 bits, and put the upper 32 bits of b into the lower 32 bits of a -// when a and b are the same, rotates the most significant 32 bits into the least signficant 32 bits, and shifts the rest up -FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a, __m128i b) -{ - return vextq_s32(a, b, 3); -} - -// gets the lower 64 bits of a, and places it in the upper 64 bits -// gets the lower 64 bits of b and places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a, __m128i b) -{ - return vcombine_s32(vget_low_s32(a), vget_low_s32(a)); -} - -// gets the lower 64 bits of a, and places it in the upper 64 bits -// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a, __m128i b) -{ - return vcombine_s32(vrev64_s32(vget_low_s32(a)), vget_low_s32(b)); -} - -// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits -// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a, __m128i b) -{ - return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_low_s32(b))); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a, __m128i b) -{ - return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 1)), vdup_n_s32(vgetq_lane_s32(b, 2))); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a, __m128i b) -{ - return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 2)), vrev64_s32(vget_low_s32(b))); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a, __m128i b) -{ - return vcombine_s32(vget_high_s32(a), vdup_n_s32(vgetq_lane_s32(b, 3))); -} - -template -FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __m128i b) -{ -#if ENABLE_CPP_VERSION - __m128i ret; - ret[0] = a[i & 0x3]; - ret[1] = a[(i >> 2) & 0x3]; - ret[2] = b[(i >> 4) & 0x03]; - ret[3] = b[(i >> 6) & 0x03]; - return ret; -#else - __m128i ret = vmovq_n_s32(vgetq_lane_s32(a, i & 0x3)); - ret = vsetq_lane_s32(vgetq_lane_s32(a, (i >> 2) & 0x3), ret, 1); - ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 4) & 0x3), ret, 2); - ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 6) & 0x3), ret, 3); - return ret; -#endif -} - -template -FORCE_INLINE __m128i _mm_shuffle_epi32_function(__m128i a, __m128i b) -{ - switch (i) - { - case _MM_SHUFFLE(1, 0, 3, 2): return _mm_shuffle_epi_1032(a, b); break; - case _MM_SHUFFLE(2, 3, 0, 1): return _mm_shuffle_epi_2301(a, b); break; - case _MM_SHUFFLE(0, 3, 2, 1): return _mm_shuffle_epi_0321(a, b); break; - case _MM_SHUFFLE(2, 1, 0, 3): return _mm_shuffle_epi_2103(a, b); break; - case _MM_SHUFFLE(1, 0, 1, 0): return _mm_shuffle_epi_1010(a, b); break; - case _MM_SHUFFLE(1, 0, 0, 1): return _mm_shuffle_epi_1001(a, b); break; - case _MM_SHUFFLE(0, 1, 0, 1): return _mm_shuffle_epi_0101(a, b); break; - case _MM_SHUFFLE(2, 2, 1, 1): return _mm_shuffle_epi_2211(a, b); break; - case _MM_SHUFFLE(0, 1, 2, 2): return _mm_shuffle_epi_0122(a, b); break; - case _MM_SHUFFLE(3, 3, 3, 2): return _mm_shuffle_epi_3332(a, b); break; - default: return _mm_shuffle_epi32_default(a, b); - } -} - -template -FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a) -{ - return vdupq_n_s32(vgetq_lane_s32(a, i)); -} - -template -FORCE_INLINE __m128i _mm_shuffle_epi32_single(__m128i a) -{ - switch (i) - { - case _MM_SHUFFLE(0, 0, 0, 0): return _mm_shuffle_epi32_splat<0>(a); break; - case _MM_SHUFFLE(1, 1, 1, 1): return _mm_shuffle_epi32_splat<1>(a); break; - case _MM_SHUFFLE(2, 2, 2, 2): return _mm_shuffle_epi32_splat<2>(a); break; - case _MM_SHUFFLE(3, 3, 3, 3): return _mm_shuffle_epi32_splat<3>(a); break; - default: return _mm_shuffle_epi32_function(a, a); - } -} - -// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx -#define _mm_shuffle_epi32(a,i) _mm_shuffle_epi32_single(a) - -template -FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a) -{ - int16x8_t ret = (int16x8_t)a; - int16x4_t highBits = vget_high_s16(ret); - ret = vsetq_lane_s16(vget_lane_s16(highBits, i & 0x3), ret, 4); - ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 2) & 0x3), ret, 5); - ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 4) & 0x3), ret, 6); - ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 6) & 0x3), ret, 7); - return (__m128i)ret; -} - -// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx -#define _mm_shufflehi_epi16(a,i) _mm_shufflehi_epi16_function(a) - -// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx -//#define _mm_slli_epi32(a, imm) (__m128i)vshlq_n_s32(a,imm) - -// Based on SIMDe -FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, const int imm8) -{ -#if defined(__aarch64__) - const int32x4_t s = vdupq_n_s32(imm8); - return vshlq_s32(a, s); -#else - int32_t __attribute__((aligned(16))) data[4]; - vst1q_s32(data, a); - const int s = (imm8 > 31) ? 0 : imm8; - data[0] = data[0] << s; - data[1] = data[1] << s; - data[2] = data[2] << s; - data[3] = data[3] << s; - - return vld1q_s32(data); -#endif -} - - -//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros. https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx -//#define _mm_srli_epi32( a, imm ) (__m128i)vshrq_n_u32((uint32x4_t)a, imm) - -// Based on SIMDe -FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm8) -{ -#if defined(__aarch64__) - const int shift = (imm8 > 31) ? 0 : imm8; // Unfortunately, we need to check for this case for embree. - const int32x4_t s = vdupq_n_s32(-shift); - return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(a), s)); -#else - int32_t __attribute__((aligned(16))) data[4]; - vst1q_s32(data, a); - - const int s = (imm8 > 31) ? 0 : imm8; - - data[0] = data[0] >> s; - data[1] = data[1] >> s; - data[2] = data[2] >> s; - data[3] = data[3] >> s; - - return vld1q_s32(data); -#endif -} - - -// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit. https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx -//#define _mm_srai_epi32( a, imm ) vshrq_n_s32(a, imm) - -// Based on SIMDe -FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm8) -{ -#if defined(__aarch64__) - const int32x4_t s = vdupq_n_s32(-imm8); - return vshlq_s32(a, s); -#else - int32_t __attribute__((aligned(16))) data[4]; - vst1q_s32(data, a); - const uint32_t m = (uint32_t) ((~0U) << (32 - imm8)); - - for (int i = 0; i < 4; i++) { - uint32_t is_neg = ((uint32_t) (((data[i]) >> 31))); - data[i] = (data[i] >> imm8) | (m * is_neg); - } - - return vld1q_s32(data); -#endif -} - -// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros.imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx -//#define _mm_srli_si128( a, imm ) (__m128i)vmaxq_s8((int8x16_t)a, vextq_s8((int8x16_t)a, vdupq_n_s8(0), imm)) -#define _mm_srli_si128( a, imm ) (__m128i)vextq_s8((int8x16_t)a, vdupq_n_s8(0), (imm)) - -// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate. https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx -#define _mm_slli_si128( a, imm ) (__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)a, 16 - (imm)) - -// NEON does not provide a version of this function, here is an article about some ways to repro the results. -// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon -// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_epi8(__m128i _a) -{ - uint8x16_t input = (uint8x16_t)_a; - const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 }; - uint8x8_t mask_and = vdup_n_u8(0x80); - int8x8_t mask_shift = vld1_s8(xr); - - uint8x8_t lo = vget_low_u8(input); - uint8x8_t hi = vget_high_u8(input); - - lo = vand_u8(lo, mask_and); - lo = vshl_u8(lo, mask_shift); - - hi = vand_u8(hi, mask_and); - hi = vshl_u8(hi, mask_shift); - - lo = vpadd_u8(lo, lo); - lo = vpadd_u8(lo, lo); - lo = vpadd_u8(lo, lo); - - hi = vpadd_u8(hi, hi); - hi = vpadd_u8(hi, hi); - hi = vpadd_u8(hi, hi); - - return ((hi[0] << 8) | (lo[0] & 0xFF)); -} - - -// ****************************************** -// Math operations -// ****************************************** - -// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) -{ - return vsubq_f32(a, b); -} - -FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) -{ - return vsubq_f32(a, b); -} - -// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx -FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) -{ - return vsubq_s32(a, b); -} - -// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) -{ - return vaddq_f32(a, b); -} - -// adds the scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) -{ - const float32_t b0 = vgetq_lane_f32(b, 0); - float32x4_t value = vdupq_n_f32(0); - - //the upper values in the result must be the remnants of . - value = vsetq_lane_f32(b0, value, 0); - return vaddq_f32(a, value); -} - -// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) -{ - return vaddq_s32(a, b); -} - -// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) -{ - return (__m128i)vaddq_s16((int16x8_t)a, (int16x8_t)b); -} - -// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) -{ - return (__m128i)vmulq_s16((int16x8_t)a, (int16x8_t)b); -} - -// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi32 (__m128i a, __m128i b) -{ - return (__m128i)vmulq_s32((int32x4_t)a,(int32x4_t)b); -} - -// Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx -FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) -{ - return vmulq_f32(a, b); -} - -FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) -{ - return vmulq_f32(a, b); -} - -// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx -FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) -{ -#if defined(BUILD_IOS) - return vdivq_f32(vdupq_n_f32(1.0f),in); - -#endif - // Get an initial estimate of 1/in. - float32x4_t reciprocal = vrecpeq_f32(in); - - // We only return estimated 1/in. - // Newton-Raphon iteration shold be done in the outside of _mm_rcp_ps(). - - // TODO(LTE): We could delete these ifdef? - reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); - return reciprocal; - -} - -FORCE_INLINE __m128 _mm_rcp_ss(__m128 in) -{ - float32x4_t value; - float32x4_t result = in; - - value = _mm_rcp_ps(in); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - -// Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx -FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) -{ -#if defined(BUILD_IOS) - return vdivq_f32(a,b); -#else - float32x4_t reciprocal = _mm_rcp_ps(b); - - reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); - - // Add one more round of newton-raphson since NEON's reciprocal estimation has less accuracy compared to SSE2's rcp. - reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); - - // Another round for safety - reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); - - - return vmulq_f32(a, reciprocal); -#endif -} - -// Divides the scalar single-precision floating point value of a by b. https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx -FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) -{ - float32x4_t value; - float32x4_t result = a; - value = _mm_div_ps(a, b); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - -// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in. https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx -FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) -{ - - float32x4_t value = vrsqrteq_f32(in); - - // TODO: We must debug and ensure that rsqrt(0) and rsqrt(-0) yield proper values. - // Related code snippets can be found here: https://cpp.hotexamples.com/examples/-/-/vrsqrteq_f32/cpp-vrsqrteq_f32-function-examples.html - // If we adapt this function, we might be able to avoid special zero treatment in _mm_sqrt_ps - - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); - - // one more round to get better precision - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); - - // another round for safety - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); - - return value; -} - -FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) -{ - float32x4_t result = in; - - __m128 value = _mm_rsqrt_ps(in); - - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - - -// Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) -{ -#if defined(BUILD_IOS) - return vsqrtq_f32(in); -#else - __m128 reciprocal = _mm_rsqrt_ps(in); - - // We must treat sqrt(in == 0) in a special way. At this point reciprocal contains gargabe due to vrsqrteq_f32(0) returning +inf. - // We assign 0 to reciprocal wherever required. - const float32x4_t vzero = vdupq_n_f32(0.0f); - const uint32x4_t mask = vceqq_f32(in, vzero); - reciprocal = vbslq_f32(mask, vzero, reciprocal); - - // sqrt(x) = x * (1 / sqrt(x)) - return vmulq_f32(in, reciprocal); -#endif -} - -// Computes the approximation of the square root of the scalar single-precision floating point value of in. https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) -{ - float32x4_t value; - float32x4_t result = in; - - value = _mm_sqrt_ps(in); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - - -// Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx -FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) -{ -#if USE_PRECISE_MINMAX_IMPLEMENTATION - return vbslq_f32(vcltq_f32(b,a),a,b); -#else - // Faster, but would give inconsitent rendering(e.g. holes, NaN pixels) - return vmaxq_f32(a, b); -#endif -} - -// Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) -{ -#if USE_PRECISE_MINMAX_IMPLEMENTATION - return vbslq_f32(vcltq_f32(a,b),a,b); -#else - // Faster, but would give inconsitent rendering(e.g. holes, NaN pixels) - return vminq_f32(a, b); -#endif -} - -// Computes the maximum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) -{ - float32x4_t value; - float32x4_t result = a; - - value = _mm_max_ps(a, b); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - -// Computes the minimum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx -FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) -{ - float32x4_t value; - float32x4_t result = a; - - - value = _mm_min_ps(a, b); - return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); -} - -// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx -FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) -{ - return (__m128i)vminq_s16((int16x8_t)a, (int16x8_t)b); -} - -// epi versions of min/max -// Computes the pariwise maximums of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b ) -{ - return vmaxq_s32(a,b); -} - -// Computes the pariwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx -FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b ) -{ - return vminq_s32(a,b); -} - -// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) -{ - int16x8_t ret = vqdmulhq_s16((int16x8_t)a, (int16x8_t)b); - ret = vshrq_n_s16(ret, 1); - return (__m128i)ret; -} - -// Computes pairwise add of each argument as single-precision, floating-point values a and b. -//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx -FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b ) -{ -#if defined(__aarch64__) - return vpaddq_f32(a,b); -#else -// This does not work, no vpaddq... -// return (__m128) vpaddq_f32(a,b); - // - // get two f32x2_t values from a - // do vpadd - // put result in low half of f32x4 result - // - // get two f32x2_t values from b - // do vpadd - // put result in high half of f32x4 result - // - // combine - return vcombine_f32( vpadd_f32( vget_low_f32(a), vget_high_f32(a) ), vpadd_f32( vget_low_f32(b), vget_high_f32(b) ) ); -#endif -} - -// ****************************************** -// Compare operations -// ****************************************** - -// Compares for less than https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) -{ - return (__m128)vcltq_f32(a, b); -} - -FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) -{ - return (__m128) vmvnq_s32((__m128i)_mm_cmplt_ps(a,b)); -} - -// Compares for greater than. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) -{ - return (__m128)vcgtq_f32(a, b); -} - -FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) -{ - return (__m128) _mm_cmpgt_ps(a,b); -} - - -// Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) -{ - return (__m128)vcgeq_f32(a, b); -} - -// Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) -{ - return (__m128)vcleq_f32(a, b); -} - -// Compares for equality. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) -{ - return (__m128)vceqq_f32(a, b); -} - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) -{ - return (__m128i)vcltq_s32(a, b); -} - -FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) -{ - return (__m128i) vceqq_s32(a,b); -} - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) -{ - return (__m128i)vcgtq_s32(a, b); -} - -// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx -// see also: -// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean -// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics -FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b ) -{ - // Note: NEON does not have ordered compare builtin - // Need to compare a eq a and b eq b to check for NaN - // Do AND of results to get final - return (__m128) vreinterpretq_f32_u32( vandq_u32( vceqq_f32(a,a), vceqq_f32(b,b) ) ); -} - -// Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx -FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vcltq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using a greater than operation. : https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx -FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vcgtq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx -FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vcleq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx -FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vcgeq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx -FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vceqq_f32(a, b); - return vgetq_lane_u32(value, 0); -} - -// Compares the lower single-precision floating point scalar values of a and b using an inequality operation. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx -FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) -{ - uint32x4_t value; - - value = vceqq_f32(a, b); - return !vgetq_lane_u32(value, 0); -} - -// according to the documentation, these intrinsics behave the same as the non-'u' versions. We'll just alias them here. -#define _mm_ucomilt_ss _mm_comilt_ss -#define _mm_ucomile_ss _mm_comile_ss -#define _mm_ucomigt_ss _mm_comigt_ss -#define _mm_ucomige_ss _mm_comige_ss -#define _mm_ucomieq_ss _mm_comieq_ss -#define _mm_ucomineq_ss _mm_comineq_ss - -// ****************************************** -// Conversions -// ****************************************** - -// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) -{ - return vcvtq_s32_f32(a); -} - -// Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) -{ - return vcvtq_f32_s32(a); -} - -// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx -// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support! -// It is supported on ARMv8 however. -FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) -{ -#if 1 - return vcvtnq_s32_f32(a); -#else - __m128 half = vdupq_n_f32(0.5f); - const __m128 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31))); - const __m128 aPlusHalf = vaddq_f32(a, half); - const __m128 aRound = vsubq_f32(aPlusHalf, sign); - return vcvtq_s32_f32(aRound); -#endif -} - -// Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx -FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) -{ - return vgetq_lane_s32(a, 0); -} - -// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) -{ - __m128i result = vdupq_n_s32(0); - return vsetq_lane_s32(a, result, 0); -} - - -// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx -FORCE_INLINE __m128i _mm_castps_si128(__m128 a) -{ -#if defined(__aarch64__) - return (__m128i)a; -#else - return *(const __m128i *)&a; -#endif -} - -// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx -FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) -{ -#if defined(__aarch64__) - return (__m128)a; -#else - return *(const __m128 *)&a; -#endif -} - -// Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx -FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) -{ - return vld1q_s32((int32_t *)p); -} - -FORCE_INLINE __m128d _mm_castps_pd(const __m128 a) -{ - return *(const __m128d *)&a; -} - -FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) -{ - return *(const __m128d *)&a; -} -// ****************************************** -// Miscellaneous Operations -// ****************************************** - -// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) -{ - return (__m128i)vcombine_s8(vqmovn_s16((int16x8_t)a), vqmovn_s16((int16x8_t)b)); -} - -// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned integers and saturates. https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) -{ - return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); -} - -// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) -{ - return (__m128i)vcombine_s16(vqmovn_s32(a), vqmovn_s32(b)); -} - -// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) -{ - int8x8_t a1 = (int8x8_t)vget_low_s16((int16x8_t)a); - int8x8_t b1 = (int8x8_t)vget_low_s16((int16x8_t)b); - - int8x8x2_t result = vzip_s8(a1, b1); - - return (__m128i)vcombine_s8(result.val[0], result.val[1]); -} - -// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) -{ - int16x4_t a1 = vget_low_s16((int16x8_t)a); - int16x4_t b1 = vget_low_s16((int16x8_t)b); - - int16x4x2_t result = vzip_s16(a1, b1); - - return (__m128i)vcombine_s16(result.val[0], result.val[1]); -} - -// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the lower 2 signed or unsigned 32 - bit integers in b. https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) -{ - int32x2_t a1 = vget_low_s32(a); - int32x2_t b1 = vget_low_s32(b); - - int32x2x2_t result = vzip_s32(a1, b1); - - return vcombine_s32(result.val[0], result.val[1]); -} - -// Selects and interleaves the lower two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) -{ - float32x2x2_t result = vzip_f32(vget_low_f32(a), vget_low_f32(b)); - return vcombine_f32(result.val[0], result.val[1]); -} - -// Selects and interleaves the upper two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) -{ - float32x2x2_t result = vzip_f32(vget_high_f32(a), vget_high_f32(b)); - return vcombine_f32(result.val[0], result.val[1]); -} - -// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) -{ - int8x8_t a1 = (int8x8_t)vget_high_s16((int16x8_t)a); - int8x8_t b1 = (int8x8_t)vget_high_s16((int16x8_t)b); - - int8x8x2_t result = vzip_s8(a1, b1); - - return (__m128i)vcombine_s8(result.val[0], result.val[1]); -} - -// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) -{ - int16x4_t a1 = vget_high_s16((int16x8_t)a); - int16x4_t b1 = vget_high_s16((int16x8_t)b); - - int16x4x2_t result = vzip_s16(a1, b1); - - return (__m128i)vcombine_s16(result.val[0], result.val[1]); -} - -// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) -{ - int32x2_t a1 = vget_high_s32(a); - int32x2_t b1 = vget_high_s32(b); - - int32x2x2_t result = vzip_s32(a1, b1); - - return vcombine_s32(result.val[0], result.val[1]); -} - -// Extracts the selected signed or unsigned 16-bit integer from a and zero extends. https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx -#define _mm_extract_epi16( a, imm ) vgetq_lane_s16((int16x8_t)a, imm) - -// ****************************************** -// Streaming Extensions -// ****************************************** - -// Guarantees that every preceding store is globally visible before any subsequent store. https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx -FORCE_INLINE void _mm_sfence(void) -{ - __sync_synchronize(); -} - -// Stores the data in a to the address p without polluting the caches. If the cache line containing address p is already in the cache, the cache will be updated.Address p must be 16 - byte aligned. https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx -FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) -{ - *p = a; -} - -// Cache line containing p is flushed and invalidated from all caches in the coherency domain. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx -FORCE_INLINE void _mm_clflush(void const*p) -{ - // no corollary for Neon? -} - -FORCE_INLINE __m128i _mm_set_epi64x(int64_t a, int64_t b) -{ - // Stick to the flipped behavior of x86. - int64_t __attribute__((aligned(16))) data[2] = { b, a }; - return (__m128i)vld1q_s64(data); -} - -FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) -{ - return (__m128i)vmovq_n_s64(_i); -} - -#if defined(__aarch64__) -FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c) -{ - int32x4_t mask = vshrq_n_s32(__m128i(c),31); - return vbslq_f32( uint32x4_t(mask), b, a); -} - -FORCE_INLINE __m128i _mm_load4epu8_epi32(__m128i *ptr) -{ - uint8x8_t t0 = vld1_u8((uint8_t*)ptr); - uint16x8_t t1 = vmovl_u8(t0); - uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); - return vreinterpretq_s32_u32(t2); -} - -FORCE_INLINE __m128i _mm_load4epu16_epi32(__m128i *ptr) -{ - uint16x8_t t0 = vld1q_u16((uint16_t*)ptr); - uint32x4_t t1 = vmovl_u16(vget_low_u16(t0)); - return vreinterpretq_s32_u32(t1); -} - -FORCE_INLINE __m128i _mm_load4epi8_f32(__m128i *ptr) -{ - int8x8_t t0 = vld1_s8((int8_t*)ptr); - int16x8_t t1 = vmovl_s8(t0); - int32x4_t t2 = vmovl_s16(vget_low_s16(t1)); - float32x4_t t3 = vcvtq_f32_s32(t2); - return vreinterpretq_s32_f32(t3); -} - -FORCE_INLINE __m128i _mm_load4epu8_f32(__m128i *ptr) -{ - uint8x8_t t0 = vld1_u8((uint8_t*)ptr); - uint16x8_t t1 = vmovl_u8(t0); - uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); - return vreinterpretq_s32_u32(t2); -} - -FORCE_INLINE __m128i _mm_load4epi16_f32(__m128i *ptr) -{ - int16x8_t t0 = vld1q_s16((int16_t*)ptr); - int32x4_t t1 = vmovl_s16(vget_low_s16(t0)); - float32x4_t t2 = vcvtq_f32_s32(t1); - return vreinterpretq_s32_f32(t2); -} - -FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) -{ - return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); -} - -FORCE_INLINE __m128i _mm_stream_load_si128(__m128i* ptr) -{ - // No non-temporal load on a single register on ARM. - return vreinterpretq_s32_u8(vld1q_u8((uint8_t*)ptr)); -} - -FORCE_INLINE void _mm_stream_ps(float* ptr, __m128i a) -{ - // No non-temporal store on a single register on ARM. - vst1q_f32((float*)ptr, vreinterpretq_f32_s32(a)); -} - -FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) -{ - return vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); -} - -FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) -{ - return vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); -} - -FORCE_INLINE __m128 _mm_abs_ps(__m128 a) -{ - return vabsq_f32(a); -} - -FORCE_INLINE __m128 _mm_madd_ps(__m128 a, __m128 b, __m128 c) -{ - return vmlaq_f32(c, a, b); -} - -FORCE_INLINE __m128 _mm_msub_ps(__m128 a, __m128 b, __m128 c) -{ - return vmlsq_f32(c, a, b); -} - -FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) -{ - return vabsq_s32(a); -} -#endif //defined(__aarch64__) - -// Count the number of bits set to 1 in unsigned 32-bit integer a, and -// return that count in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 -FORCE_INLINE int _mm_popcnt_u32(unsigned int a) -{ - return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a))); -} - -// Count the number of bits set to 1 in unsigned 64-bit integer a, and -// return that count in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 -FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) -{ - return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a))); -} - -#endif diff --git a/thirdparty/embree-aarch64/common/math/affinespace.h b/thirdparty/embree-aarch64/common/math/affinespace.h deleted file mode 100644 index 32452fbe72..0000000000 --- a/thirdparty/embree-aarch64/common/math/affinespace.h +++ /dev/null @@ -1,361 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "linearspace2.h" -#include "linearspace3.h" -#include "quaternion.h" -#include "bbox.h" -#include "vec4.h" - -namespace embree -{ - #define VectorT typename L::Vector - #define ScalarT typename L::Vector::Scalar - - //////////////////////////////////////////////////////////////////////////////// - // Affine Space - //////////////////////////////////////////////////////////////////////////////// - - template - struct AffineSpaceT - { - L l; /*< linear part of affine space */ - VectorT p; /*< affine part of affine space */ - - //////////////////////////////////////////////////////////////////////////////// - // Constructors, Assignment, Cast, Copy Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline AffineSpaceT ( ) { } - __forceinline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; } - __forceinline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); } - __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; } - - __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {} - __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {} - - template __forceinline AffineSpaceT( const AffineSpaceT& s ) : l(s.l), p(s.p) {} - - //////////////////////////////////////////////////////////////////////////////// - // Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {} - __forceinline AffineSpaceT( OneTy ) : l(one), p(zero) {} - - /*! return matrix for scaling */ - static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); } - - /*! return matrix for translation */ - static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); } - - /*! return matrix for rotation, only in 2D */ - static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); } - - /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ - static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); } - - /*! return matrix for rotation around arbitrary axis and point, only in 3D */ - static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); } - - /*! return matrix for looking at given point, only in 3D */ - static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) { - VectorT Z = normalize(point-eye); - VectorT U = normalize(cross(up,Z)); - VectorT V = normalize(cross(Z,U)); - return AffineSpaceT(L(U,V,Z),eye); - } - - }; - - // template specialization to get correct identity matrix for type AffineSpace3fa - template<> - __forceinline AffineSpaceT::AffineSpaceT( OneTy ) : l(one), p(0.f, 0.f, 0.f, 1.f) {} - - //////////////////////////////////////////////////////////////////////////////// - // Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline AffineSpaceT operator -( const AffineSpaceT& a ) { return AffineSpaceT(-a.l,-a.p); } - template __forceinline AffineSpaceT operator +( const AffineSpaceT& a ) { return AffineSpaceT(+a.l,+a.p); } - template __forceinline AffineSpaceT rcp( const AffineSpaceT& a ) { L il = rcp(a.l); return AffineSpaceT(il,-(il*a.p)); } - - //////////////////////////////////////////////////////////////////////////////// - // Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline const AffineSpaceT operator +( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l+b.l,a.p+b.p); } - template __forceinline const AffineSpaceT operator -( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l-b.l,a.p-b.p); } - - template __forceinline const AffineSpaceT operator *( const ScalarT & a, const AffineSpaceT& b ) { return AffineSpaceT(a*b.l,a*b.p); } - template __forceinline const AffineSpaceT operator *( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l*b.l,a.l*b.p+a.p); } - template __forceinline const AffineSpaceT operator /( const AffineSpaceT& a, const AffineSpaceT& b ) { return a * rcp(b); } - template __forceinline const AffineSpaceT operator /( const AffineSpaceT& a, const ScalarT & b ) { return a * rcp(b); } - - template __forceinline AffineSpaceT& operator *=( AffineSpaceT& a, const AffineSpaceT& b ) { return a = a * b; } - template __forceinline AffineSpaceT& operator *=( AffineSpaceT& a, const ScalarT & b ) { return a = a * b; } - template __forceinline AffineSpaceT& operator /=( AffineSpaceT& a, const AffineSpaceT& b ) { return a = a / b; } - template __forceinline AffineSpaceT& operator /=( AffineSpaceT& a, const ScalarT & b ) { return a = a / b; } - - template __forceinline VectorT xfmPoint (const AffineSpaceT& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } - template __forceinline VectorT xfmVector(const AffineSpaceT& m, const VectorT& v) { return xfmVector(m.l,v); } - template __forceinline VectorT xfmNormal(const AffineSpaceT& m, const VectorT& n) { return xfmNormal(m.l,n); } - - __forceinline const BBox xfmBounds(const AffineSpaceT >& m, const BBox& b) - { - BBox3fa dst = empty; - const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0)); - const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1)); - const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2)); - const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3)); - const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4)); - const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5)); - const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6)); - const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7)); - return dst; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline bool operator ==( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l == b.l && a.p == b.p; } - template __forceinline bool operator !=( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l != b.l || a.p != b.p; } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline AffineSpaceT select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT& t, const AffineSpaceT& f ) { - return AffineSpaceT(select(s,t.l,f.l),select(s,t.p,f.p)); - } - - //////////////////////////////////////////////////////////////////////////////// - // Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT& m) { - return cout << "{ l = " << m.l << ", p = " << m.p << " }"; - } - - //////////////////////////////////////////////////////////////////////////////// - // Template Instantiations - //////////////////////////////////////////////////////////////////////////////// - - typedef AffineSpaceT AffineSpace2f; - typedef AffineSpaceT AffineSpace3f; - typedef AffineSpaceT AffineSpace3fa; - typedef AffineSpaceT AffineSpace3fx; - typedef AffineSpaceT AffineSpace3ff; - typedef AffineSpaceT OrthonormalSpace3f; - - template using AffineSpace3vf = AffineSpaceT>>>; - typedef AffineSpaceT>>> AffineSpace3vf4; - typedef AffineSpaceT>>> AffineSpace3vf8; - typedef AffineSpaceT>>> AffineSpace3vf16; - - template using AffineSpace3vff = AffineSpaceT>>>; - typedef AffineSpaceT>>> AffineSpace3vfa4; - typedef AffineSpaceT>>> AffineSpace3vfa8; - typedef AffineSpaceT>>> AffineSpace3vfa16; - - ////////////////////////////////////////////////////////////////////////////// - /// Interpolation - ////////////////////////////////////////////////////////////////////////////// - template - __forceinline AffineSpaceT lerp(const AffineSpaceT& M0, - const AffineSpaceT& M1, - const R& t) - { - return AffineSpaceT(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t)); - } - - // slerp interprets the 16 floats of the matrix M = D * R * S as components of - // three matrizes (D, R, S) that are interpolated individually. - template __forceinline AffineSpaceT>> - slerp(const AffineSpaceT>>& M0, - const AffineSpaceT>>& M1, - const T& t) - { - QuaternionT q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); - QuaternionT q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); - QuaternionT q = slerp(q0, q1, t); - - AffineSpaceT>> S = lerp(M0, M1, t); - AffineSpaceT>> D(one); - D.p.x = S.l.vx.y; - D.p.y = S.l.vx.z; - D.p.z = S.l.vy.z; - S.l.vx.y = 0; - S.l.vx.z = 0; - S.l.vy.z = 0; - - AffineSpaceT>> R = LinearSpace3>(q); - return D * R * S; - } - - // this is a specialized version for Vec3fa because that does - // not play along nicely with the other templated Vec3/Vec4 types - __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0, - const AffineSpace3ff& M1, - const float& t) - { - Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); - Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); - Quaternion3f q = slerp(q0, q1, t); - - AffineSpace3fa S = lerp(M0, M1, t); - AffineSpace3fa D(one); - D.p.x = S.l.vx.y; - D.p.y = S.l.vx.z; - D.p.z = S.l.vy.z; - S.l.vx.y = 0; - S.l.vx.z = 0; - S.l.vy.z = 0; - - AffineSpace3fa R = LinearSpace3fa(q); - return D * R * S; - } - - __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd) - { - // compute affine transform from quaternion decomposition - Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); - AffineSpace3fa M = qd; - AffineSpace3fa D(one); - D.p.x = M.l.vx.y; - D.p.y = M.l.vx.z; - D.p.z = M.l.vy.z; - M.l.vx.y = 0; - M.l.vx.z = 0; - M.l.vy.z = 0; - AffineSpace3fa R = LinearSpace3fa(q); - return D * R * M; - } - - __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S) - { - q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); - S = qd; - T.x = qd.l.vx.y; - T.y = qd.l.vx.z; - T.z = qd.l.vy.z; - S.l.vx.y = 0; - S.l.vx.z = 0; - S.l.vy.z = 0; - } - - __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S) - { - AffineSpace3ff M = S; - M.l.vx.w = q.i; - M.l.vy.w = q.j; - M.l.vz.w = q.k; - M.p.w = q.r; - M.l.vx.y = T.x; - M.l.vx.z = T.y; - M.l.vy.z = T.z; - return M; - } - - struct __aligned(16) QuaternionDecomposition - { - float scale_x = 1.f; - float scale_y = 1.f; - float scale_z = 1.f; - float skew_xy = 0.f; - float skew_xz = 0.f; - float skew_yz = 0.f; - float shift_x = 0.f; - float shift_y = 0.f; - float shift_z = 0.f; - float quaternion_r = 1.f; - float quaternion_i = 0.f; - float quaternion_j = 0.f; - float quaternion_k = 0.f; - float translation_x = 0.f; - float translation_y = 0.f; - float translation_z = 0.f; - }; - - __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M) - { - QuaternionDecomposition qd; - qd.scale_x = M.l.vx.x; - qd.scale_y = M.l.vy.y; - qd.scale_z = M.l.vz.z; - qd.shift_x = M.p.x; - qd.shift_y = M.p.y; - qd.shift_z = M.p.z; - qd.translation_x = M.l.vx.y; - qd.translation_y = M.l.vx.z; - qd.translation_z = M.l.vy.z; - qd.skew_xy = M.l.vy.x; - qd.skew_xz = M.l.vz.x; - qd.skew_yz = M.l.vz.y; - qd.quaternion_r = M.p.w; - qd.quaternion_i = M.l.vx.w; - qd.quaternion_j = M.l.vy.w; - qd.quaternion_k = M.l.vz.w; - return qd; - } - - //////////////////////////////////////////////////////////////////////////////// - /* - * ! Template Specialization for 2D: return matrix for rotation around point - * (rotation around arbitrarty vector is not meaningful in 2D) - */ - template<> __forceinline - AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) { - return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p); - } - - //////////////////////////////////////////////////////////////////////////////// - // Similarity Transform - // - // checks, if M is a similarity transformation, i.e if there exists a factor D - // such that for all x,y: distance(Mx, My) = D * distance(x, y) - //////////////////////////////////////////////////////////////////////////////// - __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D) - { - if (D) *D = 0.f; - if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false; - if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false; - if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false; - - const float D_x = dot(M.l.vx, M.l.vx); - const float D_y = dot(M.l.vy, M.l.vy); - const float D_z = dot(M.l.vz, M.l.vz); - - if (abs(D_x - D_y) > 1e-5f || - abs(D_x - D_z) > 1e-5f || - abs(D_y - D_z) > 1e-5f) - return false; - - if (D) *D = sqrtf(D_x); - return true; - } - - __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr) - { - Vec3fa::storeu(&ptr->l.vx, source.l.vx); - Vec3fa::storeu(&ptr->l.vy, source.l.vy); - Vec3fa::storeu(&ptr->l.vz, source.l.vz); - Vec3fa::storeu(&ptr->p, source.p); - } - - __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr) - { - AffineSpace3fa space; - space.l.vx = Vec3fa::loadu(&ptr->l.vx); - space.l.vy = Vec3fa::loadu(&ptr->l.vy); - space.l.vz = Vec3fa::loadu(&ptr->l.vz); - space.p = Vec3fa::loadu(&ptr->p); - return space; - } - - #undef VectorT - #undef ScalarT -} diff --git a/thirdparty/embree-aarch64/common/math/bbox.h b/thirdparty/embree-aarch64/common/math/bbox.h deleted file mode 100644 index 29bb13912b..0000000000 --- a/thirdparty/embree-aarch64/common/math/bbox.h +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec2.h" -#include "vec3.h" - -namespace embree -{ - namespace internal { - - template __forceinline T divideByTwo(const T& v) { return v / T(2); } - template <> __forceinline float divideByTwo(const float& v) { return v * 0.5f; } - template <> __forceinline double divideByTwo(const double& v) { return v * 0.5; } - - } // namespace internal - template - struct BBox - { - T lower, upper; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline BBox ( ) { } - template - __forceinline BBox ( const BBox& other ) : lower(other.lower), upper(other.upper) {} - __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; } - - __forceinline BBox ( const T& v ) : lower(v), upper(v) {} - __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Extending Bounds - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } - __forceinline const BBox& extend(const T & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } - - /*! tests if box is empty */ - __forceinline bool empty() const { for (int i=0; i upper[i]) return true; return false; } - - /*! computes the size of the box */ - __forceinline T size() const { return upper - lower; } - - /*! computes the center of the box */ - __forceinline T center() const { return internal::divideByTwo(lower+upper); } - - /*! computes twice the center of the box */ - __forceinline T center2() const { return lower+upper; } - - /*! merges two boxes */ - __forceinline static const BBox merge (const BBox& a, const BBox& b) { - return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); - } - - /*! enlarge box by some scaling factor */ - __forceinline BBox enlarge_by(const float a) const { - return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} - __forceinline BBox( FullTy ) : lower(neg_inf), upper(pos_inf) {} - __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} - __forceinline BBox( TrueTy ) : lower(neg_inf), upper(pos_inf) {} - __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} - __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} - }; - - template<> __forceinline bool BBox::empty() const { - return lower > upper; - } - -#if defined(__SSE__) || defined(__ARM_NEON) - template<> __forceinline bool BBox::empty() const { - return !all(le_mask(lower,upper)); - } - template<> __forceinline bool BBox::empty() const { - return !all(le_mask(lower,upper)); - } -#endif - - /*! tests if box is finite */ - __forceinline bool isvalid( const BBox& v ) { - return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE))); - } - - /*! tests if box is finite and non-empty*/ - __forceinline bool isvalid_non_empty( const BBox& v ) { - return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper)); - } - - /*! tests if box has finite entries */ - __forceinline bool is_finite( const BBox& b) { - return is_finite(b.lower) && is_finite(b.upper); - } - - /*! test if point contained in box */ - __forceinline bool inside ( const BBox& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); } - - /*! computes the center of the box */ - template __forceinline const T center2(const BBox& box) { return box.lower + box.upper; } - template __forceinline const T center (const BBox& box) { return internal::divideByTwo(center2(box)); } - - /*! computes the volume of a bounding box */ - __forceinline float volume ( const BBox& b ) { return reduce_mul(b.size()); } - __forceinline float safeVolume( const BBox& b ) { if (b.empty()) return 0.0f; else return volume(b); } - - /*! computes the volume of a bounding box */ - __forceinline float volume( const BBox& b ) { return reduce_mul(b.size()); } - - /*! computes the surface area of a bounding box */ - template __forceinline const T area( const BBox >& b ) { const Vec2 d = b.size(); return d.x*d.y; } - - template __forceinline const T halfArea( const BBox >& b ) { return halfArea(b.size()); } - template __forceinline const T area( const BBox >& b ) { return T(2)*halfArea(b); } - - __forceinline float halfArea( const BBox& b ) { return halfArea(b.size()); } - __forceinline float area( const BBox& b ) { return 2.0f*halfArea(b); } - - __forceinline float halfArea( const BBox& b ) { return halfArea(b.size()); } - __forceinline float area( const BBox& b ) { return 2.0f*halfArea(b); } - - template __forceinline float safeArea( const BBox& b ) { if (b.empty()) return 0.0f; else return area(b); } - - template __forceinline float expectedApproxHalfArea(const BBox& box) { - return halfArea(box); - } - - /*! merges bounding boxes and points */ - template __forceinline const BBox merge( const BBox& a, const T& b ) { return BBox(min(a.lower, b ), max(a.upper, b )); } - template __forceinline const BBox merge( const T& a, const BBox& b ) { return BBox(min(a , b.lower), max(a , b.upper)); } - template __forceinline const BBox merge( const BBox& a, const BBox& b ) { return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); } - - /*! Merges three boxes. */ - template __forceinline const BBox merge( const BBox& a, const BBox& b, const BBox& c ) { return merge(a,merge(b,c)); } - - /*! Merges four boxes. */ - template __forceinline BBox merge(const BBox& a, const BBox& b, const BBox& c, const BBox& d) { - return merge(merge(a,b),merge(c,d)); - } - - /*! Comparison Operators */ - template __forceinline bool operator==( const BBox& a, const BBox& b ) { return a.lower == b.lower && a.upper == b.upper; } - template __forceinline bool operator!=( const BBox& a, const BBox& b ) { return a.lower != b.lower || a.upper != b.upper; } - - /*! scaling */ - template __forceinline BBox operator *( const float& a, const BBox& b ) { return BBox(a*b.lower,a*b.upper); } - template __forceinline BBox operator *( const T& a, const BBox& b ) { return BBox(a*b.lower,a*b.upper); } - - /*! translations */ - template __forceinline BBox operator +( const BBox& a, const BBox& b ) { return BBox(a.lower+b.lower,a.upper+b.upper); } - template __forceinline BBox operator -( const BBox& a, const BBox& b ) { return BBox(a.lower-b.lower,a.upper-b.upper); } - template __forceinline BBox operator +( const BBox& a, const T & b ) { return BBox(a.lower+b ,a.upper+b ); } - template __forceinline BBox operator -( const BBox& a, const T & b ) { return BBox(a.lower-b ,a.upper-b ); } - - /*! extension */ - template __forceinline BBox enlarge(const BBox& a, const T& b) { return BBox(a.lower-b, a.upper+b); } - - /*! intersect bounding boxes */ - template __forceinline const BBox intersect( const BBox& a, const BBox& b ) { return BBox(max(a.lower, b.lower), min(a.upper, b.upper)); } - template __forceinline const BBox intersect( const BBox& a, const BBox& b, const BBox& c ) { return intersect(a,intersect(b,c)); } - template __forceinline const BBox intersect( const BBox& a, const BBox& b, const BBox& c, const BBox& d ) { return intersect(intersect(a,b),intersect(c,d)); } - - /*! subtract bounds from each other */ - template __forceinline void subtract(const BBox& a, const BBox& b, BBox& c, BBox& d) - { - c.lower = a.lower; - c.upper = min(a.upper,b.lower); - d.lower = max(a.lower,b.upper); - d.upper = a.upper; - } - - /*! tests if bounding boxes (and points) are disjoint (empty intersection) */ - template __inline bool disjoint( const BBox& a, const BBox& b ) { return intersect(a,b).empty(); } - template __inline bool disjoint( const BBox& a, const T& b ) { return disjoint(a,BBox(b)); } - template __inline bool disjoint( const T& a, const BBox& b ) { return disjoint(BBox(a),b); } - - /*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */ - template __inline bool conjoint( const BBox& a, const BBox& b ) { return !intersect(a,b).empty(); } - template __inline bool conjoint( const BBox& a, const T& b ) { return conjoint(a,BBox(b)); } - template __inline bool conjoint( const T& a, const BBox& b ) { return conjoint(BBox(a),b); } - - /*! subset relation */ - template __inline bool subset( const BBox& a, const BBox& b ) - { - for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false; - for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false; - return true; - } - - template<> __inline bool subset( const BBox& a, const BBox& b ) { - return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); - } - - template<> __inline bool subset( const BBox& a, const BBox& b ) { - return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); - } - - /*! blending */ - template - __forceinline BBox lerp(const BBox& b0, const BBox& b1, const float t) { - return BBox(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t)); - } - - /*! output operator */ - template __forceinline embree_ostream operator<<(embree_ostream cout, const BBox& box) { - return cout << "[" << box.lower << "; " << box.upper << "]"; - } - - /*! default template instantiations */ - typedef BBox BBox1f; - typedef BBox BBox2f; - typedef BBox BBox2fa; - typedef BBox BBox3f; - typedef BBox BBox3fa; - typedef BBox BBox3fx; - typedef BBox BBox3ff; -} - -//////////////////////////////////////////////////////////////////////////////// -/// SSE / AVX / MIC specializations -//////////////////////////////////////////////////////////////////////////////// - -#if defined (__SSE__) || defined(__ARM_NEON) -#include "../simd/sse.h" -#endif - -#if defined (__AVX__) -#include "../simd/avx.h" -#endif - -#if defined(__AVX512F__) -#include "../simd/avx512.h" -#endif - -namespace embree -{ - template - __forceinline BBox>> transpose(const BBox3fa* bounds); - - template<> - __forceinline BBox> transpose<4>(const BBox3fa* bounds) - { - BBox> dest; - - transpose((vfloat4&)bounds[0].lower, - (vfloat4&)bounds[1].lower, - (vfloat4&)bounds[2].lower, - (vfloat4&)bounds[3].lower, - dest.lower.x, - dest.lower.y, - dest.lower.z); - - transpose((vfloat4&)bounds[0].upper, - (vfloat4&)bounds[1].upper, - (vfloat4&)bounds[2].upper, - (vfloat4&)bounds[3].upper, - dest.upper.x, - dest.upper.y, - dest.upper.z); - - return dest; - } - -#if defined(__AVX__) - template<> - __forceinline BBox> transpose<8>(const BBox3fa* bounds) - { - BBox> dest; - - transpose((vfloat4&)bounds[0].lower, - (vfloat4&)bounds[1].lower, - (vfloat4&)bounds[2].lower, - (vfloat4&)bounds[3].lower, - (vfloat4&)bounds[4].lower, - (vfloat4&)bounds[5].lower, - (vfloat4&)bounds[6].lower, - (vfloat4&)bounds[7].lower, - dest.lower.x, - dest.lower.y, - dest.lower.z); - - transpose((vfloat4&)bounds[0].upper, - (vfloat4&)bounds[1].upper, - (vfloat4&)bounds[2].upper, - (vfloat4&)bounds[3].upper, - (vfloat4&)bounds[4].upper, - (vfloat4&)bounds[5].upper, - (vfloat4&)bounds[6].upper, - (vfloat4&)bounds[7].upper, - dest.upper.x, - dest.upper.y, - dest.upper.z); - - return dest; - } -#endif - - template - __forceinline BBox3fa merge(const BBox3fa* bounds); - - template<> - __forceinline BBox3fa merge<4>(const BBox3fa* bounds) - { - const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower), - min(bounds[2].lower,bounds[3].lower)); - const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper), - max(bounds[2].upper,bounds[3].upper)); - return BBox3fa(lower,upper); - } - -#if defined(__AVX__) - template<> - __forceinline BBox3fa merge<8>(const BBox3fa* bounds) - { - const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)), - min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower))); - const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)), - max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper))); - return BBox3fa(lower,upper); - } -#endif -} - diff --git a/thirdparty/embree-aarch64/common/math/col3.h b/thirdparty/embree-aarch64/common/math/col3.h deleted file mode 100644 index f52015fb88..0000000000 --- a/thirdparty/embree-aarch64/common/math/col3.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// RGB Color Class - //////////////////////////////////////////////////////////////////////////////// - - template struct Col3 - { - T r, g, b; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Col3 ( ) { } - __forceinline Col3 ( const Col3& other ) { r = other.r; g = other.g; b = other.b; } - __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; } - - __forceinline explicit Col3 (const T& v) : r(v), g(v), b(v) {} - __forceinline Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Col3 (ZeroTy) : r(zero) , g(zero) , b(zero) {} - __forceinline Col3 (OneTy) : r(one) , g(one) , b(one) {} - __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {} - __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {} - }; - - /*! output operator */ - template __forceinline embree_ostream operator<<(embree_ostream cout, const Col3& a) { - return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; - } - - /*! default template instantiations */ - typedef Col3 Col3uc; - typedef Col3 Col3f; -} diff --git a/thirdparty/embree-aarch64/common/math/col4.h b/thirdparty/embree-aarch64/common/math/col4.h deleted file mode 100644 index 90df293f8e..0000000000 --- a/thirdparty/embree-aarch64/common/math/col4.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// RGBA Color Class - //////////////////////////////////////////////////////////////////////////////// - - template struct Col4 - { - T r, g, b, a; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Col4 ( ) { } - __forceinline Col4 ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; } - __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; } - - __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {} - __forceinline Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Col4 (ZeroTy) : r(zero) , g(zero) , b(zero) , a(zero) {} - __forceinline Col4 (OneTy) : r(one) , g(one) , b(one) , a(one) {} - __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {} - __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {} - }; - - /*! output operator */ - template __forceinline embree_ostream operator<<(embree_ostream cout, const Col4& a) { - return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")"; - } - - /*! default template instantiations */ - typedef Col4 Col4uc; - typedef Col4 Col4f; -} diff --git a/thirdparty/embree-aarch64/common/math/color.h b/thirdparty/embree-aarch64/common/math/color.h deleted file mode 100644 index c3083e4fc0..0000000000 --- a/thirdparty/embree-aarch64/common/math/color.h +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "constants.h" -#include "col3.h" -#include "col4.h" - -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE RGBA Color Class - //////////////////////////////////////////////////////////////////////////////// - - struct Color4 - { - union { - __m128 m128; - struct { float r,g,b,a; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Color4 () {} - __forceinline Color4 ( const __m128 a ) : m128(a) {} - - __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {} - __forceinline Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {} - - __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } - __forceinline explicit Color4 ( const Col3f& other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); } - __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } - __forceinline explicit Color4 ( const Col4f& other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); } - - __forceinline Color4 ( const Color4& other ) : m128(other.m128) {} - __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; } - - __forceinline operator const __m128&() const { return m128; } - __forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Set - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } - __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; } - __forceinline void set(Col3uc& d) const - { - vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (uint8_t)(s[0]); - d.g = (uint8_t)(s[1]); - d.b = (uint8_t)(s[2]); - } - __forceinline void set(Col4uc& d) const - { - vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (uint8_t)(s[0]); - d.g = (uint8_t)(s[1]); - d.b = (uint8_t)(s[2]); - d.a = (uint8_t)(s[3]); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Color4( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} - __forceinline Color4( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - }; - - //////////////////////////////////////////////////////////////////////////////// - /// SSE RGB Color Class - //////////////////////////////////////////////////////////////////////////////// - - struct Color - { - union { - __m128 m128; - struct { float r,g,b; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Color () {} - __forceinline Color ( const __m128 a ) : m128(a) {} - - __forceinline explicit Color (const float v) : m128(_mm_set1_ps(v)) {} - __forceinline Color (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {} - - __forceinline Color ( const Color& other ) : m128(other.m128) {} - __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; } - - __forceinline Color ( const Color4& other ) : m128(other.m128) {} - __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; } - - __forceinline operator const __m128&() const { return m128; } - __forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Set - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } - __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; } - __forceinline void set(Col3uc& d) const - { - vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (uint8_t)(s[0]); - d.g = (uint8_t)(s[1]); - d.b = (uint8_t)(s[2]); - } - __forceinline void set(Col4uc& d) const - { - vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (uint8_t)(s[0]); - d.g = (uint8_t)(s[1]); - d.b = (uint8_t)(s[2]); - d.a = 255; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Color( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} - __forceinline Color( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const Color operator +( const Color& a ) { return a; } - __forceinline const Color operator -( const Color& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); - } - __forceinline const Color abs ( const Color& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); - } - __forceinline const Color rcp ( const Color& a ) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - __m128 reciprocal = _mm_rcp_ps(a.m128); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - return (const Color)reciprocal; -#else -#if defined(__AVX512VL__) - const Color r = _mm_rcp14_ps(a.m128); -#else - const Color r = _mm_rcp_ps(a.m128); -#endif - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#endif //defined(__aarch64__) && defined(BUILD_IOS) - } - __forceinline const Color rsqrt( const Color& a ) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - __m128 r = _mm_rsqrt_ps(a.m128); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - return r; -#else - -#if defined(__AVX512VL__) - __m128 r = _mm_rsqrt14_ps(a.m128); -#else - __m128 r = _mm_rsqrt_ps(a.m128); -#endif - return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); - -#endif //defined(__aarch64__) && defined(BUILD_IOS) - } - __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); } - __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); } - __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); } - __forceinline const Color operator *( const Color& a, const float b ) { return a * Color(b); } - __forceinline const Color operator *( const float a, const Color& b ) { return Color(a) * b; } - __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); } - __forceinline const Color operator /( const Color& a, const float b ) { return a * rcp(b); } - - __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); } - __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; } - __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; } - __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; } - __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; } - __forceinline const Color operator*=(Color& a, const float b ) { return a = a * b; } - __forceinline const Color operator/=(Color& a, const float b ) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; } - __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; } - __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); } - __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } - __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } - __forceinline bool operator < ( const Color& a, const Color& b ) { - if (a.r != b.r) return a.r < b.r; - if (a.g != b.g) return a.g < b.g; - if (a.b != b.b) return a.b < b.b; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const Color select( bool s, const Color& t, const Color& f ) { - __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); - return blendv_ps(f, t, mask); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Special Operators - //////////////////////////////////////////////////////////////////////////////// - - /*! computes luminance of a color */ - __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); } - - /*! output operator */ - __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) { - return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; - } -} diff --git a/thirdparty/embree-aarch64/common/math/constants.cpp b/thirdparty/embree-aarch64/common/math/constants.cpp deleted file mode 100644 index eeff131664..0000000000 --- a/thirdparty/embree-aarch64/common/math/constants.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#if defined(__aarch64__) -#include -#endif - -#include "constants.h" - -namespace embree -{ - TrueTy True; - FalseTy False; - ZeroTy zero; - OneTy one; - NegInfTy neg_inf; - PosInfTy inf; - PosInfTy pos_inf; - NaNTy nan; - UlpTy ulp; - PiTy pi; - OneOverPiTy one_over_pi; - TwoPiTy two_pi; - OneOverTwoPiTy one_over_two_pi; - FourPiTy four_pi; - OneOverFourPiTy one_over_four_pi; - StepTy step; - ReverseStepTy reverse_step; - EmptyTy empty; - UndefinedTy undefined; - -#if defined(__aarch64__) -const uint32x4_t movemask_mask = { 1, 2, 4, 8 }; -const uint32x4_t vzero = { 0, 0, 0, 0 }; -const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; -const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; -const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }; -const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; -const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }; -const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; -const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; -const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; -const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; -const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }; -const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }; -const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; -const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; -const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; -const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; -const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; -const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11}; -const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15}; -const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7}; -const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f }; -const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f }; -const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY }; -const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY }; -#endif - -} diff --git a/thirdparty/embree-aarch64/common/math/constants.h b/thirdparty/embree-aarch64/common/math/constants.h deleted file mode 100644 index e80abec80f..0000000000 --- a/thirdparty/embree-aarch64/common/math/constants.h +++ /dev/null @@ -1,239 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" - -#include - -#define _USE_MATH_DEFINES -#include // using cmath causes issues under Windows -#include -#include - -// Math constants may not be defined in libcxx + mingw + strict C++ standard -#if defined(__MINGW32__) - -// TODO(LTE): use constexpr -#ifndef M_PI -#define M_PI 3.14159265358979323846 -#endif -#ifndef M_1_PI -#define M_1_PI 0.31830988618379067154 -#endif - -#endif // __MINGW32__ - -namespace embree -{ - static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f; - static MAYBE_UNUSED const float min_rcp_input = 1E-18f; // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail - - /* we consider floating point numbers in that range as valid input numbers */ - static MAYBE_UNUSED float FLT_LARGE = 1.844E18f; - - struct TrueTy { - __forceinline operator bool( ) const { return true; } - }; - - extern MAYBE_UNUSED TrueTy True; - - struct FalseTy { - __forceinline operator bool( ) const { return false; } - }; - - extern MAYBE_UNUSED FalseTy False; - - struct ZeroTy - { - __forceinline operator double ( ) const { return 0; } - __forceinline operator float ( ) const { return 0; } - __forceinline operator long long( ) const { return 0; } - __forceinline operator unsigned long long( ) const { return 0; } - __forceinline operator long ( ) const { return 0; } - __forceinline operator unsigned long ( ) const { return 0; } - __forceinline operator int ( ) const { return 0; } - __forceinline operator unsigned int ( ) const { return 0; } - __forceinline operator short ( ) const { return 0; } - __forceinline operator unsigned short ( ) const { return 0; } - __forceinline operator int8_t ( ) const { return 0; } - __forceinline operator uint8_t ( ) const { return 0; } - }; - - extern MAYBE_UNUSED ZeroTy zero; - - struct OneTy - { - __forceinline operator double ( ) const { return 1; } - __forceinline operator float ( ) const { return 1; } - __forceinline operator long long( ) const { return 1; } - __forceinline operator unsigned long long( ) const { return 1; } - __forceinline operator long ( ) const { return 1; } - __forceinline operator unsigned long ( ) const { return 1; } - __forceinline operator int ( ) const { return 1; } - __forceinline operator unsigned int ( ) const { return 1; } - __forceinline operator short ( ) const { return 1; } - __forceinline operator unsigned short ( ) const { return 1; } - __forceinline operator int8_t ( ) const { return 1; } - __forceinline operator uint8_t ( ) const { return 1; } - }; - - extern MAYBE_UNUSED OneTy one; - - struct NegInfTy - { - __forceinline operator double ( ) const { return -std::numeric_limits::infinity(); } - __forceinline operator float ( ) const { return -std::numeric_limits::infinity(); } - __forceinline operator long long( ) const { return std::numeric_limits::min(); } - __forceinline operator unsigned long long( ) const { return std::numeric_limits::min(); } - __forceinline operator long ( ) const { return std::numeric_limits::min(); } - __forceinline operator unsigned long ( ) const { return std::numeric_limits::min(); } - __forceinline operator int ( ) const { return std::numeric_limits::min(); } - __forceinline operator unsigned int ( ) const { return std::numeric_limits::min(); } - __forceinline operator short ( ) const { return std::numeric_limits::min(); } - __forceinline operator unsigned short ( ) const { return std::numeric_limits::min(); } - __forceinline operator int8_t ( ) const { return std::numeric_limits::min(); } - __forceinline operator uint8_t ( ) const { return std::numeric_limits::min(); } - - }; - - extern MAYBE_UNUSED NegInfTy neg_inf; - - struct PosInfTy - { - __forceinline operator double ( ) const { return std::numeric_limits::infinity(); } - __forceinline operator float ( ) const { return std::numeric_limits::infinity(); } - __forceinline operator long long( ) const { return std::numeric_limits::max(); } - __forceinline operator unsigned long long( ) const { return std::numeric_limits::max(); } - __forceinline operator long ( ) const { return std::numeric_limits::max(); } - __forceinline operator unsigned long ( ) const { return std::numeric_limits::max(); } - __forceinline operator int ( ) const { return std::numeric_limits::max(); } - __forceinline operator unsigned int ( ) const { return std::numeric_limits::max(); } - __forceinline operator short ( ) const { return std::numeric_limits::max(); } - __forceinline operator unsigned short ( ) const { return std::numeric_limits::max(); } - __forceinline operator int8_t ( ) const { return std::numeric_limits::max(); } - __forceinline operator uint8_t ( ) const { return std::numeric_limits::max(); } - }; - - extern MAYBE_UNUSED PosInfTy inf; - extern MAYBE_UNUSED PosInfTy pos_inf; - - struct NaNTy - { - __forceinline operator double( ) const { return std::numeric_limits::quiet_NaN(); } - __forceinline operator float ( ) const { return std::numeric_limits::quiet_NaN(); } - }; - - extern MAYBE_UNUSED NaNTy nan; - - struct UlpTy - { - __forceinline operator double( ) const { return std::numeric_limits::epsilon(); } - __forceinline operator float ( ) const { return std::numeric_limits::epsilon(); } - }; - - extern MAYBE_UNUSED UlpTy ulp; - - struct PiTy - { - __forceinline operator double( ) const { return double(M_PI); } - __forceinline operator float ( ) const { return float(M_PI); } - }; - - extern MAYBE_UNUSED PiTy pi; - - struct OneOverPiTy - { - __forceinline operator double( ) const { return double(M_1_PI); } - __forceinline operator float ( ) const { return float(M_1_PI); } - }; - - extern MAYBE_UNUSED OneOverPiTy one_over_pi; - - struct TwoPiTy - { - __forceinline operator double( ) const { return double(2.0*M_PI); } - __forceinline operator float ( ) const { return float(2.0*M_PI); } - }; - - extern MAYBE_UNUSED TwoPiTy two_pi; - - struct OneOverTwoPiTy - { - __forceinline operator double( ) const { return double(0.5*M_1_PI); } - __forceinline operator float ( ) const { return float(0.5*M_1_PI); } - }; - - extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi; - - struct FourPiTy - { - __forceinline operator double( ) const { return double(4.0*M_PI); } - __forceinline operator float ( ) const { return float(4.0*M_PI); } - }; - - extern MAYBE_UNUSED FourPiTy four_pi; - - struct OneOverFourPiTy - { - __forceinline operator double( ) const { return double(0.25*M_1_PI); } - __forceinline operator float ( ) const { return float(0.25*M_1_PI); } - }; - - extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi; - - struct StepTy { - }; - - extern MAYBE_UNUSED StepTy step; - - struct ReverseStepTy { - }; - - extern MAYBE_UNUSED ReverseStepTy reverse_step; - - struct EmptyTy { - }; - - extern MAYBE_UNUSED EmptyTy empty; - - struct FullTy { - }; - - extern MAYBE_UNUSED FullTy full; - - struct UndefinedTy { - }; - - extern MAYBE_UNUSED UndefinedTy undefined; - -#if defined(__aarch64__) - extern const uint32x4_t movemask_mask; - extern const uint32x4_t vzero; - extern const uint32x4_t v0x80000000; - extern const uint32x4_t v0x7fffffff; - extern const uint32x4_t v000F; - extern const uint32x4_t v00F0; - extern const uint32x4_t v00FF; - extern const uint32x4_t v0F00; - extern const uint32x4_t v0F0F; - extern const uint32x4_t v0FF0; - extern const uint32x4_t v0FFF; - extern const uint32x4_t vF000; - extern const uint32x4_t vF00F; - extern const uint32x4_t vF0F0; - extern const uint32x4_t vF0FF; - extern const uint32x4_t vFF00; - extern const uint32x4_t vFF0F; - extern const uint32x4_t vFFF0; - extern const uint32x4_t vFFFF; - extern const uint8x16_t v0022; - extern const uint8x16_t v1133; - extern const uint8x16_t v0101; - extern const float32x4_t vOne; - extern const float32x4_t vmOne; - extern const float32x4_t vInf; - extern const float32x4_t vmInf; -#endif -} diff --git a/thirdparty/embree-aarch64/common/math/interval.h b/thirdparty/embree-aarch64/common/math/interval.h deleted file mode 100644 index f06478e881..0000000000 --- a/thirdparty/embree-aarch64/common/math/interval.h +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec2.h" -#include "vec3.h" -#include "bbox.h" - -namespace embree -{ - template - struct Interval - { - V lower, upper; - - __forceinline Interval() {} - __forceinline Interval ( const Interval& other ) { lower = other.lower; upper = other.upper; } - __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; } - - __forceinline Interval(const V& a) : lower(a), upper(a) {} - __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {} - __forceinline Interval(const BBox& a) : lower(a.lower), upper(a.upper) {} - - /*! tests if box is empty */ - //__forceinline bool empty() const { return lower > upper; } - - /*! computes the size of the interval */ - __forceinline V size() const { return upper - lower; } - - __forceinline V center() const { return 0.5f*(lower+upper); } - - __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } - __forceinline const Interval& extend(const V & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } - - __forceinline friend Interval operator +( const Interval& a, const Interval& b ) { - return Interval(a.lower+b.lower,a.upper+b.upper); - } - - __forceinline friend Interval operator -( const Interval& a, const Interval& b ) { - return Interval(a.lower-b.upper,a.upper-b.lower); - } - - __forceinline friend Interval operator -( const Interval& a, const V& b ) { - return Interval(a.lower-b,a.upper-b); - } - - __forceinline friend Interval operator *( const Interval& a, const Interval& b ) - { - const V ll = a.lower*b.lower; - const V lu = a.lower*b.upper; - const V ul = a.upper*b.lower; - const V uu = a.upper*b.upper; - return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu)); - } - - __forceinline friend Interval merge( const Interval& a, const Interval& b) { - return Interval(min(a.lower,b.lower),max(a.upper,b.upper)); - } - - __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) { - return merge(merge(a,b),c); - } - - __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) { - return merge(merge(a,b),merge(c,d)); - } - - /*! intersect bounding boxes */ - __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); } - __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); } - __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); } - - friend embree_ostream operator<<(embree_ostream cout, const Interval& a) { - return cout << "[" << a.lower << ", " << a.upper << "]"; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} - __forceinline Interval( FullTy ) : lower(neg_inf), upper(pos_inf) {} - }; - - __forceinline bool isEmpty(const Interval& v) { - return v.lower > v.upper; - } - - __forceinline vboolx isEmpty(const Interval& v) { - return v.lower > v.upper; - } - - /*! subset relation */ - template __forceinline bool subset( const Interval& a, const Interval& b ) { - return (a.lower > b.lower) && (a.upper < b.upper); - } - - template __forceinline bool subset( const Vec2>& a, const Vec2>& b ) { - return subset(a.x,b.x) && subset(a.y,b.y); - } - - template __forceinline const Vec2> intersect( const Vec2>& a, const Vec2>& b ) { - return Vec2>(intersect(a.x,b.x),intersect(a.y,b.y)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Interval select ( bool s, const Interval& t, const Interval& f ) { - return Interval(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); - } - - template __forceinline Interval select ( const typename T::Bool& s, const Interval& t, const Interval& f ) { - return Interval(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); - } - - __forceinline int numRoots(const Interval& p0, const Interval& p1) - { - float eps = 1E-4f; - bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps; - bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps; - return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1); - } - - typedef Interval Interval1f; - typedef Vec2> Interval2f; - typedef Vec3> Interval3f; - -inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; } - -inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); } - -#define TWO_PI (2.0*M_PI) -inline Interval1f sin(Interval1f interval) -{ - if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } - if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } - if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } - float sinLower = sin(interval.lower); - float sinUpper = sin(interval.upper); - if (sinLower > sinUpper) swap(sinLower, sinUpper); - if (interval.lower < M_PI / 2.0 && interval.upper > M_PI / 2.0) sinUpper = 1.0; - if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0; - return Interval1f(sinLower, sinUpper); -} - -inline Interval1f cos(Interval1f interval) -{ - if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } - if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } - if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } - float cosLower = cos(interval.lower); - float cosUpper = cos(interval.upper); - if (cosLower > cosUpper) swap(cosLower, cosUpper); - if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0; - return Interval1f(cosLower, cosUpper); -} -#undef TWO_PI -} diff --git a/thirdparty/embree-aarch64/common/math/lbbox.h b/thirdparty/embree-aarch64/common/math/lbbox.h deleted file mode 100644 index 95df4a918d..0000000000 --- a/thirdparty/embree-aarch64/common/math/lbbox.h +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bbox.h" -#include "range.h" - -namespace embree -{ - template - __forceinline std::pair globalLinear(const std::pair& v, const BBox1f& dt) - { - const float rcp_dt_size = float(1.0f)/dt.size(); - const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size); - const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size); - return std::make_pair(g0,g1); - } - - template - struct LBBox - { - public: - __forceinline LBBox () {} - - template - __forceinline LBBox ( const LBBox& other ) - : bounds0(other.bounds0), bounds1(other.bounds1) {} - - __forceinline LBBox& operator= ( const LBBox& other ) { - bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; - } - - __forceinline LBBox (EmptyTy) - : bounds0(EmptyTy()), bounds1(EmptyTy()) {} - - __forceinline explicit LBBox ( const BBox& bounds) - : bounds0(bounds), bounds1(bounds) { } - - __forceinline LBBox ( const BBox& bounds0, const BBox& bounds1) - : bounds0(bounds0), bounds1(bounds1) { } - - LBBox ( const avector>& bounds ) - { - assert(bounds.size()); - BBox b0 = bounds.front(); - BBox b1 = bounds.back(); - for (size_t i=1; i bt = lerp(b0,b1,f); - const T dlower = min(bounds[i].lower-bt.lower,T(zero)); - const T dupper = max(bounds[i].upper-bt.upper,T(zero)); - b0.lower += dlower; b1.lower += dlower; - b0.upper += dupper; b1.upper += dupper; - } - bounds0 = b0; - bounds1 = b1; - } - - /*! calculates the linear bounds of a primitive for the specified time range */ - template - __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments) - { - const float lower = time_range.lower*numTimeSegments; - const float upper = time_range.upper*numTimeSegments; - const float ilowerf = floor(lower); - const float iupperf = ceil(upper); - const int ilower = (int)ilowerf; - const int iupper = (int)iupperf; - - const BBox blower0 = bounds(ilower); - const BBox bupper1 = bounds(iupper); - - if (iupper-ilower == 1) { - bounds0 = lerp(blower0, bupper1, lower-ilowerf); - bounds1 = lerp(bupper1, blower0, iupperf-upper); - return; - } - - const BBox blower1 = bounds(ilower+1); - const BBox bupper0 = bounds(iupper-1); - BBox b0 = lerp(blower0, blower1, lower-ilowerf); - BBox b1 = lerp(bupper1, bupper0, iupperf-upper); - - for (int i = ilower+1; i < iupper; i++) - { - const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size(); - const BBox bt = lerp(b0, b1, f); - const BBox bi = bounds(i); - const T dlower = min(bi.lower-bt.lower, T(zero)); - const T dupper = max(bi.upper-bt.upper, T(zero)); - b0.lower += dlower; b1.lower += dlower; - b0.upper += dupper; b1.upper += dupper; - } - - bounds0 = b0; - bounds1 = b1; - } - - /*! calculates the linear bounds of a primitive for the specified time range */ - template - __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments) - { - /* normalize global time_range_in to local geom_time_range */ - const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(), - (time_range_in.upper-geom_time_range.lower)/geom_time_range.size()); - - const float lower = time_range.lower*geom_time_segments; - const float upper = time_range.upper*geom_time_segments; - const float ilowerf = floor(lower); - const float iupperf = ceil(upper); - const float ilowerfc = max(0.0f,ilowerf); - const float iupperfc = min(iupperf,geom_time_segments); - const int ilowerc = (int)ilowerfc; - const int iupperc = (int)iupperfc; - assert(iupperc-ilowerc > 0); - - /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */ - const int ilower_iter = max(-1,(int)ilowerf); - const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1); - - const BBox blower0 = bounds(ilowerc); - const BBox bupper1 = bounds(iupperc); - if (iupper_iter-ilower_iter == 1) { - bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc)); - bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper)); - return; - } - - const BBox blower1 = bounds(ilowerc+1); - const BBox bupper0 = bounds(iupperc-1); - BBox b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc)); - BBox b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper)); - - for (int i = ilower_iter+1; i < iupper_iter; i++) - { - const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size(); - const BBox bt = lerp(b0, b1, f); - const BBox bi = bounds(i); - const T dlower = min(bi.lower-bt.lower, T(zero)); - const T dupper = max(bi.upper-bt.upper, T(zero)); - b0.lower += dlower; b1.lower += dlower; - b0.upper += dupper; b1.upper += dupper; - } - - bounds0 = b0; - bounds1 = b1; - } - - /*! calculates the linear bounds of a primitive for the specified time range */ - template - __forceinline LBBox(const BoundsFunc& bounds, const range& time_range, int numTimeSegments) - { - const int ilower = time_range.begin(); - const int iupper = time_range.end(); - - BBox b0 = bounds(ilower); - BBox b1 = bounds(iupper); - - if (iupper-ilower == 1) - { - bounds0 = b0; - bounds1 = b1; - return; - } - - for (int i = ilower+1; i bt = lerp(b0, b1, f); - const BBox bi = bounds(i); - const T dlower = min(bi.lower-bt.lower, T(zero)); - const T dupper = max(bi.upper-bt.upper, T(zero)); - b0.lower += dlower; b1.lower += dlower; - b0.upper += dupper; b1.upper += dupper; - } - - bounds0 = b0; - bounds1 = b1; - } - - public: - - __forceinline bool empty() const { - return bounds().empty(); - } - - __forceinline BBox bounds () const { - return merge(bounds0,bounds1); - } - - __forceinline BBox interpolate( const float t ) const { - return lerp(bounds0,bounds1,t); - } - - __forceinline LBBox interpolate( const BBox1f& dt ) const { - return LBBox(interpolate(dt.lower),interpolate(dt.upper)); - } - - __forceinline void extend( const LBBox& other ) { - bounds0.extend(other.bounds0); - bounds1.extend(other.bounds1); - } - - __forceinline float expectedHalfArea() const; - - __forceinline float expectedHalfArea(const BBox1f& dt) const { - return interpolate(dt).expectedHalfArea(); - } - - __forceinline float expectedApproxHalfArea() const { - return 0.5f*(halfArea(bounds0) + halfArea(bounds1)); - } - - /* calculates bounds for [0,1] time range from bounds in dt time range */ - __forceinline LBBox global(const BBox1f& dt) const - { - const float rcp_dt_size = 1.0f/dt.size(); - const BBox b0 = interpolate(-dt.lower*rcp_dt_size); - const BBox b1 = interpolate((1.0f-dt.lower)*rcp_dt_size); - return LBBox(b0,b1); - } - - /*! Comparison Operators */ - //template friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } - //template friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } - friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } - friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } - - /*! output operator */ - friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) { - return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }"; - } - - public: - BBox bounds0, bounds1; - }; - - /*! tests if box is finite */ - template - __forceinline bool isvalid( const LBBox& v ) { - return isvalid(v.bounds0) && isvalid(v.bounds1); - } - - template - __forceinline bool isvalid_non_empty( const LBBox& v ) { - return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1); - } - - template - __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1) - { - const T da = a1-a0; - const T db = b1-b0; - return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f); - } - - template<> __forceinline float LBBox::expectedHalfArea() const - { - const Vec3fa d0 = bounds0.size(); - const Vec3fa d1 = bounds1.size(); - return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z), - Vec3fa(d1.x,d1.y,d1.z), - Vec3fa(d0.y,d0.z,d0.x), - Vec3fa(d1.y,d1.z,d1.x))); - } - - template - __forceinline float expectedApproxHalfArea(const LBBox& box) { - return box.expectedApproxHalfArea(); - } - - template - __forceinline LBBox merge(const LBBox& a, const LBBox& b) { - return LBBox(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1)); - } - - /*! subset relation */ - template __inline bool subset( const LBBox& a, const LBBox& b ) { - return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1); - } - - /*! default template instantiations */ - typedef LBBox LBBox1f; - typedef LBBox LBBox2f; - typedef LBBox LBBox3f; - typedef LBBox LBBox3fa; - typedef LBBox LBBox3fx; -} diff --git a/thirdparty/embree-aarch64/common/math/linearspace2.h b/thirdparty/embree-aarch64/common/math/linearspace2.h deleted file mode 100644 index b9a382962c..0000000000 --- a/thirdparty/embree-aarch64/common/math/linearspace2.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec2.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// 2D Linear Transform (2x2 Matrix) - //////////////////////////////////////////////////////////////////////////////// - - template struct LinearSpace2 - { - typedef T Vector; - typedef typename T::Scalar Scalar; - - /*! default matrix constructor */ - __forceinline LinearSpace2 ( ) {} - __forceinline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; } - __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; } - - template __forceinline LinearSpace2( const LinearSpace2& s ) : vx(s.vx), vy(s.vy) {} - - /*! matrix construction from column vectors */ - __forceinline LinearSpace2(const Vector& vx, const Vector& vy) - : vx(vx), vy(vy) {} - - /*! matrix construction from row mayor data */ - __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, - const Scalar& m10, const Scalar& m11) - : vx(m00,m10), vy(m01,m11) {} - - /*! compute the determinant of the matrix */ - __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; } - - /*! compute adjoint matrix */ - __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); } - - /*! compute inverse matrix */ - __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); } - - /*! compute transposed matrix */ - __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); } - - /*! returns first row of matrix */ - __forceinline Vector row0() const { return Vector(vx.x,vy.x); } - - /*! returns second row of matrix */ - __forceinline Vector row1() const { return Vector(vx.y,vy.y); } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {} - __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {} - - /*! return matrix for scaling */ - static __forceinline LinearSpace2 scale(const Vector& s) { - return LinearSpace2(s.x, 0, - 0 , s.y); - } - - /*! return matrix for rotation */ - static __forceinline LinearSpace2 rotate(const Scalar& r) { - Scalar s = sin(r), c = cos(r); - return LinearSpace2(c, -s, - s, c); - } - - /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */ - LinearSpace2 orthogonal() const - { - LinearSpace2 m = *this; - - // mirrored? - Scalar mirror(one); - if (m.det() < Scalar(zero)) { - m.vx = -m.vx; - mirror = -mirror; - } - - // rotation - for (int i = 0; i < 99; i++) { - const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); - const LinearSpace2 d = m_next - m; - m = m_next; - // norm^2 of difference small enough? - if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) - break; - } - - // rotation * mirror_x - return LinearSpace2(mirror*m.vx, m.vy); - } - - public: - - /*! the column vectors of the matrix */ - Vector vx,vy; - }; - - //////////////////////////////////////////////////////////////////////////////// - // Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline LinearSpace2 operator -( const LinearSpace2& a ) { return LinearSpace2(-a.vx,-a.vy); } - template __forceinline LinearSpace2 operator +( const LinearSpace2& a ) { return LinearSpace2(+a.vx,+a.vy); } - template __forceinline LinearSpace2 rcp ( const LinearSpace2& a ) { return a.inverse(); } - - //////////////////////////////////////////////////////////////////////////////// - // Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline LinearSpace2 operator +( const LinearSpace2& a, const LinearSpace2& b ) { return LinearSpace2(a.vx+b.vx,a.vy+b.vy); } - template __forceinline LinearSpace2 operator -( const LinearSpace2& a, const LinearSpace2& b ) { return LinearSpace2(a.vx-b.vx,a.vy-b.vy); } - - template __forceinline LinearSpace2 operator*(const typename T::Scalar & a, const LinearSpace2& b) { return LinearSpace2(a*b.vx, a*b.vy); } - template __forceinline T operator*(const LinearSpace2& a, const T & b) { return b.x*a.vx + b.y*a.vy; } - template __forceinline LinearSpace2 operator*(const LinearSpace2& a, const LinearSpace2& b) { return LinearSpace2(a*b.vx, a*b.vy); } - - template __forceinline LinearSpace2 operator/(const LinearSpace2& a, const typename T::Scalar & b) { return LinearSpace2(a.vx/b, a.vy/b); } - template __forceinline LinearSpace2 operator/(const LinearSpace2& a, const LinearSpace2& b) { return a * rcp(b); } - - template __forceinline LinearSpace2& operator *=( LinearSpace2& a, const LinearSpace2& b ) { return a = a * b; } - template __forceinline LinearSpace2& operator /=( LinearSpace2& a, const LinearSpace2& b ) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline bool operator ==( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx == b.vx && a.vy == b.vy; } - template __forceinline bool operator !=( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx != b.vx || a.vy != b.vy; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template static embree_ostream operator<<(embree_ostream cout, const LinearSpace2& m) { - return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; - } - - /*! Shortcuts for common linear spaces. */ - typedef LinearSpace2 LinearSpace2f; - typedef LinearSpace2 LinearSpace2fa; -} diff --git a/thirdparty/embree-aarch64/common/math/linearspace3.h b/thirdparty/embree-aarch64/common/math/linearspace3.h deleted file mode 100644 index 12b5bb776b..0000000000 --- a/thirdparty/embree-aarch64/common/math/linearspace3.h +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec3.h" -#include "quaternion.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// 3D Linear Transform (3x3 Matrix) - //////////////////////////////////////////////////////////////////////////////// - - template struct LinearSpace3 - { - typedef T Vector; - typedef typename T::Scalar Scalar; - - /*! default matrix constructor */ - __forceinline LinearSpace3 ( ) {} - __forceinline LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; } - __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } - - template __forceinline LinearSpace3( const LinearSpace3& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} - - /*! matrix construction from column vectors */ - __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz) - : vx(vx), vy(vy), vz(vz) {} - - /*! construction from quaternion */ - __forceinline LinearSpace3( const QuaternionT& q ) - : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j)) - , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i)) - , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {} - - /*! matrix construction from row mayor data */ - __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02, - const Scalar& m10, const Scalar& m11, const Scalar& m12, - const Scalar& m20, const Scalar& m21, const Scalar& m22) - : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {} - - /*! compute the determinant of the matrix */ - __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); } - - /*! compute adjoint matrix */ - __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); } - - /*! compute inverse matrix */ - __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); } - - /*! compute transposed matrix */ - __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); } - - /*! returns first row of matrix */ - __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); } - - /*! returns second row of matrix */ - __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); } - - /*! returns third row of matrix */ - __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {} - __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {} - - /*! return matrix for scaling */ - static __forceinline LinearSpace3 scale(const Vector& s) { - return LinearSpace3(s.x, 0, 0, - 0 , s.y, 0, - 0 , 0, s.z); - } - - /*! return matrix for rotation around arbitrary axis */ - static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) { - Vector u = normalize(_u); - Scalar s = sin(r), c = cos(r); - return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s, - u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s, - u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c); - } - - public: - - /*! the column vectors of the matrix */ - Vector vx,vy,vz; - }; - - /*! compute transposed matrix */ - template<> __forceinline const LinearSpace3 LinearSpace3::transposed() const { - vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz); - return LinearSpace3(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); - } - - template - __forceinline const LinearSpace3 transposed(const LinearSpace3& xfm) { - return xfm.transposed(); - } - - //////////////////////////////////////////////////////////////////////////////// - // Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline LinearSpace3 operator -( const LinearSpace3& a ) { return LinearSpace3(-a.vx,-a.vy,-a.vz); } - template __forceinline LinearSpace3 operator +( const LinearSpace3& a ) { return LinearSpace3(+a.vx,+a.vy,+a.vz); } - template __forceinline LinearSpace3 rcp ( const LinearSpace3& a ) { return a.inverse(); } - - /* constructs a coordinate frame form a normalized normal */ - template __forceinline LinearSpace3 frame(const T& N) - { - const T dx0(0,N.z,-N.y); - const T dx1(-N.z,0,N.x); - const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); - const T dy = normalize(cross(N,dx)); - return LinearSpace3(dx,dy,N); - } - - /* constructs a coordinate frame from a normal and approximate x-direction */ - template __forceinline LinearSpace3 frame(const T& N, const T& dxi) - { - if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel - const T dx = normalize(cross(dxi,N)); - const T dy = normalize(cross(N,dx)); - return LinearSpace3(dx,dy,N); - } - - /* clamps linear space to range -1 to +1 */ - template __forceinline LinearSpace3 clamp(const LinearSpace3& space) { - return LinearSpace3(clamp(space.vx,T(-1.0f),T(1.0f)), - clamp(space.vy,T(-1.0f),T(1.0f)), - clamp(space.vz,T(-1.0f),T(1.0f))); - } - - //////////////////////////////////////////////////////////////////////////////// - // Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline LinearSpace3 operator +( const LinearSpace3& a, const LinearSpace3& b ) { return LinearSpace3(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } - template __forceinline LinearSpace3 operator -( const LinearSpace3& a, const LinearSpace3& b ) { return LinearSpace3(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } - - template __forceinline LinearSpace3 operator*(const typename T::Scalar & a, const LinearSpace3& b) { return LinearSpace3(a*b.vx, a*b.vy, a*b.vz); } - template __forceinline T operator*(const LinearSpace3& a, const T & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } - template __forceinline LinearSpace3 operator*(const LinearSpace3& a, const LinearSpace3& b) { return LinearSpace3(a*b.vx, a*b.vy, a*b.vz); } - - template __forceinline LinearSpace3 operator/(const LinearSpace3& a, const typename T::Scalar & b) { return LinearSpace3(a.vx/b, a.vy/b, a.vz/b); } - template __forceinline LinearSpace3 operator/(const LinearSpace3& a, const LinearSpace3& b) { return a * rcp(b); } - - template __forceinline LinearSpace3& operator *=( LinearSpace3& a, const LinearSpace3& b ) { return a = a * b; } - template __forceinline LinearSpace3& operator /=( LinearSpace3& a, const LinearSpace3& b ) { return a = a / b; } - - template __forceinline T xfmPoint (const LinearSpace3& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } - template __forceinline T xfmVector(const LinearSpace3& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } - template __forceinline T xfmNormal(const LinearSpace3& s, const T & a) { return xfmVector(s.inverse().transposed(),a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline bool operator ==( const LinearSpace3& a, const LinearSpace3& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } - template __forceinline bool operator !=( const LinearSpace3& a, const LinearSpace3& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline LinearSpace3 select ( const typename T::Scalar::Bool& s, const LinearSpace3& t, const LinearSpace3& f ) { - return LinearSpace3(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz)); - } - - /*! blending */ - template - __forceinline LinearSpace3 lerp(const LinearSpace3& l0, const LinearSpace3& l1, const float t) - { - return LinearSpace3(lerp(l0.vx,l1.vx,t), - lerp(l0.vy,l1.vy,t), - lerp(l0.vz,l1.vz,t)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template static embree_ostream operator<<(embree_ostream cout, const LinearSpace3& m) { - return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; - } - - /*! Shortcuts for common linear spaces. */ - typedef LinearSpace3 LinearSpace3f; - typedef LinearSpace3 LinearSpace3fa; - typedef LinearSpace3 LinearSpace3fx; - typedef LinearSpace3 LinearSpace3ff; - - template using LinearSpace3vf = LinearSpace3>>; - typedef LinearSpace3>> LinearSpace3vf4; - typedef LinearSpace3>> LinearSpace3vf8; - typedef LinearSpace3>> LinearSpace3vf16; - - /*! blending */ - template - __forceinline LinearSpace3 lerp(const LinearSpace3& l0, - const LinearSpace3& l1, - const S& t) - { - return LinearSpace3(lerp(l0.vx,l1.vx,t), - lerp(l0.vy,l1.vy,t), - lerp(l0.vz,l1.vz,t)); - } - -} diff --git a/thirdparty/embree-aarch64/common/math/math.h b/thirdparty/embree-aarch64/common/math/math.h deleted file mode 100644 index 6d54abd44d..0000000000 --- a/thirdparty/embree-aarch64/common/math/math.h +++ /dev/null @@ -1,451 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/intrinsics.h" -#include "constants.h" -#include - -#if defined(__ARM_NEON) -#include "SSE2NEON.h" -#if defined(NEON_AVX2_EMULATION) -#include "AVX2NEON.h" -#endif -#else -#include -#include -#include -#endif - -#if defined(__WIN32__) && !defined(__MINGW32__) -#if (__MSV_VER <= 1700) -namespace std -{ - __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } - __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; } - __forceinline bool isfinite (const float x) { return _finite(x) != 0; } -} -#endif -#endif - -namespace embree -{ - __forceinline bool isvalid ( const float& v ) { - return (v > -FLT_LARGE) & (v < +FLT_LARGE); - } - - __forceinline int cast_f2i(float f) { - union { float f; int i; } v; v.f = f; return v.i; - } - - __forceinline float cast_i2f(int i) { - union { float f; int i; } v; v.i = i; return v.f; - } - - __forceinline int toInt (const float& a) { return int(a); } - __forceinline float toFloat(const int& a) { return float(a); } - -#if defined(__WIN32__) && !defined(__MINGW32__) - __forceinline bool finite ( const float x ) { return _finite(x) != 0; } -#endif - - __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } - __forceinline float sqr ( const float x ) { return x*x; } - - __forceinline float rcp ( const float x ) - { -#if defined(__aarch64__) - // Move scalar to vector register and do rcp. - __m128 a; - a[0] = x; - float32x4_t reciprocal = vrecpeq_f32(a); - reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - return reciprocal[0]; -#else - - const __m128 a = _mm_set_ss(x); - -#if defined(__AVX512VL__) - const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a); -#else - const __m128 r = _mm_rcp_ss(a); -#endif - -#if defined(__AVX2__) - return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); -#else - return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); -#endif - -#endif //defined(__aarch64__) - } - - __forceinline float signmsk ( const float x ) { -#if defined(__aarch64__) - // FP and Neon shares same vector register in arm64 - __m128 a; - __m128i b; - a[0] = x; - b[0] = 0x80000000; - a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); - return a[0]; -#else - return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -#endif - } - __forceinline float xorf( const float x, const float y ) { -#if defined(__aarch64__) - // FP and Neon shares same vector register in arm64 - __m128 a; - __m128 b; - a[0] = x; - b[0] = y; - a = _mm_xor_ps(a, b); - return a[0]; -#else - return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); -#endif - } - __forceinline float andf( const float x, const unsigned y ) { -#if defined(__aarch64__) - // FP and Neon shares same vector register in arm64 - __m128 a; - __m128i b; - a[0] = x; - b[0] = y; - a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); - return a[0]; -#else - return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); -#endif - } - __forceinline float rsqrt( const float x ) - { -#if defined(__aarch64__) - // FP and Neon shares same vector register in arm64 - __m128 a; - a[0] = x; - __m128 value = _mm_rsqrt_ps(a); - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); - value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); - return value[0]; -#else - - const __m128 a = _mm_set_ss(x); -#if defined(__AVX512VL__) - const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); -#else - const __m128 r = _mm_rsqrt_ss(a); -#endif - const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), - _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); - return _mm_cvtss_f32(c); -#endif - } - -#if defined(__WIN32__) && (__MSC_VER <= 1700) && !defined(__MINGW32__) - __forceinline float nextafter(float x, float y) { if ((x0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } - __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } - __forceinline int roundf(float f) { return (int)(f + 0.5f); } -#else - __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } - __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); } -#endif - - __forceinline float abs ( const float x ) { return ::fabsf(x); } - __forceinline float acos ( const float x ) { return ::acosf (x); } - __forceinline float asin ( const float x ) { return ::asinf (x); } - __forceinline float atan ( const float x ) { return ::atanf (x); } - __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); } - __forceinline float cos ( const float x ) { return ::cosf (x); } - __forceinline float cosh ( const float x ) { return ::coshf (x); } - __forceinline float exp ( const float x ) { return ::expf (x); } - __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); } - __forceinline float log ( const float x ) { return ::logf (x); } - __forceinline float log10( const float x ) { return ::log10f(x); } - __forceinline float pow ( const float x, const float y ) { return ::powf (x, y); } - __forceinline float sin ( const float x ) { return ::sinf (x); } - __forceinline float sinh ( const float x ) { return ::sinhf (x); } - __forceinline float sqrt ( const float x ) { return ::sqrtf (x); } - __forceinline float tan ( const float x ) { return ::tanf (x); } - __forceinline float tanh ( const float x ) { return ::tanhf (x); } - __forceinline float floor( const float x ) { return ::floorf (x); } - __forceinline float ceil ( const float x ) { return ::ceilf (x); } - __forceinline float frac ( const float x ) { return x-floor(x); } - - __forceinline double abs ( const double x ) { return ::fabs(x); } - __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } - __forceinline double acos ( const double x ) { return ::acos (x); } - __forceinline double asin ( const double x ) { return ::asin (x); } - __forceinline double atan ( const double x ) { return ::atan (x); } - __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); } - __forceinline double cos ( const double x ) { return ::cos (x); } - __forceinline double cosh ( const double x ) { return ::cosh (x); } - __forceinline double exp ( const double x ) { return ::exp (x); } - __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); } - __forceinline double log ( const double x ) { return ::log (x); } - __forceinline double log10( const double x ) { return ::log10(x); } - __forceinline double pow ( const double x, const double y ) { return ::pow (x, y); } - __forceinline double rcp ( const double x ) { return 1.0/x; } - __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } - __forceinline double sin ( const double x ) { return ::sin (x); } - __forceinline double sinh ( const double x ) { return ::sinh (x); } - __forceinline double sqr ( const double x ) { return x*x; } - __forceinline double sqrt ( const double x ) { return ::sqrt (x); } - __forceinline double tan ( const double x ) { return ::tan (x); } - __forceinline double tanh ( const double x ) { return ::tanh (x); } - __forceinline double floor( const double x ) { return ::floor (x); } - __forceinline double ceil ( const double x ) { return ::ceil (x); } - -#if defined(__aarch64__) - __forceinline float mini(float a, float b) { - // FP and Neon shares same vector register in arm64 - __m128 x; - __m128 y; - x[0] = a; - y[0] = b; - x = _mm_min_ps(x, y); - return x[0]; - } -#elif defined(__SSE4_1__) - __forceinline float mini(float a, float b) { - const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); - const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); - const __m128i ci = _mm_min_epi32(ai,bi); - return _mm_cvtss_f32(_mm_castsi128_ps(ci)); - } -#endif - -#if defined(__aarch64__) - __forceinline float maxi(float a, float b) { - // FP and Neon shares same vector register in arm64 - __m128 x; - __m128 y; - x[0] = a; - y[0] = b; - x = _mm_max_ps(x, y); - return x[0]; - } -#elif defined(__SSE4_1__) - __forceinline float maxi(float a, float b) { - const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); - const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); - const __m128i ci = _mm_max_epi32(ai,bi); - return _mm_cvtss_f32(_mm_castsi128_ps(ci)); - } -#endif - - template - __forceinline T twice(const T& a) { return a+a; } - - __forceinline int min(int a, int b) { return a __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } - template __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } - template __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); } - - template __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } - template __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); } - template __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); } - - __forceinline int max(int a, int b) { return a __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } - template __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } - template __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); } - - template __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } - template __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); } - template __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); } - -#if defined(__MACOSX__) - __forceinline ssize_t min(ssize_t a, ssize_t b) { return a __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } - template __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } - - template __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); } - template __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); } - template __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); } - template __forceinline T cos2sin ( const T& x ) { return sin2cos(x); } - -#if defined(__AVX2__) - __forceinline float madd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } - __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } - __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } - __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } -#elif defined (__aarch64__) && defined(__clang__) -#pragma clang fp contract(fast) - - -__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; } -__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; } -__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; } -__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); } - -#pragma clang fp contract(on) -#else - __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } - __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } - __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;} - __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; } -#endif - - /*! random functions */ - template T random() { return T(0); } -#if defined(_WIN32) - template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); } - template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); } -#else - template<> __forceinline int random() { return int(rand()); } - template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); } -#endif - template<> __forceinline float random() { return rand()/float(RAND_MAX); } - template<> __forceinline double random() { return rand()/double(RAND_MAX); } - -#if _WIN32 - __forceinline double drand48() { - return double(rand())/double(RAND_MAX); - } - - __forceinline void srand48(long seed) { - return srand(seed); - } -#endif - - /*! selects */ - __forceinline bool select(bool s, bool t , bool f) { return s ? t : f; } - __forceinline int select(bool s, int t, int f) { return s ? t : f; } - __forceinline float select(bool s, float t, float f) { return s ? t : f; } - - __forceinline bool all(bool s) { return s; } - - __forceinline float lerp(const float v0, const float v1, const float t) { - return madd(1.0f-t,v0,t*v1); - } - - template - __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { - return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3))); - } - - /*! exchange */ - template __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; } - - - template __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) { -#if 1//!defined(__aarch64__) - return msub(a,b,c*d); -#else - return nmadd(c,d,a*b); -#endif - } - - /*! bit reverse operation */ - template - __forceinline T bitReverse(const T& vin) - { - T v = vin; - v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); - v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); - v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); - v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); - v = ( v >> 16 ) | ( v << 16); - return v; - } - - /*! bit interleave operation */ - template - __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin) - { - T x = xin, y = yin, z = zin; - x = (x | (x << 16)) & 0x030000FF; - x = (x | (x << 8)) & 0x0300F00F; - x = (x | (x << 4)) & 0x030C30C3; - x = (x | (x << 2)) & 0x09249249; - - y = (y | (y << 16)) & 0x030000FF; - y = (y | (y << 8)) & 0x0300F00F; - y = (y | (y << 4)) & 0x030C30C3; - y = (y | (y << 2)) & 0x09249249; - - z = (z | (z << 16)) & 0x030000FF; - z = (z | (z << 8)) & 0x0300F00F; - z = (z | (z << 4)) & 0x030C30C3; - z = (z | (z << 2)) & 0x09249249; - - return x | (y << 1) | (z << 2); - } - -#if defined(__AVX2__) && !defined(__aarch64__) - - template<> - __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) - { - const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ ); - const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */); - const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */); - return xx | yy | zz; - } - -#endif - - /*! bit interleave operation for 64bit data types*/ - template - __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){ - T x = xin & 0x1fffff; - T y = yin & 0x1fffff; - T z = zin & 0x1fffff; - - x = (x | x << 32) & 0x1f00000000ffff; - x = (x | x << 16) & 0x1f0000ff0000ff; - x = (x | x << 8) & 0x100f00f00f00f00f; - x = (x | x << 4) & 0x10c30c30c30c30c3; - x = (x | x << 2) & 0x1249249249249249; - - y = (y | y << 32) & 0x1f00000000ffff; - y = (y | y << 16) & 0x1f0000ff0000ff; - y = (y | y << 8) & 0x100f00f00f00f00f; - y = (y | y << 4) & 0x10c30c30c30c30c3; - y = (y | y << 2) & 0x1249249249249249; - - z = (z | z << 32) & 0x1f00000000ffff; - z = (z | z << 16) & 0x1f0000ff0000ff; - z = (z | z << 8) & 0x100f00f00f00f00f; - z = (z | z << 4) & 0x10c30c30c30c30c3; - z = (z | z << 2) & 0x1249249249249249; - - return x | (y << 1) | (z << 2); - } -} diff --git a/thirdparty/embree-aarch64/common/math/obbox.h b/thirdparty/embree-aarch64/common/math/obbox.h deleted file mode 100644 index 032b56904e..0000000000 --- a/thirdparty/embree-aarch64/common/math/obbox.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bbox.h" -#include "linearspace3.h" - -namespace embree -{ - /*! Oriented bounding box */ - template - struct OBBox - { - public: - - __forceinline OBBox () {} - - __forceinline OBBox (EmptyTy) - : space(one), bounds(empty) {} - - __forceinline OBBox (const BBox& bounds) - : space(one), bounds(bounds) {} - - __forceinline OBBox (const LinearSpace3& space, const BBox& bounds) - : space(space), bounds(bounds) {} - - friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) { - return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}"; - } - - public: - LinearSpace3 space; //!< orthonormal transformation - BBox bounds; //!< bounds in transformed space - }; - - typedef OBBox OBBox3f; - typedef OBBox OBBox3fa; -} diff --git a/thirdparty/embree-aarch64/common/math/quaternion.h b/thirdparty/embree-aarch64/common/math/quaternion.h deleted file mode 100644 index 20c69bc62f..0000000000 --- a/thirdparty/embree-aarch64/common/math/quaternion.h +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "vec3.h" -#include "vec4.h" - -#include "transcendental.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////// - // Quaternion Struct - //////////////////////////////////////////////////////////////// - - template - struct QuaternionT - { - typedef Vec3 Vector; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline QuaternionT () { } - __forceinline QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; } - __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } - - __forceinline QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {} - __forceinline explicit QuaternionT( const Vec3& v ) : r(zero), i(v.x), j(v.y), k(v.z) {} - __forceinline explicit QuaternionT( const Vec4& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {} - __forceinline QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {} - __forceinline QuaternionT( const T& r, const Vec3& v ) : r(r), i(v.x), j(v.y), k(v.z) {} - - __inline QuaternionT( const Vec3& vx, const Vec3& vy, const Vec3& vz ); - __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {} - __forceinline QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {} - - /*! return quaternion for rotation around arbitrary axis */ - static __forceinline QuaternionT rotate(const Vec3& u, const T& r) { - return QuaternionT(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u)); - } - - /*! returns the rotation axis of the quaternion as a vector */ - __forceinline Vec3 v( ) const { return Vec3(i, j, k); } - - public: - T r, i, j, k; - }; - - template __forceinline QuaternionT operator *( const T & a, const QuaternionT& b ) { return QuaternionT(a * b.r, a * b.i, a * b.j, a * b.k); } - template __forceinline QuaternionT operator *( const QuaternionT& a, const T & b ) { return QuaternionT(a.r * b, a.i * b, a.j * b, a.k * b); } - - //////////////////////////////////////////////////////////////// - // Unary Operators - //////////////////////////////////////////////////////////////// - - template __forceinline QuaternionT operator +( const QuaternionT& a ) { return QuaternionT(+a.r, +a.i, +a.j, +a.k); } - template __forceinline QuaternionT operator -( const QuaternionT& a ) { return QuaternionT(-a.r, -a.i, -a.j, -a.k); } - template __forceinline QuaternionT conj ( const QuaternionT& a ) { return QuaternionT(a.r, -a.i, -a.j, -a.k); } - template __forceinline T abs ( const QuaternionT& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } - template __forceinline QuaternionT rcp ( const QuaternionT& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } - template __forceinline QuaternionT normalize ( const QuaternionT& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } - - // evaluates a*q-r - template __forceinline QuaternionT - msub(const T& a, const QuaternionT& q, const QuaternionT& p) - { - return QuaternionT(msub(a, q.r, p.r), - msub(a, q.i, p.i), - msub(a, q.j, p.j), - msub(a, q.k, p.k)); - } - // evaluates a*q-r - template __forceinline QuaternionT - madd (const T& a, const QuaternionT& q, const QuaternionT& p) - { - return QuaternionT(madd(a, q.r, p.r), - madd(a, q.i, p.i), - madd(a, q.j, p.j), - madd(a, q.k, p.k)); - } - - //////////////////////////////////////////////////////////////// - // Binary Operators - //////////////////////////////////////////////////////////////// - - template __forceinline QuaternionT operator +( const T & a, const QuaternionT& b ) { return QuaternionT(a + b.r, b.i, b.j, b.k); } - template __forceinline QuaternionT operator +( const QuaternionT& a, const T & b ) { return QuaternionT(a.r + b, a.i, a.j, a.k); } - template __forceinline QuaternionT operator +( const QuaternionT& a, const QuaternionT& b ) { return QuaternionT(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } - template __forceinline QuaternionT operator -( const T & a, const QuaternionT& b ) { return QuaternionT(a - b.r, -b.i, -b.j, -b.k); } - template __forceinline QuaternionT operator -( const QuaternionT& a, const T & b ) { return QuaternionT(a.r - b, a.i, a.j, a.k); } - template __forceinline QuaternionT operator -( const QuaternionT& a, const QuaternionT& b ) { return QuaternionT(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } - - template __forceinline Vec3 operator *( const QuaternionT& a, const Vec3 & b ) { return (a*QuaternionT(b)*conj(a)).v(); } - template __forceinline QuaternionT operator *( const QuaternionT& a, const QuaternionT& b ) { - return QuaternionT(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k, - a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j, - a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i, - a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r); - } - template __forceinline QuaternionT operator /( const T & a, const QuaternionT& b ) { return a*rcp(b); } - template __forceinline QuaternionT operator /( const QuaternionT& a, const T & b ) { return a*rcp(b); } - template __forceinline QuaternionT operator /( const QuaternionT& a, const QuaternionT& b ) { return a*rcp(b); } - - template __forceinline QuaternionT& operator +=( QuaternionT& a, const T & b ) { return a = a+b; } - template __forceinline QuaternionT& operator +=( QuaternionT& a, const QuaternionT& b ) { return a = a+b; } - template __forceinline QuaternionT& operator -=( QuaternionT& a, const T & b ) { return a = a-b; } - template __forceinline QuaternionT& operator -=( QuaternionT& a, const QuaternionT& b ) { return a = a-b; } - template __forceinline QuaternionT& operator *=( QuaternionT& a, const T & b ) { return a = a*b; } - template __forceinline QuaternionT& operator *=( QuaternionT& a, const QuaternionT& b ) { return a = a*b; } - template __forceinline QuaternionT& operator /=( QuaternionT& a, const T & b ) { return a = a*rcp(b); } - template __forceinline QuaternionT& operator /=( QuaternionT& a, const QuaternionT& b ) { return a = a*rcp(b); } - - template __forceinline QuaternionT - select(const M& m, const QuaternionT& q, const QuaternionT& p) - { - return QuaternionT(select(m, q.r, p.r), - select(m, q.i, p.i), - select(m, q.j, p.j), - select(m, q.k, p.k)); - } - - - template __forceinline Vec3 xfmPoint ( const QuaternionT& a, const Vec3& b ) { return (a*QuaternionT(b)*conj(a)).v(); } - template __forceinline Vec3 xfmVector( const QuaternionT& a, const Vec3& b ) { return (a*QuaternionT(b)*conj(a)).v(); } - template __forceinline Vec3 xfmNormal( const QuaternionT& a, const Vec3& b ) { return (a*QuaternionT(b)*conj(a)).v(); } - - template __forceinline T dot(const QuaternionT& a, const QuaternionT& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline bool operator ==( const QuaternionT& a, const QuaternionT& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } - template __forceinline bool operator !=( const QuaternionT& a, const QuaternionT& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Orientation Functions - //////////////////////////////////////////////////////////////////////////////// - - template QuaternionT::QuaternionT( const Vec3& vx, const Vec3& vy, const Vec3& vz ) - { - if ( vx.x + vy.y + vz.z >= T(zero) ) - { - const T t = T(one) + (vx.x + vy.y + vz.z); - const T s = rsqrt(t)*T(0.5f); - r = t*s; - i = (vy.z - vz.y)*s; - j = (vz.x - vx.z)*s; - k = (vx.y - vy.x)*s; - } - else if ( vx.x >= max(vy.y, vz.z) ) - { - const T t = (T(one) + vx.x) - (vy.y + vz.z); - const T s = rsqrt(t)*T(0.5f); - r = (vy.z - vz.y)*s; - i = t*s; - j = (vx.y + vy.x)*s; - k = (vz.x + vx.z)*s; - } - else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) ) - { - const T t = (T(one) + vy.y) - (vz.z + vx.x); - const T s = rsqrt(t)*T(0.5f); - r = (vz.x - vx.z)*s; - i = (vx.y + vy.x)*s; - j = t*s; - k = (vy.z + vz.y)*s; - } - else //if ( vz.z >= max(vy.y, vx.x) ) - { - const T t = (T(one) + vz.z) - (vx.x + vy.y); - const T s = rsqrt(t)*T(0.5f); - r = (vx.y - vy.x)*s; - i = (vz.x + vx.z)*s; - j = (vy.z + vz.y)*s; - k = t*s; - } - } - - template QuaternionT::QuaternionT( const T& yaw, const T& pitch, const T& roll ) - { - const T cya = cos(yaw *T(0.5f)); - const T cpi = cos(pitch*T(0.5f)); - const T cro = cos(roll *T(0.5f)); - const T sya = sin(yaw *T(0.5f)); - const T spi = sin(pitch*T(0.5f)); - const T sro = sin(roll *T(0.5f)); - r = cro*cya*cpi + sro*sya*spi; - i = cro*cya*spi + sro*sya*cpi; - j = cro*sya*cpi - sro*cya*spi; - k = sro*cya*cpi - cro*sya*spi; - } - - ////////////////////////////////////////////////////////////////////////////// - /// Output Operators - ////////////////////////////////////////////////////////////////////////////// - - template static embree_ostream operator<<(embree_ostream cout, const QuaternionT& q) { - return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; - } - - /*! default template instantiations */ - typedef QuaternionT Quaternion3f; - typedef QuaternionT Quaternion3d; - - template using Quaternion3vf = QuaternionT>; - typedef QuaternionT> Quaternion3vf4; - typedef QuaternionT> Quaternion3vf8; - typedef QuaternionT> Quaternion3vf16; - - ////////////////////////////////////////////////////////////////////////////// - /// Interpolation - ////////////////////////////////////////////////////////////////////////////// - template - __forceinline QuaternionTlerp(const QuaternionT& q0, - const QuaternionT& q1, - const T& factor) - { - QuaternionT q; - q.r = lerp(q0.r, q1.r, factor); - q.i = lerp(q0.i, q1.i, factor); - q.j = lerp(q0.j, q1.j, factor); - q.k = lerp(q0.k, q1.k, factor); - return q; - } - - template - __forceinline QuaternionT slerp(const QuaternionT& q0, - const QuaternionT& q1_, - const T& t) - { - T cosTheta = dot(q0, q1_); - QuaternionT q1 = select(cosTheta < 0.f, -q1_, q1_); - cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta); - if (unlikely(all(cosTheta > 0.9995f))) { - return normalize(lerp(q0, q1, t)); - } - const T phi = t * fastapprox::acos(cosTheta); - T sinPhi, cosPhi; - fastapprox::sincos(phi, sinPhi, cosPhi); - QuaternionT qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); - return msub(cosPhi, q0, qperp); - } -} diff --git a/thirdparty/embree-aarch64/common/math/range.h b/thirdparty/embree-aarch64/common/math/range.h deleted file mode 100644 index 762d9cd9ea..0000000000 --- a/thirdparty/embree-aarch64/common/math/range.h +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../math/math.h" - -namespace embree -{ - template - struct range - { - __forceinline range() {} - - __forceinline range(const Ty& begin) - : _begin(begin), _end(begin+1) {} - - __forceinline range(const Ty& begin, const Ty& end) - : _begin(begin), _end(end) {} - - __forceinline range(const range& other) - : _begin(other._begin), _end(other._end) {} - - template - __forceinline range(const range& other) - : _begin(Ty(other._begin)), _end(Ty(other._end)) {} - - template - __forceinline range& operator =(const range& other) { - _begin = other._begin; - _end = other._end; - return *this; - } - - __forceinline Ty begin() const { - return _begin; - } - - __forceinline Ty end() const { - return _end; - } - - __forceinline range intersect(const range& r) const { - return range (max(_begin,r._begin),min(_end,r._end)); - } - - __forceinline Ty size() const { - return _end - _begin; - } - - __forceinline bool empty() const { - return _end <= _begin; - } - - __forceinline Ty center() const { - return (_begin + _end)/2; - } - - __forceinline std::pair split() const - { - const Ty _center = center(); - return std::make_pair(range(_begin,_center),range(_center,_end)); - } - - __forceinline void split(range& left_o, range& right_o) const - { - const Ty _center = center(); - left_o = range(_begin,_center); - right_o = range(_center,_end); - } - - __forceinline friend bool operator< (const range& r0, const range& r1) { - return r0.size() < r1.size(); - } - - friend embree_ostream operator<<(embree_ostream cout, const range& r) { - return cout << "range [" << r.begin() << ", " << r.end() << "]"; - } - - Ty _begin, _end; - }; - - template - range make_range(const Ty& begin, const Ty& end) { - return range(begin,end); - } - - template - struct extended_range : public range - { - __forceinline extended_range () {} - - __forceinline extended_range (const Ty& begin) - : range(begin), _ext_end(begin+1) {} - - __forceinline extended_range (const Ty& begin, const Ty& end) - : range(begin,end), _ext_end(end) {} - - __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end) - : range(begin,end), _ext_end(ext_end) {} - - __forceinline Ty ext_end() const { - return _ext_end; - } - - __forceinline Ty ext_size() const { - return _ext_end - range::_begin; - } - - __forceinline Ty ext_range_size() const { - return _ext_end - range::_end; - } - - __forceinline bool has_ext_range() const { - assert(_ext_end >= range::_end); - return (_ext_end - range::_end) > 0; - } - - __forceinline void set_ext_range(const size_t ext_end){ - assert(ext_end >= range::_end); - _ext_end = ext_end; - } - - __forceinline void move_right(const size_t plus){ - range::_begin += plus; - range::_end += plus; - _ext_end += plus; - } - - friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) { - return cout << "extended_range [" << r.begin() << ", " << r.end() << " (" << r.ext_end() << ")]"; - } - - Ty _ext_end; - }; -} diff --git a/thirdparty/embree-aarch64/common/math/transcendental.h b/thirdparty/embree-aarch64/common/math/transcendental.h deleted file mode 100644 index 6855d82b53..0000000000 --- a/thirdparty/embree-aarch64/common/math/transcendental.h +++ /dev/null @@ -1,525 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -// Transcendental functions from "ispc": https://github.com/ispc/ispc/ -// Most of the transcendental implementations in ispc code come from -// Solomon Boulos's "syrah": https://github.com/boulos/syrah/ - -#include "../simd/simd.h" - -namespace embree -{ - -namespace fastapprox -{ - -template -__forceinline T sin(const T &v) -{ - static const float piOverTwoVec = 1.57079637050628662109375; - static const float twoOverPiVec = 0.636619746685028076171875; - auto scaled = v * twoOverPiVec; - auto kReal = floor(scaled); - auto k = toInt(kReal); - - // Reduced range version of x - auto x = v - kReal * piOverTwoVec; - auto kMod4 = k & 3; - auto sinUseCos = (kMod4 == 1 | kMod4 == 3); - auto flipSign = (kMod4 > 1); - - // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, - // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); - static const float sinC2 = -0.16666667163372039794921875; - static const float sinC4 = +8.333347737789154052734375e-3; - static const float sinC6 = -1.9842604524455964565277099609375e-4; - static const float sinC8 = +2.760012648650445044040679931640625e-6; - static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; - - static const float cosC2 = -0.5; - static const float cosC4 = +4.166664183139801025390625e-2; - static const float cosC6 = -1.388833043165504932403564453125e-3; - static const float cosC8 = +2.47562347794882953166961669921875e-5; - static const float cosC10 = -2.59630184018533327616751194000244140625e-7; - - auto outside = select(sinUseCos, 1., x); - auto c2 = select(sinUseCos, T(cosC2), T(sinC2)); - auto c4 = select(sinUseCos, T(cosC4), T(sinC4)); - auto c6 = select(sinUseCos, T(cosC6), T(sinC6)); - auto c8 = select(sinUseCos, T(cosC8), T(sinC8)); - auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); - - auto x2 = x * x; - auto formula = x2 * c10 + c8; - formula = x2 * formula + c6; - formula = x2 * formula + c4; - formula = x2 * formula + c2; - formula = x2 * formula + 1.; - formula *= outside; - - formula = select(flipSign, -formula, formula); - return formula; -} - -template -__forceinline T cos(const T &v) -{ - static const float piOverTwoVec = 1.57079637050628662109375; - static const float twoOverPiVec = 0.636619746685028076171875; - auto scaled = v * twoOverPiVec; - auto kReal = floor(scaled); - auto k = toInt(kReal); - - // Reduced range version of x - auto x = v - kReal * piOverTwoVec; - - auto kMod4 = k & 3; - auto cosUseCos = (kMod4 == 0 | kMod4 == 2); - auto flipSign = (kMod4 == 1 | kMod4 == 2); - - const float sinC2 = -0.16666667163372039794921875; - const float sinC4 = +8.333347737789154052734375e-3; - const float sinC6 = -1.9842604524455964565277099609375e-4; - const float sinC8 = +2.760012648650445044040679931640625e-6; - const float sinC10 = -2.50293279435709337121807038784027099609375e-8; - - const float cosC2 = -0.5; - const float cosC4 = +4.166664183139801025390625e-2; - const float cosC6 = -1.388833043165504932403564453125e-3; - const float cosC8 = +2.47562347794882953166961669921875e-5; - const float cosC10 = -2.59630184018533327616751194000244140625e-7; - - auto outside = select(cosUseCos, 1., x); - auto c2 = select(cosUseCos, T(cosC2), T(sinC2)); - auto c4 = select(cosUseCos, T(cosC4), T(sinC4)); - auto c6 = select(cosUseCos, T(cosC6), T(sinC6)); - auto c8 = select(cosUseCos, T(cosC8), T(sinC8)); - auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); - - auto x2 = x * x; - auto formula = x2 * c10 + c8; - formula = x2 * formula + c6; - formula = x2 * formula + c4; - formula = x2 * formula + c2; - formula = x2 * formula + 1.; - formula *= outside; - - formula = select(flipSign, -formula, formula); - return formula; -} - -template -__forceinline void sincos(const T &v, T &sinResult, T &cosResult) -{ - const float piOverTwoVec = 1.57079637050628662109375; - const float twoOverPiVec = 0.636619746685028076171875; - auto scaled = v * twoOverPiVec; - auto kReal = floor(scaled); - auto k = toInt(kReal); - - // Reduced range version of x - auto x = v - kReal * piOverTwoVec; - auto kMod4 = k & 3; - auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); - auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); - auto sinFlipSign = (kMod4 > 1); - auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2)); - - const float oneVec = +1.; - const float sinC2 = -0.16666667163372039794921875; - const float sinC4 = +8.333347737789154052734375e-3; - const float sinC6 = -1.9842604524455964565277099609375e-4; - const float sinC8 = +2.760012648650445044040679931640625e-6; - const float sinC10 = -2.50293279435709337121807038784027099609375e-8; - - const float cosC2 = -0.5; - const float cosC4 = +4.166664183139801025390625e-2; - const float cosC6 = -1.388833043165504932403564453125e-3; - const float cosC8 = +2.47562347794882953166961669921875e-5; - const float cosC10 = -2.59630184018533327616751194000244140625e-7; - - auto x2 = x * x; - - auto sinFormula = x2 * sinC10 + sinC8; - auto cosFormula = x2 * cosC10 + cosC8; - sinFormula = x2 * sinFormula + sinC6; - cosFormula = x2 * cosFormula + cosC6; - - sinFormula = x2 * sinFormula + sinC4; - cosFormula = x2 * cosFormula + cosC4; - - sinFormula = x2 * sinFormula + sinC2; - cosFormula = x2 * cosFormula + cosC2; - - sinFormula = x2 * sinFormula + oneVec; - cosFormula = x2 * cosFormula + oneVec; - - sinFormula *= x; - - sinResult = select(sinUseCos, cosFormula, sinFormula); - cosResult = select(cosUseCos, cosFormula, sinFormula); - - sinResult = select(sinFlipSign, -sinResult, sinResult); - cosResult = select(cosFlipSign, -cosResult, cosResult); -} - -template -__forceinline T tan(const T &v) -{ - const float piOverFourVec = 0.785398185253143310546875; - const float fourOverPiVec = 1.27323949337005615234375; - - auto xLt0 = v < 0.; - auto y = select(xLt0, -v, v); - auto scaled = y * fourOverPiVec; - - auto kReal = floor(scaled); - auto k = toInt(kReal); - - auto x = y - kReal * piOverFourVec; - - // If k & 1, x -= Pi/4 - auto needOffset = (k & 1) != 0; - x = select(needOffset, x - piOverFourVec, x); - - // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... - auto kMod4 = k & 3; - auto useCotan = (kMod4 == 1) | (kMod4 == 2); - - const float oneVec = 1.0; - - const float tanC2 = +0.33333075046539306640625; - const float tanC4 = +0.13339905440807342529296875; - const float tanC6 = +5.3348250687122344970703125e-2; - const float tanC8 = +2.46033705770969390869140625e-2; - const float tanC10 = +2.892402000725269317626953125e-3; - const float tanC12 = +9.500005282461643218994140625e-3; - - const float cotC2 = -0.3333333432674407958984375; - const float cotC4 = -2.222204394638538360595703125e-2; - const float cotC6 = -2.11752182804048061370849609375e-3; - const float cotC8 = -2.0846328698098659515380859375e-4; - const float cotC10 = -2.548247357481159269809722900390625e-5; - const float cotC12 = -3.5257363606433500535786151885986328125e-7; - - auto x2 = x * x; - T z; - if (any(useCotan)) - { - auto cotVal = x2 * cotC12 + cotC10; - cotVal = x2 * cotVal + cotC8; - cotVal = x2 * cotVal + cotC6; - cotVal = x2 * cotVal + cotC4; - cotVal = x2 * cotVal + cotC2; - cotVal = x2 * cotVal + oneVec; - // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. - cotVal /= -x; - z = cotVal; - } - auto useTan = !useCotan; - if (any(useTan)) - { - auto tanVal = x2 * tanC12 + tanC10; - tanVal = x2 * tanVal + tanC8; - tanVal = x2 * tanVal + tanC6; - tanVal = x2 * tanVal + tanC4; - tanVal = x2 * tanVal + tanC2; - tanVal = x2 * tanVal + oneVec; - // Equation was for tan(x)/x - tanVal *= x; - z = select(useTan, tanVal, z); - } - return select(xLt0, -z, z); -} - -template -__forceinline T asin(const T &x0) -{ - auto isneg = (x0 < 0.f); - auto x = abs(x0); - auto isnan = (x > 1.f); - - // sollya - // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], - // [1e-20;.9999999999999999]); - // avg error: 1.1105439e-06, max error 1.3187528e-06 - auto v = 1.57079517841339111328125f + - x * (-0.21450997889041900634765625f + - x * (8.78556668758392333984375e-2f + - x * (-4.489909112453460693359375e-2f + - x * (1.928029954433441162109375e-2f + - x * (-4.3095736764371395111083984375e-3f))))); - - v *= -sqrt(1.f - x); - v = v + 1.57079637050628662109375f; - - v = select(v < 0.f, T(0.f), v); - v = select(isneg, -v, v); - v = select(isnan, T(cast_i2f(0x7fc00000)), v); - - return v; -} - -template -__forceinline T acos(const T &v) -{ - return 1.57079637050628662109375f - asin(v); -} - -template -__forceinline T atan(const T &v) -{ - const float piOverTwoVec = 1.57079637050628662109375; - // atan(-x) = -atan(x) (so flip from negative to positive first) - // If x > 1 -> atan(x) = Pi/2 - atan(1/x) - auto xNeg = v < 0.f; - auto xFlipped = select(xNeg, -v, v); - - auto xGt1 = xFlipped > 1.; - auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); - - // These coefficients approximate atan(x)/x - const float atanC0 = +0.99999988079071044921875; - const float atanC2 = -0.3333191573619842529296875; - const float atanC4 = +0.199689209461212158203125; - const float atanC6 = -0.14015688002109527587890625; - const float atanC8 = +9.905083477497100830078125e-2; - const float atanC10 = -5.93664981424808502197265625e-2; - const float atanC12 = +2.417283318936824798583984375e-2; - const float atanC14 = -4.6721356920897960662841796875e-3; - - auto x2 = x * x; - auto result = x2 * atanC14 + atanC12; - result = x2 * result + atanC10; - result = x2 * result + atanC8; - result = x2 * result + atanC6; - result = x2 * result + atanC4; - result = x2 * result + atanC2; - result = x2 * result + atanC0; - result *= x; - - result = select(xGt1, piOverTwoVec - result, result); - result = select(xNeg, -result, result); - return result; -} - -template -__forceinline T atan2(const T &y, const T &x) -{ - const float piVec = 3.1415926536; - // atan2(y, x) = - // - // atan2(y > 0, x = +-0) -> Pi/2 - // atan2(y < 0, x = +-0) -> -Pi/2 - // atan2(y = +-0, x < +0) -> +-Pi - // atan2(y = +-0, x >= +0) -> +-0 - // - // atan2(y >= 0, x < 0) -> Pi + atan(y/x) - // atan2(y < 0, x < 0) -> -Pi + atan(y/x) - // atan2(y, x > 0) -> atan(y/x) - // - // and then a bunch of code for dealing with infinities. - auto yOverX = y * rcpSafe(x); - auto atanArg = atan(yOverX); - auto xLt0 = x < 0.f; - auto yLt0 = y < 0.f; - auto offset = select(xLt0, - select(yLt0, T(-piVec), T(piVec)), 0.f); - return offset + atanArg; -} - -template -__forceinline T exp(const T &v) -{ - const float ln2Part1 = 0.6931457519; - const float ln2Part2 = 1.4286067653e-6; - const float oneOverLn2 = 1.44269502162933349609375; - - auto scaled = v * oneOverLn2; - auto kReal = floor(scaled); - auto k = toInt(kReal); - - // Reduced range version of x - auto x = v - kReal * ln2Part1; - x -= kReal * ln2Part2; - - // These coefficients are for e^x in [0, ln(2)] - const float one = 1.; - const float c2 = 0.4999999105930328369140625; - const float c3 = 0.166668415069580078125; - const float c4 = 4.16539050638675689697265625e-2; - const float c5 = 8.378830738365650177001953125e-3; - const float c6 = 1.304379315115511417388916015625e-3; - const float c7 = 2.7555381529964506626129150390625e-4; - - auto result = x * c7 + c6; - result = x * result + c5; - result = x * result + c4; - result = x * result + c3; - result = x * result + c2; - result = x * result + one; - result = x * result + one; - - // Compute 2^k (should differ for float and double, but I'll avoid - // it for now and just do floats) - const int fpbias = 127; - auto biasedN = k + fpbias; - auto overflow = kReal > fpbias; - // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) - // we've got underflow. -127 * ln(2) -> -88.02. So the most - // negative float input that doesn't result in zero is like -88. - auto underflow = kReal <= -fpbias; - const int infBits = 0x7f800000; - biasedN <<= 23; - // Reinterpret this thing as float - auto twoToTheN = asFloat(biasedN); - // Handle both doubles and floats (hopefully eliding the copy for float) - auto elemtype2n = twoToTheN; - result *= elemtype2n; - result = select(overflow, cast_i2f(infBits), result); - result = select(underflow, 0., result); - return result; -} - -// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n -// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). -template -__forceinline void __rangeReduceLog(const T &input, - T &reduced, - R &exponent) -{ - auto intVersion = asInt(input); - // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM - // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 - // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 - // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 - // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF - - //const int exponentMask(0x7F800000) - static const int nonexponentMask = 0x807FFFFF; - - // We want the reduced version to have an exponent of -1 which is - // -1 + 127 after biasing or 126 - static const int exponentNeg1 = (126l << 23); - // NOTE(boulos): We don't need to mask anything out since we know - // the sign bit has to be 0. If it's 1, we need to return infinity/nan - // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). - auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] - - auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 - exponent = offsetExponent - 127; // get the real value - - // Blend the offset_exponent with the original input (do this in - // int for now, until I decide if float can have & and ¬) - auto blended = (intVersion & nonexponentMask) | (exponentNeg1); - reduced = asFloat(blended); -} - -template struct ExponentType { }; -template struct ExponentType> { typedef vint Ty; }; -template <> struct ExponentType { typedef int Ty; }; - -template -__forceinline T log(const T &v) -{ - T reduced; - typename ExponentType::Ty exponent; - - const int nanBits = 0x7fc00000; - const int negInfBits = 0xFF800000; - const float nan = cast_i2f(nanBits); - const float negInf = cast_i2f(negInfBits); - auto useNan = v < 0.; - auto useInf = v == 0.; - auto exceptional = useNan | useInf; - const float one = 1.0; - - auto patched = select(exceptional, one, v); - __rangeReduceLog(patched, reduced, exponent); - - const float ln2 = 0.693147182464599609375; - - auto x1 = one - reduced; - const float c1 = +0.50000095367431640625; - const float c2 = +0.33326041698455810546875; - const float c3 = +0.2519190013408660888671875; - const float c4 = +0.17541764676570892333984375; - const float c5 = +0.3424419462680816650390625; - const float c6 = -0.599632322788238525390625; - const float c7 = +1.98442304134368896484375; - const float c8 = -2.4899270534515380859375; - const float c9 = +1.7491014003753662109375; - - auto result = x1 * c9 + c8; - result = x1 * result + c7; - result = x1 * result + c6; - result = x1 * result + c5; - result = x1 * result + c4; - result = x1 * result + c3; - result = x1 * result + c2; - result = x1 * result + c1; - result = x1 * result + one; - - // Equation was for -(ln(red)/(1-red)) - result *= -x1; - result += toFloat(exponent) * ln2; - - return select(exceptional, - select(useNan, T(nan), T(negInf)), - result); -} - -template -__forceinline T pow(const T &x, const T &y) -{ - auto x1 = abs(x); - auto z = exp(y * log(x1)); - - // Handle special cases - const float twoOver23 = 8388608.0f; - auto yInt = y == round(y); - auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit - - // x == 0 - z = select(x == 0.0f, - select(y < 0.0f, T(inf) | signmsk(x), - select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); - - // x < 0 - auto xNegative = x < 0.0f; - if (any(xNegative)) - { - auto z1 = z | asFloat(yOddInt); - z1 = select(yInt, z1, std::numeric_limits::quiet_NaN()); - z = select(xNegative, z1, z); - } - - auto xFinite = isfinite(x); - auto yFinite = isfinite(y); - if (all(xFinite & yFinite)) - return z; - - // x finite and y infinite - z = select(andn(xFinite, yFinite), - select(x1 == 1.0f, 1.0f, - select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); - - // x infinite - z = select(xFinite, z, - select(y == 0.0f, 1.0f, - select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); - - return z; -} - -template -__forceinline T pow(const T &x, float y) -{ - return pow(x, T(y)); -} - -} // namespace fastapprox - -} // namespace embree diff --git a/thirdparty/embree-aarch64/common/math/vec2.h b/thirdparty/embree-aarch64/common/math/vec2.h deleted file mode 100644 index a619459e9c..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec2.h +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" - -namespace embree -{ - struct Vec2fa; - - //////////////////////////////////////////////////////////////////////////////// - /// Generic 2D vector Class - //////////////////////////////////////////////////////////////////////////////// - - template struct Vec2 - { - enum { N = 2 }; - union { - struct { T x, y; }; -#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler - T components[N]; -#endif - }; - - typedef T Scalar; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2( ) {} - __forceinline explicit Vec2( const T& a ) : x(a), y(a) {} - __forceinline Vec2( const T& x, const T& y ) : x(x), y(y) {} - - __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; } - __forceinline Vec2( const Vec2fa& other ); - - template __forceinline Vec2( const Vec2& a ) : x(T(a.x)), y(T(a.y)) {} - template __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } - - __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2( ZeroTy ) : x(zero), y(zero) {} - __forceinline Vec2( OneTy ) : x(one), y(one) {} - __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {} - __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {} - -#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler - __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; } - __forceinline T& operator [](const size_t axis) { assert(axis < 2); return (&x)[axis]; } -#else - __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; } - __forceinline T& operator [](const size_t axis ) { assert(axis < 2); return components[axis]; } -#endif - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec2 operator +( const Vec2& a ) { return Vec2(+a.x, +a.y); } - template __forceinline Vec2 operator -( const Vec2& a ) { return Vec2(-a.x, -a.y); } - template __forceinline Vec2 abs ( const Vec2& a ) { return Vec2(abs (a.x), abs (a.y)); } - template __forceinline Vec2 rcp ( const Vec2& a ) { return Vec2(rcp (a.x), rcp (a.y)); } - template __forceinline Vec2 rsqrt ( const Vec2& a ) { return Vec2(rsqrt(a.x), rsqrt(a.y)); } - template __forceinline Vec2 sqrt ( const Vec2& a ) { return Vec2(sqrt (a.x), sqrt (a.y)); } - template __forceinline Vec2 frac ( const Vec2& a ) { return Vec2(frac (a.x), frac (a.y)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec2 operator +( const Vec2& a, const Vec2& b ) { return Vec2(a.x + b.x, a.y + b.y); } - template __forceinline Vec2 operator +( const Vec2& a, const T& b ) { return Vec2(a.x + b , a.y + b ); } - template __forceinline Vec2 operator +( const T& a, const Vec2& b ) { return Vec2(a + b.x, a + b.y); } - template __forceinline Vec2 operator -( const Vec2& a, const Vec2& b ) { return Vec2(a.x - b.x, a.y - b.y); } - template __forceinline Vec2 operator -( const Vec2& a, const T& b ) { return Vec2(a.x - b , a.y - b ); } - template __forceinline Vec2 operator -( const T& a, const Vec2& b ) { return Vec2(a - b.x, a - b.y); } - template __forceinline Vec2 operator *( const Vec2& a, const Vec2& b ) { return Vec2(a.x * b.x, a.y * b.y); } - template __forceinline Vec2 operator *( const T& a, const Vec2& b ) { return Vec2(a * b.x, a * b.y); } - template __forceinline Vec2 operator *( const Vec2& a, const T& b ) { return Vec2(a.x * b , a.y * b ); } - template __forceinline Vec2 operator /( const Vec2& a, const Vec2& b ) { return Vec2(a.x / b.x, a.y / b.y); } - template __forceinline Vec2 operator /( const Vec2& a, const T& b ) { return Vec2(a.x / b , a.y / b ); } - template __forceinline Vec2 operator /( const T& a, const Vec2& b ) { return Vec2(a / b.x, a / b.y); } - - template __forceinline Vec2 min(const Vec2& a, const Vec2& b) { return Vec2(min(a.x, b.x), min(a.y, b.y)); } - template __forceinline Vec2 max(const Vec2& a, const Vec2& b) { return Vec2(max(a.x, b.x), max(a.y, b.y)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec2 madd ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); } - template __forceinline Vec2 msub ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); } - template __forceinline Vec2 nmadd ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); } - template __forceinline Vec2 nmsub ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); } - - template __forceinline Vec2 madd ( const T& a, const Vec2& b, const Vec2& c) { return Vec2( madd(a,b.x,c.x), madd(a,b.y,c.y) ); } - template __forceinline Vec2 msub ( const T& a, const Vec2& b, const Vec2& c) { return Vec2( msub(a,b.x,c.x), msub(a,b.y,c.y) ); } - template __forceinline Vec2 nmadd ( const T& a, const Vec2& b, const Vec2& c) { return Vec2(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); } - template __forceinline Vec2 nmsub ( const T& a, const Vec2& b, const Vec2& c) { return Vec2(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec2& operator +=( Vec2& a, const Vec2& b ) { a.x += b.x; a.y += b.y; return a; } - template __forceinline Vec2& operator -=( Vec2& a, const Vec2& b ) { a.x -= b.x; a.y -= b.y; return a; } - template __forceinline Vec2& operator *=( Vec2& a, const T& b ) { a.x *= b ; a.y *= b ; return a; } - template __forceinline Vec2& operator /=( Vec2& a, const T& b ) { a.x /= b ; a.y /= b ; return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline T reduce_add( const Vec2& a ) { return a.x + a.y; } - template __forceinline T reduce_mul( const Vec2& a ) { return a.x * a.y; } - template __forceinline T reduce_min( const Vec2& a ) { return min(a.x, a.y); } - template __forceinline T reduce_max( const Vec2& a ) { return max(a.x, a.y); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline bool operator ==( const Vec2& a, const Vec2& b ) { return a.x == b.x && a.y == b.y; } - template __forceinline bool operator !=( const Vec2& a, const Vec2& b ) { return a.x != b.x || a.y != b.y; } - template __forceinline bool operator < ( const Vec2& a, const Vec2& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Shift Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec2 shift_right_1( const Vec2& a ) { - return Vec2(shift_right_1(a.x),shift_right_1(a.y)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline T dot ( const Vec2& a, const Vec2& b ) { return madd(a.x,b.x,a.y*b.y); } - template __forceinline Vec2 cross ( const Vec2& a ) { return Vec2(-a.y,a.x); } - template __forceinline T length ( const Vec2& a ) { return sqrt(dot(a,a)); } - template __forceinline Vec2 normalize( const Vec2& a ) { return a*rsqrt(dot(a,a)); } - template __forceinline T distance ( const Vec2& a, const Vec2& b ) { return length(a-b); } - template __forceinline T det ( const Vec2& a, const Vec2& b ) { return a.x*b.y - a.y*b.x; } - - template __forceinline Vec2 normalize_safe( const Vec2& a ) { - const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) ); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec2 select ( bool s, const Vec2& t, const Vec2& f ) { - return Vec2(select(s,t.x,f.x),select(s,t.y,f.y)); - } - - template __forceinline Vec2 select ( const Vec2& s, const Vec2& t, const Vec2& f ) { - return Vec2(select(s.x,t.x,f.x),select(s.y,t.y,f.y)); - } - - template __forceinline Vec2 select ( const typename T::Bool& s, const Vec2& t, const Vec2& f ) { - return Vec2(select(s,t.x,f.x),select(s,t.y,f.y)); - } - - template - __forceinline Vec2 lerp(const Vec2& v0, const Vec2& v1, const T& t) { - return madd(Vec2(T(1.0f)-t),v0,t*v1); - } - - template __forceinline int maxDim ( const Vec2& a ) - { - const Vec2 b = abs(a); - if (b.x > b.y) return 0; - else return 1; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2& a) { - return cout << "(" << a.x << ", " << a.y << ")"; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Default template instantiations - //////////////////////////////////////////////////////////////////////////////// - - typedef Vec2 Vec2b; - typedef Vec2 Vec2i; - typedef Vec2 Vec2f; -} - -#include "vec2fa.h" - -#if defined(__SSE__) || defined(__ARM_NEON) -#include "../simd/sse.h" -#endif - -#if defined(__AVX__) -#include "../simd/avx.h" -#endif - -#if defined(__AVX512F__) -#include "../simd/avx512.h" -#endif - -namespace embree -{ - template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} - -#if defined(__SSE__) || defined(__ARM_NEON) - template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} -#endif - -#if defined(__AVX__) - template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} -#endif - -#if defined(__AVX512F__) - template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} -#endif -} diff --git a/thirdparty/embree-aarch64/common/math/vec2fa.h b/thirdparty/embree-aarch64/common/math/vec2fa.h deleted file mode 100644 index 451ecd556c..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec2fa.h +++ /dev/null @@ -1,317 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/alloc.h" -#include "math.h" -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec2fa Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec2fa - { - ALIGNED_STRUCT_(16); - - typedef float Scalar; - enum { N = 2 }; - union { - __m128 m128; - struct { float x,y,az,aw; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa( ) {} - __forceinline Vec2fa( const __m128 a ) : m128(a) {} - - __forceinline Vec2fa ( const Vec2& other ) { x = other.x; y = other.y; } - __forceinline Vec2fa& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } - - __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; } - __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; } - - __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} - __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} - - __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} - - __forceinline operator const __m128&() const { return m128; } - __forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline Vec2fa load( const void* const a ) { - return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); - } - - static __forceinline Vec2fa loadu( const void* const a ) { - return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); - } - - static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { - _mm_storeu_ps((float*)ptr,v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {} - __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } - __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } - __forceinline Vec2fa operator -( const Vec2fa& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); - } - __forceinline Vec2fa abs ( const Vec2fa& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); - } - __forceinline Vec2fa sign ( const Vec2fa& a ) { - return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); - } - - __forceinline Vec2fa rcp ( const Vec2fa& a ) - { -#if defined(__aarch64__) - __m128 reciprocal = _mm_rcp_ps(a.m128); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - return (const Vec2fa)reciprocal; -#else -#if defined(__AVX512VL__) - const Vec2fa r = _mm_rcp14_ps(a.m128); -#else - const Vec2fa r = _mm_rcp_ps(a.m128); -#endif - -#if defined(__AVX2__) - const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); -#else - const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#endif - - return res; -#endif //defined(__aarch64__) - } - - __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } - __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } - - __forceinline Vec2fa rsqrt( const Vec2fa& a ) - { -#if defined(__aarch64__) - __m128 r = _mm_rsqrt_ps(a.m128); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - return r; -#else - -#if defined(__AVX512VL__) - __m128 r = _mm_rsqrt14_ps(a.m128); -#else - __m128 r = _mm_rsqrt_ps(a.m128); -#endif - return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); - -#endif - } - - __forceinline Vec2fa zero_fix(const Vec2fa& a) { - return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); - } - __forceinline Vec2fa rcp_safe(const Vec2fa& a) { - return rcp(zero_fix(a)); - } - __forceinline Vec2fa log ( const Vec2fa& a ) { - return Vec2fa(logf(a.x),logf(a.y)); - } - - __forceinline Vec2fa exp ( const Vec2fa& a ) { - return Vec2fa(expf(a.x),expf(a.y)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); } - __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); } - __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); } - __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } - __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } - __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); } - __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } - __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } - - __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } - __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_min_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_max_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - - __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { - return Vec2fa(powf(a.x,b),powf(a.y,b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } - __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } - __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } - __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } -#else - __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } - __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; } - __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} - __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } -#endif - - __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } - __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); } - __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } - __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } - __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } - __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } - __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; } - __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } - __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } - __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } - __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); } - __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; } - __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE4_1__) - __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { - return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); - } -#else - __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { - return reduce_add(a*b); - } -#endif - - __forceinline Vec2fa cross ( const Vec2fa& a ) { - return Vec2fa(-a.y,a.x); - } - - __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); } - __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); } - __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); } - __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); } - __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); } - __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { - __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); - return blendv_ps(f, t, mask); - } - - __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { - return madd(1.0f-t,v0,t*v1); - } - - __forceinline int maxDim ( const Vec2fa& a ) - { - const Vec2fa b = abs(a); - if (b.x > b.y) return 0; - else return 1; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) -__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } -__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } -//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } -#elif defined (__SSE4_1__) - //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } - __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } - __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } -#else - //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } - __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); } - __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { - return cout << "(" << a.x << ", " << a.y << ")"; - } - - typedef Vec2fa Vec2fa_t; -} diff --git a/thirdparty/embree-aarch64/common/math/vec3.h b/thirdparty/embree-aarch64/common/math/vec3.h deleted file mode 100644 index 1870321715..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec3.h +++ /dev/null @@ -1,349 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" - -namespace embree -{ - struct Vec3fa; - - //////////////////////////////////////////////////////////////////////////////// - /// Generic 3D vector Class - //////////////////////////////////////////////////////////////////////////////// - - template struct Vec3 - { - enum { N = 3 }; - - union { - struct { - T x, y, z; - }; -#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler - T components[N]; -#endif - }; - - typedef T Scalar; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3( ) {} - __forceinline explicit Vec3( const T& a ) : x(a), y(a), z(a) {} - __forceinline Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {} - - __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; } - __forceinline Vec3( const Vec3fa& other ); - - template __forceinline Vec3( const Vec3& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {} - template __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } - - __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3( ZeroTy ) : x(zero), y(zero), z(zero) {} - __forceinline Vec3( OneTy ) : x(one), y(one), z(one) {} - __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {} - __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {} - -#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler - __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; } - __forceinline T& operator []( const size_t axis ) { assert(axis < 3); return (&x)[axis]; } -#else - __forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; } - __forceinline T& operator [](const size_t axis) { assert(axis < 3); return components[axis]; } -#endif - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec3 operator +( const Vec3& a ) { return Vec3(+a.x, +a.y, +a.z); } - template __forceinline Vec3 operator -( const Vec3& a ) { return Vec3(-a.x, -a.y, -a.z); } - template __forceinline Vec3 abs ( const Vec3& a ) { return Vec3(abs (a.x), abs (a.y), abs (a.z)); } - template __forceinline Vec3 rcp ( const Vec3& a ) { return Vec3(rcp (a.x), rcp (a.y), rcp (a.z)); } - template __forceinline Vec3 rsqrt ( const Vec3& a ) { return Vec3(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); } - template __forceinline Vec3 sqrt ( const Vec3& a ) { return Vec3(sqrt (a.x), sqrt (a.y), sqrt (a.z)); } - - template __forceinline Vec3 zero_fix( const Vec3& a ) - { - return Vec3(select(abs(a.x) __forceinline Vec3 rcp_safe(const Vec3& a) { return rcp(zero_fix(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec3 operator +( const Vec3& a, const Vec3& b ) { return Vec3(a.x + b.x, a.y + b.y, a.z + b.z); } - template __forceinline Vec3 operator -( const Vec3& a, const Vec3& b ) { return Vec3(a.x - b.x, a.y - b.y, a.z - b.z); } - template __forceinline Vec3 operator *( const Vec3& a, const Vec3& b ) { return Vec3(a.x * b.x, a.y * b.y, a.z * b.z); } - template __forceinline Vec3 operator *( const T& a, const Vec3& b ) { return Vec3(a * b.x, a * b.y, a * b.z); } - template __forceinline Vec3 operator *( const Vec3& a, const T& b ) { return Vec3(a.x * b , a.y * b , a.z * b ); } - template __forceinline Vec3 operator /( const Vec3& a, const T& b ) { return Vec3(a.x / b , a.y / b , a.z / b ); } - template __forceinline Vec3 operator /( const T& a, const Vec3& b ) { return Vec3(a / b.x, a / b.y, a / b.z); } - template __forceinline Vec3 operator /( const Vec3& a, const Vec3& b ) { return Vec3(a.x / b.x, a.y / b.y, a.z / b.z); } - - template __forceinline Vec3 min(const Vec3& a, const Vec3& b) { return Vec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } - template __forceinline Vec3 max(const Vec3& a, const Vec3& b) { return Vec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } - - template __forceinline Vec3 operator >>( const Vec3& a, const int b ) { return Vec3(a.x >> b, a.y >> b, a.z >> b); } - template __forceinline Vec3 operator <<( const Vec3& a, const int b ) { return Vec3(a.x << b, a.y << b, a.z << b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec3 madd ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); } - template __forceinline Vec3 msub ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); } - template __forceinline Vec3 nmadd ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));} - template __forceinline Vec3 nmsub ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); } - - template __forceinline Vec3 madd ( const T& a, const Vec3& b, const Vec3& c) { return Vec3( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); } - template __forceinline Vec3 msub ( const T& a, const Vec3& b, const Vec3& c) { return Vec3( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); } - template __forceinline Vec3 nmadd ( const T& a, const Vec3& b, const Vec3& c) { return Vec3(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));} - template __forceinline Vec3 nmsub ( const T& a, const Vec3& b, const Vec3& c) { return Vec3(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec3& operator +=( Vec3& a, const T b ) { a.x += b; a.y += b; a.z += b; return a; } - template __forceinline Vec3& operator +=( Vec3& a, const Vec3& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } - template __forceinline Vec3& operator -=( Vec3& a, const Vec3& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } - template __forceinline Vec3& operator *=( Vec3& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; return a; } - template __forceinline Vec3& operator /=( Vec3& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline T reduce_add( const Vec3& a ) { return a.x + a.y + a.z; } - template __forceinline T reduce_mul( const Vec3& a ) { return a.x * a.y * a.z; } - template __forceinline T reduce_min( const Vec3& a ) { return min(a.x, a.y, a.z); } - template __forceinline T reduce_max( const Vec3& a ) { return max(a.x, a.y, a.z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline bool operator ==( const Vec3& a, const Vec3& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; } - template __forceinline bool operator !=( const Vec3& a, const Vec3& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; } - template __forceinline bool operator < ( const Vec3& a, const Vec3& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - if (a.z != b.z) return a.z < b.z; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Shift Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec3 shift_right_1( const Vec3& a ) { - return Vec3(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec3 select ( bool s, const Vec3& t, const Vec3& f ) { - return Vec3(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); - } - - template __forceinline Vec3 select ( const Vec3& s, const Vec3& t, const Vec3& f ) { - return Vec3(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z)); - } - - template __forceinline Vec3 select ( const typename T::Bool& s, const Vec3& t, const Vec3& f ) { - return Vec3(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); - } - - template - __forceinline Vec3 lerp(const Vec3& v0, const Vec3& v1, const T& t) { - return madd(Vec3(T(1.0f)-t),v0,t*v1); - } - - template __forceinline int maxDim ( const Vec3& a ) - { - const Vec3 b = abs(a); - if (b.x > b.y) { - if (b.x > b.z) return 0; else return 2; - } else { - if (b.y > b.z) return 1; else return 2; - } - } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec3 eq_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x==b.x,a.y==b.y,a.z==b.z); } - template __forceinline Vec3 neq_mask(const Vec3& a, const Vec3& b ) { return Vec3(a.x!=b.x,a.y!=b.y,a.z!=b.z); } - template __forceinline Vec3 lt_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x< b.x,a.y< b.y,a.z< b.z); } - template __forceinline Vec3 le_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x<=b.x,a.y<=b.y,a.z<=b.z); } - template __forceinline Vec3 gt_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x> b.x,a.y> b.y,a.z> b.z); } - template __forceinline Vec3 ge_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x>=b.x,a.y>=b.y,a.z>=b.z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline T sqr ( const Vec3& a ) { return dot(a,a); } - template __forceinline T dot ( const Vec3& a, const Vec3& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); } - template __forceinline T length ( const Vec3& a ) { return sqrt(sqr(a)); } - template __forceinline T rcp_length( const Vec3& a ) { return rsqrt(sqr(a)); } - template __forceinline Vec3 normalize( const Vec3& a ) { return a*rsqrt(sqr(a)); } - template __forceinline T distance ( const Vec3& a, const Vec3& b ) { return length(a-b); } - template __forceinline Vec3 cross ( const Vec3& a, const Vec3& b ) { return Vec3(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); } - template __forceinline Vec3 stable_triangle_normal( const Vec3& a, const Vec3& b, const Vec3& c ) - { - const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; - const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x; - const Vec3 cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z)); - const Vec3 cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z)); - const auto sx = abs(ab_x) < abs(bc_x); - const auto sy = abs(ab_y) < abs(bc_y); - const auto sz = abs(ab_z) < abs(bc_z); - return Vec3(select(sx,cross_ab.x,cross_bc.x), - select(sy,cross_ab.y,cross_bc.y), - select(sz,cross_ab.z,cross_bc.z)); - } - - template __forceinline T sum ( const Vec3& a ) { return a.x+a.y+a.z; } - - template __forceinline T halfArea ( const Vec3& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } - template __forceinline T area ( const Vec3& d ) { return 2.0f*halfArea(d); } - - template __forceinline Vec3 normalize_safe( const Vec3& a ) { - const T d = dot(a,a); return select(d == T( zero ), a , a*rsqrt(d) ); - } - - template __forceinline T sqr_point_to_line_distance(const Vec3& P, const Vec3& Q0, const Vec3& Q1) - { - const Vec3 N = cross(P-Q0,Q1-Q0); - const Vec3 D = Q1-Q0; - return dot(N,N)*rcp(dot(D,D)); - } - - template __forceinline T sqr_point_to_line_distance(const Vec3& PmQ0, const Vec3& Q1mQ0) - { - const Vec3 N = cross(PmQ0,Q1mQ0); - const Vec3 D = Q1mQ0; - return dot(N,N)*rcp(dot(D,D)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3& a) { - return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; - } - - typedef Vec3 Vec3b; - typedef Vec3 Vec3i; - typedef Vec3 Vec3f; -} - -#include "vec3ba.h" -#include "vec3ia.h" -#include "vec3fa.h" - -//////////////////////////////////////////////////////////////////////////////// -/// SSE / AVX / MIC specializations -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE__) || defined(__ARM_NEON) -#include "../simd/sse.h" -#endif - -#if defined(__AVX__) -#include "../simd/avx.h" -#endif - -#if defined(__AVX512F__) -#include "../simd/avx512.h" -#endif - -namespace embree -{ - template - __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { - return Vec3(Out(a.x[k]), Out(a.y[k]), Out(a.z[k])); - } - - template<> __forceinline Vec3::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } - -#if defined(__AVX__) - template<> __forceinline Vec3::Vec3(const Vec3fa& a) { - x = a.x; y = a.y; z = a.z; - } -#elif defined(__SSE__) || defined(__ARM_NEON) - template<> - __forceinline Vec3::Vec3(const Vec3fa& a) { - const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); - } -#endif - -#if defined(__SSE__) || defined(__ARM_NEON) - __forceinline Vec3 broadcast4f(const Vec3& a, const size_t k) { - return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); - } - - template<> - __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { - return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); - } - - template - __forceinline Vec3 shuffle(const Vec3& b) { - return Vec3(shuffle(b.x), shuffle(b.y), shuffle(b.z)); - } -#endif - -#if defined(__AVX__) - template<> - __forceinline Vec3::Vec3(const Vec3fa& a) { - x = a.x; y = a.y; z = a.z; - } - __forceinline Vec3 broadcast4f(const Vec3& a, const size_t k) { - return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); - } - __forceinline Vec3 broadcast8f(const Vec3& a, const size_t k) { - return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); - } - __forceinline Vec3 broadcast8f(const Vec3& a, const size_t k) { - return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); - } - - template<> - __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { - return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); - } - template<> - __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { - return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); - } - - template - __forceinline Vec3 shuffle(const Vec3& b) { - return Vec3(shuffle(b.x), shuffle(b.y), shuffle(b.z)); - } -#endif - -#if defined(__AVX512F__) - template<> __forceinline Vec3::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {} -#endif -} diff --git a/thirdparty/embree-aarch64/common/math/vec3ba.h b/thirdparty/embree-aarch64/common/math/vec3ba.h deleted file mode 100644 index 90f31739c2..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec3ba.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/alloc.h" -#include "math.h" -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec3ba Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec3ba - { - ALIGNED_STRUCT_(16); - - union { - __m128 m128; - struct { int x,y,z; }; - }; - - typedef int Scalar; - enum { N = 3 }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba( ) {} - __forceinline Vec3ba( const __m128 input ) : m128(input) {} - __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {} - __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; } - - __forceinline explicit Vec3ba( bool a ) - : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} - __forceinline Vec3ba( bool a, bool b, bool c) - : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} - - __forceinline operator const __m128&() const { return m128; } - __forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {} - __forceinline Vec3ba( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } - __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } - }; - - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); } - __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); } - __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; } - __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; } - __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { - return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; - } - __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { - return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; - } - __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - if (a.z != b.z) return a.z < b.z; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; } - __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; } - - __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; } - __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; } - __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; } - - __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) { - return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")"; - } -} diff --git a/thirdparty/embree-aarch64/common/math/vec3fa.h b/thirdparty/embree-aarch64/common/math/vec3fa.h deleted file mode 100644 index 6163cfb596..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec3fa.h +++ /dev/null @@ -1,810 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/alloc.h" -#include "math.h" -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec3fa Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec3fa - { - ALIGNED_STRUCT_(16); - - typedef float Scalar; - enum { N = 3 }; - union { - __m128 m128; - struct { float x,y,z; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa( ) {} - __forceinline Vec3fa( const __m128 a ) : m128(a) {} - - __forceinline Vec3fa ( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } - //__forceinline Vec3fa& operator =( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } - - __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; } - __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; } - - __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} - __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} - - __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} - - __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } - __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } - __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } - __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } - - //__forceinline operator const __m128&() const { return m128; } - //__forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline Vec3fa load( const void* const a ) { -#if defined(__aarch64__) - __m128 t = _mm_load_ps((float*)a); - t[3] = 0.0f; - return Vec3fa(t); -#else - return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); -#endif - } - - static __forceinline Vec3fa loadu( const void* const a ) { - return Vec3fa(_mm_loadu_ps((float*)a)); - } - - static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { - _mm_storeu_ps((float*)ptr,v.m128); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {} - __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } - __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } - __forceinline Vec3fa operator -( const Vec3fa& a ) { -#if defined(__aarch64__) - return vnegq_f32(a.m128); -#else - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - - return _mm_xor_ps(a.m128, mask); -#endif - } - __forceinline Vec3fa abs ( const Vec3fa& a ) { -#if defined(__aarch64__) - return _mm_abs_ps(a.m128); -#else - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); -#endif - } - __forceinline Vec3fa sign ( const Vec3fa& a ) { -#if defined(__aarch64__) - Vec3fa r = blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f))); - return r; -#else - return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); -#endif - } - - __forceinline Vec3fa rcp ( const Vec3fa& a ) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - return vdivq_f32(vdupq_n_f32(1.0f),a.m128); -#elif defined(__aarch64__) - __m128 reciprocal = _mm_rcp_ps(a.m128); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); - return (const Vec3fa)reciprocal; -#else - -#if defined(__AVX512VL__) - const Vec3fa r = _mm_rcp14_ps(a.m128); -#else - const Vec3fa r = _mm_rcp_ps(a.m128); -#endif - -#if defined(__AVX2__) - const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); -#else - const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#endif - - return res; -#endif //defined(__aarch64__) - } - - __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } - __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } - - __forceinline Vec3fa rsqrt( const Vec3fa& a ) - { -#if defined(__aarch64__) - __m128 r = _mm_rsqrt_ps(a.m128); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); - return r; -#else - -#if defined(__AVX512VL__) - __m128 r = _mm_rsqrt14_ps(a.m128); -#else - __m128 r = _mm_rsqrt_ps(a.m128); -#endif - return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); -#endif - } - - __forceinline Vec3fa zero_fix(const Vec3fa& a) { - return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); - } - __forceinline Vec3fa rcp_safe(const Vec3fa& a) { - return rcp(zero_fix(a)); - } - __forceinline Vec3fa log ( const Vec3fa& a ) { - return Vec3fa(logf(a.x),logf(a.y),logf(a.z)); - } - - __forceinline Vec3fa exp ( const Vec3fa& a ) { - return Vec3fa(expf(a.x),expf(a.y),expf(a.z)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } - __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } - __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } - __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } - __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } - __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } - __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } - __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } - - __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } - __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { - const vint4 ai = _mm_castps_si128(a.m128); - const vint4 bi = _mm_castps_si128(b.m128); - const vint4 ci = _mm_min_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { - const vint4 ai = _mm_castps_si128(a.m128); - const vint4 bi = _mm_castps_si128(b.m128); - const vint4 ci = _mm_max_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - - __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { - return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } -#else - -#if defined(__aarch64__) - __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { - return _mm_madd_ps(a.m128, b.m128, c.m128); //a*b+c; - } - __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { - return _mm_msub_ps(a.m128, b.m128, c.m128); //-a*b+c; - } - __forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { - Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128); - return -t; - } - __forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { - return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); //a*b-c - } - -#else - __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } - __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} - __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } - __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } -#endif - -#endif - - __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } - __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } - __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } - __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } - __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; } - __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; } - __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; } - __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; } - __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(BUILD_IOS) - __forceinline float reduce_add(const Vec3fa& v) { - float32x4_t t = v.m128; - t[3] = 0.0f; - return vaddvq_f32(t); - } - - __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } - __forceinline float reduce_min(const Vec3fa& v) { - float32x4_t t = v.m128; - t[3] = t[2]; - return vminvq_f32(t); - } - __forceinline float reduce_max(const Vec3fa& v) { - float32x4_t t = v.m128; - t[3] = t[2]; - return vmaxvq_f32(t); - } -#else - __forceinline float reduce_add(const Vec3fa& v) { - const vfloat4 a(v.m128); - const vfloat4 b = shuffle<1>(a); - const vfloat4 c = shuffle<2>(a); - return _mm_cvtss_f32(a+b+c); - } - - __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } - __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } - __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } - __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } - - __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } - __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } - __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } - __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } - #if defined(__aarch64__) - __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } - __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } -#else - __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } - __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } -#endif - - __forceinline bool isvalid ( const Vec3fa& v ) { - return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); - } - - __forceinline bool is_finite ( const Vec3fa& a ) { - return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX))); - } - - __forceinline bool isvalid4 ( const Vec3fa& v ) { - return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); - } - - __forceinline bool is_finite4 ( const Vec3fa& a ) { - return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE4_1__) - __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { - return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); - } -#else - __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { - return reduce_add(a*b); - } -#endif - - __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) - { - vfloat4 a0 = vfloat4(a.m128); - vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); - vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); - vfloat4 b1 = vfloat4(b.m128); - return Vec3fa(shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1))); - } - - __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); } - __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); } - __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); } - __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); } - __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); } - __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); } - __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } - __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); } - - __forceinline Vec3fa normalize_safe( const Vec3fa& a ) { - const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); - } - - /*! differentiated normalization */ - __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp) - { - const float pp = dot(p,p); - const float pdp = dot(p,dp); - return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { - __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); - return blendv_ps(f.m128, t.m128, mask); - } - - __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { - return blendv_ps(f.m128, t.m128, s); - } - - __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { - return madd(1.0f-t,v0,t*v1); - } - - __forceinline int maxDim ( const Vec3fa& a ) - { - const Vec3fa b = abs(a); - if (b.x > b.y) { - if (b.x > b.z) return 0; else return 2; - } else { - if (b.y > b.z) return 1; else return 2; - } - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) - __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } - __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } - __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } -#elif defined (__SSE4_1__) - __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } - __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } - __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } -#else - __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } - __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } - __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { - return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; - } - - typedef Vec3fa Vec3fa_t; - - - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec3fx Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec3fx - { - ALIGNED_STRUCT_(16); - - typedef float Scalar; - enum { N = 3 }; - union { - __m128 m128; - struct { float x,y,z; union { int a; unsigned u; float w; }; }; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx( ) {} - __forceinline Vec3fx( const __m128 a ) : m128(a) {} - - __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} - __forceinline operator Vec3fa () const { return Vec3fa(m128); } - - __forceinline explicit Vec3fx ( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } - //__forceinline Vec3fx& operator =( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } - - __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; } - - __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; } - - __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} - __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} - - __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } - __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } - __forceinline Vec3fx( const Vec3fa& other, const float w1) { -#if defined (__aarch64__) - m128 = other.m128; m128[3] = w1; -#elif defined (__SSE4_1__) - m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); -#else - const vint4 mask(-1,-1,-1,0); - m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1)); -#endif - } - //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly! - //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly! - __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} - - //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} - - __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } - __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } - __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } - __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } - - //__forceinline operator const __m128&() const { return m128; } - //__forceinline operator __m128&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline Vec3fx load( const void* const a ) { - return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); - } - - static __forceinline Vec3fx loadu( const void* const a ) { - return Vec3fx(_mm_loadu_ps((float*)a)); - } - - static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { - _mm_storeu_ps((float*)ptr,v.m128); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {} - __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {} - __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} - __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } - __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } - __forceinline Vec3fx operator -( const Vec3fx& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); - } - __forceinline Vec3fx abs ( const Vec3fx& a ) { - const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); - } - __forceinline Vec3fx sign ( const Vec3fx& a ) { - return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128)); - } - - __forceinline Vec3fx rcp ( const Vec3fx& a ) - { -#if defined(__AVX512VL__) - const Vec3fx r = _mm_rcp14_ps(a.m128); -#else - const Vec3fx r = _mm_rcp_ps(a.m128); -#endif - -#if defined(__AVX2__) - const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); -#else - const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#endif - - return res; - } - - __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } - __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } - - __forceinline Vec3fx rsqrt( const Vec3fx& a ) - { -#if defined(__AVX512VL__) - __m128 r = _mm_rsqrt14_ps(a.m128); -#else - __m128 r = _mm_rsqrt_ps(a.m128); -#endif - return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); - } - - __forceinline Vec3fx zero_fix(const Vec3fx& a) { - return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); - } - __forceinline Vec3fx rcp_safe(const Vec3fx& a) { - return rcp(zero_fix(a)); - } - __forceinline Vec3fx log ( const Vec3fx& a ) { - return Vec3fx(logf(a.x),logf(a.y),logf(a.z)); - } - - __forceinline Vec3fx exp ( const Vec3fx& a ) { - return Vec3fx(expf(a.x),expf(a.y),expf(a.z)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } - __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } - __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } - __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } - __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } - __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } - __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } - __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } - - __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } - __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } - -#if defined(__SSE4_1__) || defined(__aarch64__) - __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { - const vint4 ai = _mm_castps_si128(a.m128); - const vint4 bi = _mm_castps_si128(b.m128); - const vint4 ci = _mm_min_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - -#if defined(__SSE4_1__) || defined(__aarch64__) - __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { - const vint4 ai = _mm_castps_si128(a.m128); - const vint4 bi = _mm_castps_si128(b.m128); - const vint4 ci = _mm_max_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } -#endif - - __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) { - return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } - __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } -#else - __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } - __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; } - __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;} - __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; } -#endif - - __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } - __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } - __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } - __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } - __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; } - __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; } - __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; } - __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; } - __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float reduce_add(const Vec3fx& v) { - const vfloat4 a(v.m128); - const vfloat4 b = shuffle<1>(a); - const vfloat4 c = shuffle<2>(a); - return _mm_cvtss_f32(a+b+c); - } - - __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } - __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } - __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } - __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } - - __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } - __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } - __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } - __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } - __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } - __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } - - __forceinline bool isvalid ( const Vec3fx& v ) { - return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); - } - - __forceinline bool is_finite ( const Vec3fx& a ) { - return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX))); - } - - __forceinline bool isvalid4 ( const Vec3fx& v ) { - return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); - } - - __forceinline bool is_finite4 ( const Vec3fx& a ) { - return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE4_1__) - __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { - return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); - } -#else - __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { - return reduce_add(a*b); - } -#endif - - __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) - { - vfloat4 a0 = vfloat4(a.m128); - vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); - vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); - vfloat4 b1 = vfloat4(b.m128); - return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); - } - - __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); } - __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); } - __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); } - __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); } - __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); } - __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); } - __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } - __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); } - - __forceinline Vec3fx normalize_safe( const Vec3fx& a ) { - const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); - } - - /*! differentiated normalization */ - __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) - { - const float pp = dot(p,p); - const float pdp = dot(p,dp); - return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { - __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); - return blendv_ps(f.m128, t.m128, mask); - } - - __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { - return blendv_ps(f.m128, t.m128, s); - } - - __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { - return madd(1.0f-t,v0,t*v1); - } - - __forceinline int maxDim ( const Vec3fx& a ) - { - const Vec3fx b = abs(a); - if (b.x > b.y) { - if (b.x > b.z) return 0; else return 2; - } else { - if (b.y > b.z) return 1; else return 2; - } - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - -#if defined (__SSE4_1__) && !defined(__aarch64__) - __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } - __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } - __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } -#else - __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } - __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } - __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { - return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; - } - - - typedef Vec3fx Vec3ff; -} diff --git a/thirdparty/embree-aarch64/common/math/vec3ia.h b/thirdparty/embree-aarch64/common/math/vec3ia.h deleted file mode 100644 index 737f67fd72..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec3ia.h +++ /dev/null @@ -1,210 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/alloc.h" -#include "math.h" -#include "../simd/sse.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// SSE Vec3ia Type - //////////////////////////////////////////////////////////////////////////////// - - struct __aligned(16) Vec3ia - { - ALIGNED_STRUCT_(16); - - union { - __m128i m128; - struct { int x,y,z; }; - }; - - typedef int Scalar; - enum { N = 3 }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia( ) {} - __forceinline Vec3ia( const __m128i a ) : m128(a) {} - __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {} - __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; } - - __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} - __forceinline Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} - __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} - - __forceinline operator const __m128i&() const { return m128; } - __forceinline operator __m128i&() { return m128; } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia( ZeroTy ) : m128(_mm_setzero_si128()) {} - __forceinline Vec3ia( OneTy ) : m128(_mm_set1_epi32(1)) {} - __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} - __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } - __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } - }; - - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } - __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } -#if (defined(__aarch64__)) - __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); } -#elif defined(__SSSE3__) - __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); } - __forceinline Vec3ia operator +( const Vec3ia& a, const int b ) { return a+Vec3ia(b); } - __forceinline Vec3ia operator +( const int a, const Vec3ia& b ) { return Vec3ia(a)+b; } - - __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); } - __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } - __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } - __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); } - __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } -#endif - - __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); } - __forceinline Vec3ia operator &( const Vec3ia& a, const int b ) { return a & Vec3ia(b); } - __forceinline Vec3ia operator &( const int a, const Vec3ia& b ) { return Vec3ia(a) & b; } - - __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); } - __forceinline Vec3ia operator |( const Vec3ia& a, const int b ) { return a | Vec3ia(b); } - __forceinline Vec3ia operator |( const int a, const Vec3ia& b ) { return Vec3ia(a) | b; } - - __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); } - __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); } - __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; } - -#if !defined(__ARM_NEON) - __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); } - __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } - - __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } - __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } - __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; } - __forceinline Vec3ia& operator +=( Vec3ia& a, const int& b ) { return a = a + b; } - - __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; } - __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } - __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } -#endif - - __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; } - __forceinline Vec3ia& operator &=( Vec3ia& a, const int& b ) { return a = a & b; } - - __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } - __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } - -#if !defined(__ARM_NEON) - __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } - __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) - __forceinline int reduce_add(const Vec3ia& v) { - int32x4_t t = v.m128; - t[3] = 0; - return vaddvq_s32(t); - - } - __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } - __forceinline int reduce_min(const Vec3ia& v) { - int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0); - return vminvq_s32(t); - - } - __forceinline int reduce_max(const Vec3ia& v) { - int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0); - return vmaxvq_s32(t); - - } -#else - __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } - __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } - __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } - __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } -#endif - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; } - __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; } - __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - if (a.z != b.z) return a.z < b.z; - return false; - } - - __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } - __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } - __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { -#if defined(__aarch64__) || defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); -#else - return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); -#endif - } - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } - __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } -#else - __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); } - __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) { - return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; - } -} diff --git a/thirdparty/embree-aarch64/common/math/vec4.h b/thirdparty/embree-aarch64/common/math/vec4.h deleted file mode 100644 index d16542f507..0000000000 --- a/thirdparty/embree-aarch64/common/math/vec4.h +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" -#include "vec3.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// Generic 4D vector Class - //////////////////////////////////////////////////////////////////////////////// - - template struct Vec4 - { - enum { N = 4 }; - union { - struct { T x, y, z, w; }; -#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler - T components[N]; -#endif - }; - - typedef T Scalar; - - //////////////////////////////////////////////////////////////////////////////// - /// Construction - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec4( ) {} - __forceinline explicit Vec4( const T& a ) : x(a), y(a), z(a), w(a) {} - __forceinline Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {} - __forceinline Vec4( const Vec3& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} - - __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; } - __forceinline Vec4( const Vec3fx& other ); - - template __forceinline Vec4( const Vec4& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {} - template __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } - - __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } - - __forceinline operator Vec3 () const { return Vec3(x,y,z); } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec4( ZeroTy ) : x(zero), y(zero), z(zero), w(zero) {} - __forceinline Vec4( OneTy ) : x(one), y(one), z(one), w(one) {} - __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {} - __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {} - -#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler - __forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; } - __forceinline T& operator [](const size_t axis) { assert(axis < 4); return (&x)[axis]; } -#else - __forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; } - __forceinline T& operator [](const size_t axis) { assert(axis < 4); return components[axis]; } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Swizzles - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3 xyz() const { return Vec3(x, y, z); } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec4 operator +( const Vec4& a ) { return Vec4(+a.x, +a.y, +a.z, +a.w); } - template __forceinline Vec4 operator -( const Vec4& a ) { return Vec4(-a.x, -a.y, -a.z, -a.w); } - template __forceinline Vec4 abs ( const Vec4& a ) { return Vec4(abs (a.x), abs (a.y), abs (a.z), abs (a.w)); } - template __forceinline Vec4 rcp ( const Vec4& a ) { return Vec4(rcp (a.x), rcp (a.y), rcp (a.z), rcp (a.w)); } - template __forceinline Vec4 rsqrt ( const Vec4& a ) { return Vec4(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); } - template __forceinline Vec4 sqrt ( const Vec4& a ) { return Vec4(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec4 operator +( const Vec4& a, const Vec4& b ) { return Vec4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } - template __forceinline Vec4 operator -( const Vec4& a, const Vec4& b ) { return Vec4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } - template __forceinline Vec4 operator *( const Vec4& a, const Vec4& b ) { return Vec4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } - template __forceinline Vec4 operator *( const T& a, const Vec4& b ) { return Vec4(a * b.x, a * b.y, a * b.z, a * b.w); } - template __forceinline Vec4 operator *( const Vec4& a, const T& b ) { return Vec4(a.x * b , a.y * b , a.z * b , a.w * b ); } - template __forceinline Vec4 operator /( const Vec4& a, const Vec4& b ) { return Vec4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); } - template __forceinline Vec4 operator /( const Vec4& a, const T& b ) { return Vec4(a.x / b , a.y / b , a.z / b , a.w / b ); } - template __forceinline Vec4 operator /( const T& a, const Vec4& b ) { return Vec4(a / b.x, a / b.y, a / b.z, a / b.w); } - - template __forceinline Vec4 min(const Vec4& a, const Vec4& b) { return Vec4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); } - template __forceinline Vec4 max(const Vec4& a, const Vec4& b) { return Vec4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec4 madd ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); } - template __forceinline Vec4 msub ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); } - template __forceinline Vec4 nmadd ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); } - template __forceinline Vec4 nmsub ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); } - - template __forceinline Vec4 madd ( const T& a, const Vec4& b, const Vec4& c) { return Vec4( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); } - template __forceinline Vec4 msub ( const T& a, const Vec4& b, const Vec4& c) { return Vec4( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); } - template __forceinline Vec4 nmadd ( const T& a, const Vec4& b, const Vec4& c) { return Vec4(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); } - template __forceinline Vec4 nmsub ( const T& a, const Vec4& b, const Vec4& c) { return Vec4(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec4& operator +=( Vec4& a, const Vec4& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } - template __forceinline Vec4& operator -=( Vec4& a, const Vec4& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } - template __forceinline Vec4& operator *=( Vec4& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; a.w *= b ; return a; } - template __forceinline Vec4& operator /=( Vec4& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; a.w /= b ; return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline T reduce_add( const Vec4& a ) { return a.x + a.y + a.z + a.w; } - template __forceinline T reduce_mul( const Vec4& a ) { return a.x * a.y * a.z * a.w; } - template __forceinline T reduce_min( const Vec4& a ) { return min(a.x, a.y, a.z, a.w); } - template __forceinline T reduce_max( const Vec4& a ) { return max(a.x, a.y, a.z, a.w); } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline bool operator ==( const Vec4& a, const Vec4& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } - template __forceinline bool operator !=( const Vec4& a, const Vec4& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; } - template __forceinline bool operator < ( const Vec4& a, const Vec4& b ) { - if (a.x != b.x) return a.x < b.x; - if (a.y != b.y) return a.y < b.y; - if (a.z != b.z) return a.z < b.z; - if (a.w != b.w) return a.w < b.w; - return false; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Shift Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec4 shift_right_1( const Vec4& a ) { - return Vec4(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline T dot ( const Vec4& a, const Vec4& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); } - - template __forceinline T length ( const Vec4& a ) { return sqrt(dot(a,a)); } - template __forceinline Vec4 normalize( const Vec4& a ) { return a*rsqrt(dot(a,a)); } - template __forceinline T distance ( const Vec4& a, const Vec4& b ) { return length(a-b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline Vec4 select ( bool s, const Vec4& t, const Vec4& f ) { - return Vec4(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); - } - - template __forceinline Vec4 select ( const Vec4& s, const Vec4& t, const Vec4& f ) { - return Vec4(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w)); - } - - template __forceinline Vec4 select ( const typename T::Bool& s, const Vec4& t, const Vec4& f ) { - return Vec4(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); - } - - template - __forceinline Vec4 lerp(const Vec4& v0, const Vec4& v1, const T& t) { - return madd(Vec4(T(1.0f)-t),v0,t*v1); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - template __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4& a) { - return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")"; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Default template instantiations - //////////////////////////////////////////////////////////////////////////////// - - typedef Vec4 Vec4b; - typedef Vec4 Vec4uc; - typedef Vec4 Vec4i; - typedef Vec4 Vec4f; -} - -#include "vec3ba.h" -#include "vec3ia.h" -#include "vec3fa.h" - -//////////////////////////////////////////////////////////////////////////////// -/// SSE / AVX / MIC specializations -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__SSE__) || defined(__ARM_NEON) -#include "../simd/sse.h" -#endif - -#if defined __AVX__ -#include "../simd/avx.h" -#endif - -#if defined __AVX512F__ -#include "../simd/avx512.h" -#endif - -namespace embree -{ - template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } - -#if defined(__AVX__) - template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { - x = a.x; y = a.y; z = a.z; w = a.w; - } -#elif defined(__SSE__) || defined(__ARM_NEON) - template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { - const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); - } -#endif - -#if defined(__SSE__) || defined(__ARM_NEON) - __forceinline Vec4 broadcast4f( const Vec4& a, const size_t k ) { - return Vec4(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k])); - } -#endif - -#if defined(__AVX__) - template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { - x = a.x; y = a.y; z = a.z; w = a.w; - } - __forceinline Vec4 broadcast4f( const Vec4& a, const size_t k ) { - return Vec4(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k])); - } - __forceinline Vec4 broadcast8f( const Vec4& a, const size_t k ) { - return Vec4(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k])); - } - __forceinline Vec4 broadcast8f( const Vec4& a, const size_t k ) { - return Vec4(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k])); - } -#endif - -#if defined(__AVX512F__) - template<> __forceinline Vec4::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {} -#endif -} diff --git a/thirdparty/embree-aarch64/common/simd/avx.h b/thirdparty/embree-aarch64/common/simd/avx.h deleted file mode 100644 index c840e41805..0000000000 --- a/thirdparty/embree-aarch64/common/simd/avx.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "sse.h" - -#if defined(__AVX512VL__) -#include "vboolf8_avx512.h" -#include "vboold4_avx512.h" -#else -#include "vboolf8_avx.h" -#include "vboold4_avx.h" -#endif - -#if defined(__AVX2__) -#include "vint8_avx2.h" -#include "vuint8_avx2.h" -#if defined(__X86_64__) -#include "vllong4_avx2.h" -#endif -#else -#include "vint8_avx.h" -#include "vuint8_avx.h" -#endif -#include "vfloat8_avx.h" -#if defined(__X86_64__) -#include "vdouble4_avx.h" -#endif - -#if defined(__AVX512F__) -#include "avx512.h" -#endif - diff --git a/thirdparty/embree-aarch64/common/simd/avx512.h b/thirdparty/embree-aarch64/common/simd/avx512.h deleted file mode 100644 index 25414ab5b1..0000000000 --- a/thirdparty/embree-aarch64/common/simd/avx512.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/intrinsics.h" -#include "../math/constants.h" -#include "../sys/alloc.h" -#include "varying.h" - -#include "vboolf16_avx512.h" -#include "vint16_avx512.h" -#include "vuint16_avx512.h" -#include "vfloat16_avx512.h" - -#include "vboold8_avx512.h" -#include "vllong8_avx512.h" -#include "vdouble8_avx512.h" - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// Prefetching - //////////////////////////////////////////////////////////////////////////////// - -#define PFHINT_L1 0 -#define PFHINT_L2 1 -#define PFHINT_NT 2 - - template - __forceinline void prefetch(const void * __restrict__ const m) - { - if (mode == PFHINT_L1) - _mm_prefetch((const char*)m,_MM_HINT_T0); - else if (mode == PFHINT_L2) - _mm_prefetch((const char*)m,_MM_HINT_T1); - else if (mode == PFHINT_NT) - _mm_prefetch((const char*)m,_MM_HINT_NTA); - } -} diff --git a/thirdparty/embree-aarch64/common/simd/simd.h b/thirdparty/embree-aarch64/common/simd/simd.h deleted file mode 100644 index 647851110b..0000000000 --- a/thirdparty/embree-aarch64/common/simd/simd.h +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../math/math.h" - -/* include SSE wrapper classes */ -#if defined(__SSE__) || defined(__ARM_NEON) -# include "sse.h" -#endif - -/* include AVX wrapper classes */ -#if defined(__AVX__) -# include "avx.h" -#endif - -/* include AVX512 wrapper classes */ -#if defined (__AVX512F__) -# include "avx512.h" -#endif - -namespace embree -{ - template - __forceinline vbool isfinite(const vfloat& v) - { - return (v >= vfloat(-std::numeric_limits::max())) - & (v <= vfloat( std::numeric_limits::max())); - } - - /* foreach unique */ - template - __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure) - { - vbool valid1 = valid0; - while (any(valid1)) { - const int j = int(bsf(movemask(valid1))); - const int i = vi[j]; - const vbool valid2 = valid1 & (i == vi); - valid1 = andn(valid1, valid2); - closure(valid2, i); - } - } - - /* returns the next unique value i in vi and the corresponding valid_i mask */ - template - __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) - { - assert(any(valid)); - const int j = int(bsf(movemask(valid))); - const int i = vi[j]; - valid_i = valid & (i == vi); - valid = andn(valid, valid_i); - return i; - } - - /* foreach unique index */ - template - __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure) - { - vbool valid1 = valid0; - while (any(valid1)) { - const int j = int(bsf(movemask(valid1))); - const int i = vi[j]; - const vbool valid2 = valid1 & (i == vi); - valid1 = andn(valid1, valid2); - closure(valid2, i, j); - } - } - - /* returns the index of the next unique value i in vi and the corresponding valid_i mask */ - template - __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) - { - assert(any(valid)); - const int j = int(bsf(movemask(valid))); - const int i = vi[j]; - valid_i = valid & (i == vi); - valid = andn(valid, valid_i); - return j; - } - - template - __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure) - { - __aligned(64) int U[2*VSIZEX]; - __aligned(64) int V[2*VSIZEX]; - int index = 0; - for (int y=y0; y=y1; - const vintx vy = y; - for (int x=x0; x= x1; - vintx vx = x+vintx(step); - vintx::storeu(&U[index], vx); - vintx::storeu(&V[index], vy); - const int dx = min(x1-x,VSIZEX); - index += dx; - x += dx; - if (index >= VSIZEX || (lastx && lasty)) { - const vboolx valid = vintx(step) < vintx(index); - closure(valid, vintx::load(U), vintx::load(V)); - x-= max(0, index-VSIZEX); - index = 0; - } - } - } - } -} diff --git a/thirdparty/embree-aarch64/common/simd/sse.cpp b/thirdparty/embree-aarch64/common/simd/sse.cpp deleted file mode 100644 index 1732cfa421..0000000000 --- a/thirdparty/embree-aarch64/common/simd/sse.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "sse.h" - -namespace embree -{ - const __m128 mm_lookupmask_ps[16] = { - _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), - _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), - _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), - _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), - _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), - _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), - _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), - _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) - }; - - const __m128d mm_lookupmask_pd[4] = { - _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)), - _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)), - _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)), - _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1)) - }; - -} diff --git a/thirdparty/embree-aarch64/common/simd/sse.h b/thirdparty/embree-aarch64/common/simd/sse.h deleted file mode 100644 index 6bc818b55b..0000000000 --- a/thirdparty/embree-aarch64/common/simd/sse.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/intrinsics.h" -#include "../sys/alloc.h" -#include "../math/constants.h" -#include "varying.h" - -namespace embree -{ -#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) - __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { - return _mm_blendv_ps(f,t,mask); - } -#else - __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { - return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f)); - } -#endif - - extern const __m128 mm_lookupmask_ps[16]; - extern const __m128d mm_lookupmask_pd[4]; -} - -#if defined(__AVX512VL__) -#include "vboolf4_avx512.h" -#else -#include "vboolf4_sse2.h" -#endif -#include "vint4_sse2.h" -#include "vuint4_sse2.h" -#include "vfloat4_sse2.h" diff --git a/thirdparty/embree-aarch64/common/simd/varying.h b/thirdparty/embree-aarch64/common/simd/varying.h deleted file mode 100644 index 9a46817da9..0000000000 --- a/thirdparty/embree-aarch64/common/simd/varying.h +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" - -namespace embree -{ - /* Varying numeric types */ - template - struct vfloat - { - union { float f[N]; int i[N]; }; - __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; } - __forceinline float& operator [](size_t index) { assert(index < N); return f[index]; } - }; - - template - struct vdouble - { - union { double f[N]; long long i[N]; }; - __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; } - __forceinline double& operator [](size_t index) { assert(index < N); return f[index]; } - }; - - template - struct vint - { - int i[N]; - __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; } - __forceinline int& operator [](size_t index) { assert(index < N); return i[index]; } - }; - - template - struct vuint - { - unsigned int i[N]; - __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; } - __forceinline unsigned int& operator [](size_t index) { assert(index < N); return i[index]; } - }; - - template - struct vllong - { - long long i[N]; - __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; } - __forceinline long long& operator [](size_t index) { assert(index < N); return i[index]; } - }; - - /* Varying bool types */ - template struct vboolf { int i[N]; }; // for float/int - template struct vboold { long long i[N]; }; // for double/long long - - /* Aliases to default types */ - template using vreal = vfloat; - template using vbool = vboolf; - - /* Varying size constants */ -#if defined(__AVX512VL__) // SKX - const int VSIZEX = 8; // default size - const int VSIZEL = 16; // large size -#elif defined(__AVX512F__) // KNL - const int VSIZEX = 16; - const int VSIZEL = 16; -#elif defined(__AVX__) - const int VSIZEX = 8; - const int VSIZEL = 8; -#else - const int VSIZEX = 4; - const int VSIZEL = 4; -#endif - - /* Extends varying size N to optimal or up to max(N, N2) */ - template - struct vextend - { -#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL - /* use 16-wide SIMD calculations on KNL even for 4 and 8 wide SIMD */ - static const int size = (N2 == VSIZEX) ? VSIZEX : N; - #define SIMD_MODE(N) N, 16 -#else - /* calculate with same SIMD width otherwise */ - static const int size = N; - #define SIMD_MODE(N) N, N -#endif - }; - - /* 4-wide shortcuts */ - typedef vfloat<4> vfloat4; - typedef vdouble<4> vdouble4; - typedef vreal<4> vreal4; - typedef vint<4> vint4; - typedef vuint<4> vuint4; - typedef vllong<4> vllong4; - typedef vbool<4> vbool4; - typedef vboolf<4> vboolf4; - typedef vboold<4> vboold4; - - /* 8-wide shortcuts */ - typedef vfloat<8> vfloat8; - typedef vdouble<8> vdouble8; - typedef vreal<8> vreal8; - typedef vint<8> vint8; - typedef vuint<8> vuint8; - typedef vllong<8> vllong8; - typedef vbool<8> vbool8; - typedef vboolf<8> vboolf8; - typedef vboold<8> vboold8; - - /* 16-wide shortcuts */ - typedef vfloat<16> vfloat16; - typedef vdouble<16> vdouble16; - typedef vreal<16> vreal16; - typedef vint<16> vint16; - typedef vuint<16> vuint16; - typedef vllong<16> vllong16; - typedef vbool<16> vbool16; - typedef vboolf<16> vboolf16; - typedef vboold<16> vboold16; - - /* Default shortcuts */ - typedef vfloat vfloatx; - typedef vdouble vdoublex; - typedef vreal vrealx; - typedef vint vintx; - typedef vuint vuintx; - typedef vllong vllongx; - typedef vbool vboolx; - typedef vboolf vboolfx; - typedef vboold vbooldx; -} diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h deleted file mode 100644 index 6505ee56f3..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboold4_avx.h +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX bool type for 64bit data types*/ - template<> - struct vboold<4> - { - ALIGNED_STRUCT_(32); - - typedef vboold4 Bool; - - enum { size = 4 }; // number of SIMD elements - union { // data - __m256d v; - struct { __m128d vl,vh; }; - long long i[4]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold() {} - __forceinline vboold(const vboold4& a) { v = a.v; } - __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; } - - __forceinline vboold(__m256d a) : v(a) {} - __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {} - - __forceinline operator const __m256() const { return _mm256_castpd_ps(v); } - __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); } - __forceinline operator const __m256d() const { return v; } - - __forceinline vboold(int a) - { - assert(a >= 0 && a <= 255); -#if defined (__AVX2__) - const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); - const __m256i b = _mm256_set1_epi64x(a); - const __m256i c = _mm256_and_si256(b,mask); - v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask)); -#else - vl = mm_lookupmask_pd[a & 0x3]; - vh = mm_lookupmask_pd[a >> 2]; -#endif - } - - __forceinline vboold(__m128d a, __m128d b) : vl(a), vh(b) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} -#if !defined(__aarch64__) - __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} -#else - __forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {} -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; } - __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); } - __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); } - __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } - - __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } - - __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } - __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } - __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } - __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } - - __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { - return _mm256_blendv_pd(f, t, mask); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - -#if !defined(__aarch64__) - __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } - __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } -#endif - -#if defined(__AVX2__) - template - __forceinline vboold4 shuffle(const vboold4& v) { - return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template - __forceinline vboold4 shuffle(const vboold4& v) { - return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i)); - } -#endif - - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } - __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); } - - __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } - __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); } - __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; } - - __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } - __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); } - __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); } - - __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); } - __forceinline size_t popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboold4& a, size_t index) { return a[index]; } - __forceinline void set (vboold4& a, size_t index) { a[index] = -1; } - __forceinline void clear(vboold4& a, size_t index) { a[index] = 0; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " - << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h deleted file mode 100644 index 4fe730d713..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX-512 bool type */ - template<> - struct vboold<4> - { - typedef vboold4 Bool; - typedef vint4 Int; - - enum { size = 4 }; // number of SIMD elements - __mmask8 v; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold() {} - __forceinline vboold(const vboold4& t) { v = t.v; } - __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; } - - __forceinline vboold(const __mmask8 &t) { v = t; } - __forceinline operator __mmask8() const { return v; } - - __forceinline vboold(bool b) { v = b ? 0xf : 0x0; } - __forceinline vboold(int t) { v = (__mmask8)t; } - __forceinline vboold(unsigned int t) { v = (__mmask8)t; } - - /* return int8 mask */ - __forceinline __m128i mask8() const { - return _mm_movm_epi8(v); - } - - /* return int32 mask */ - __forceinline __m128i mask32() const { - return _mm_movm_epi32(v); - } - - /* return int64 mask */ - __forceinline __m256i mask64() const { - return _mm256_movm_epi64(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold(FalseTy) : v(0x0) {} - __forceinline vboold(TrueTy) : v(0xf) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 4); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); } - __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); } - __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } - - __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } - __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } - __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } - __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } - - __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { - return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboold4& a) { return a.v == 0xf; } - __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } - __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } - - __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } - __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); } - __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboold4& a) { return popcnt(a.v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Conversion Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } - __forceinline void set(vboold4& a, size_t index) { assert(index < 4); a |= 1 << index; } - __forceinline void clear(vboold4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) - { - cout << "<"; - for (size_t i=0; i<4; i++) { - if ((a.v >> i) & 1) cout << "1"; else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h deleted file mode 100644 index fdf3f00de5..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX-512 bool type */ - template<> - struct vboold<8> - { - typedef vboold8 Bool; - typedef vint8 Int; - - enum { size = 8 }; // number of SIMD elements - __mmask8 v; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold() {} - __forceinline vboold(const vboold8& t) { v = t.v; } - __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; } - - __forceinline vboold(const __mmask8& t) { v = t; } - __forceinline operator __mmask8() const { return v; } - - __forceinline vboold(bool b) { v = b ? 0xff : 0x00; } - __forceinline vboold(int t) { v = (__mmask8)t; } - __forceinline vboold(unsigned int t) { v = (__mmask8)t; } - - /* return int8 mask */ - __forceinline __m128i mask8() const { -#if defined(__AVX512BW__) - return _mm_movm_epi8(v); -#else - const __m512i f = _mm512_set1_epi64(0); - const __m512i t = _mm512_set1_epi64(-1); - const __m512i m = _mm512_mask_or_epi64(f,v,t,t); - return _mm512_cvtepi64_epi8(m); -#endif - } - - /* return int64 mask */ - __forceinline __m512i mask64() const { -#if defined(__AVX512DQ__) - return _mm512_movm_epi64(v); -#else - const __m512i f = _mm512_set1_epi64(0); - const __m512i t = _mm512_set1_epi64(-1); - return _mm512_mask_or_epi64(f,v,t,t); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold(FalseTy) : v(0x00) {} - __forceinline vboold(TrueTy) : v(0xff) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 8); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); } - __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); } - __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } - - __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; } - __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; } - __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } - __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); } - - __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { - return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboold8& a) { return a.v == 0xff; } - __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; } - __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; } - - __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); } - __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); } - __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboold8& a) { return popcnt(a.v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Conversion Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } - __forceinline void set(vboold8& a, size_t index) { assert(index < 8); a |= 1 << index; } - __forceinline void clear(vboold8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a) - { - cout << "<"; - for (size_t i=0; i<8; i++) { - if ((a.v >> i) & 1) cout << "1"; else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h deleted file mode 100644 index 238cdc8eb9..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 16-wide AVX-512 bool type */ - template<> - struct vboolf<16> - { - typedef vboolf16 Bool; - typedef vint16 Int; - typedef vfloat16 Float; - - enum { size = 16 }; // number of SIMD elements - __mmask16 v; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf16& t) { v = t.v; } - __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; } - - __forceinline vboolf(const __mmask16& t) { v = t; } - __forceinline operator __mmask16() const { return v; } - - __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; } - __forceinline vboolf(int t) { v = (__mmask16)t; } - __forceinline vboolf(unsigned int t) { v = (__mmask16)t; } - - /* return int8 mask */ - __forceinline __m128i mask8() const { -#if defined(__AVX512BW__) - return _mm_movm_epi8(v); -#else - const __m512i f = _mm512_set1_epi32(0); - const __m512i t = _mm512_set1_epi32(-1); - const __m512i m = _mm512_mask_or_epi32(f,v,t,t); - return _mm512_cvtepi32_epi8(m); -#endif - } - - /* return int32 mask */ - __forceinline __m512i mask32() const { -#if defined(__AVX512DQ__) - return _mm512_movm_epi32(v); -#else - const __m512i f = _mm512_set1_epi32(0); - const __m512i t = _mm512_set1_epi32(-1); - return _mm512_mask_or_epi32(f,v,t,t); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(0x0000) {} - __forceinline vboolf(TrueTy) : v(0xffff) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 16); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); } - __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); } - __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); } - - __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; } - __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; } - __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); } - __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); } - - __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) { - return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a,a) != 0; } - __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a,a) == 0; } - __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a,a) != 0; } - - __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); } - __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); } - __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Convertion Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } - __forceinline vboolf16 toMask(const int& a) { return mm512_int2mask(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; } - __forceinline void set(vboolf16& a, size_t index) { assert(index < 16); a |= 1 << index; } - __forceinline void clear(vboolf16& a, size_t index) { assert(index < 16); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a) - { - cout << "<"; - for (size_t i=0; i<16; i++) { - if ((a.v >> i) & 1) cout << "1"; else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h deleted file mode 100644 index 2ae4c4470e..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX-512 bool type */ - template<> - struct vboolf<4> - { - typedef vboolf4 Bool; - typedef vint4 Int; - - enum { size = 4 }; // number of SIMD elements - __mmask8 v; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf4& t) { v = t.v; } - __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; } - - __forceinline vboolf(const __mmask8 &t) { v = t; } - __forceinline operator __mmask8() const { return v; } - - __forceinline vboolf(bool b) { v = b ? 0xf : 0x0; } - __forceinline vboolf(int t) { v = (__mmask8)t; } - __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } - - __forceinline vboolf(bool a, bool b, bool c, bool d) - : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} - - /* return int8 mask */ - __forceinline __m128i mask8() const { - return _mm_movm_epi8(v); - } - - /* return int32 mask */ - __forceinline __m128i mask32() const { - return _mm_movm_epi32(v); - } - - /* return int64 mask */ - __forceinline __m256i mask64() const { - return _mm256_movm_epi64(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(0x0) {} - __forceinline vboolf(TrueTy) : v(0xf) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 4); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); } - __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); } - __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } - - __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } - __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } - __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } - __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } - - __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) { - return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboolf4& a) { return a.v == 0xf; } - __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } - __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } - - __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } - __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } - __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboolf4& a) { return popcnt(a.v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Conversion Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } - __forceinline void set(vboolf4& a, size_t index) { assert(index < 4); a |= 1 << index; } - __forceinline void clear(vboolf4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) - { - cout << "<"; - for (size_t i=0; i<4; i++) { - if ((a.v >> i) & 1) cout << "1"; else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h deleted file mode 100644 index ed53b3c783..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide SSE bool type */ - template<> - struct vboolf<4> - { - ALIGNED_STRUCT_(16); - - typedef vboolf4 Bool; - typedef vint4 Int; - typedef vfloat4 Float; - - enum { size = 4 }; // number of SIMD elements - union { __m128 v; int i[4]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf4& other) { v = other.v; } - __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; } - - __forceinline vboolf(__m128 input) : v(input) {} - __forceinline operator const __m128&() const { return v; } - __forceinline operator const __m128i() const { return _mm_castps_si128(v); } - __forceinline operator const __m128d() const { return _mm_castps_pd(v); } - - __forceinline vboolf(bool a) - : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} - __forceinline vboolf(bool a, bool b) - : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} - __forceinline vboolf(bool a, bool b, bool c, bool d) - : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} -#if defined(__aarch64__) && defined(BUILD_IOS) - __forceinline vboolf(int mask) { v = mm_lookupmask_ps[mask]; } - __forceinline vboolf(unsigned int mask) { v = mm_lookupmask_ps[mask]; } -#else - __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } - __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; } -#endif - /* return int32 mask */ - __forceinline __m128i mask32() const { - return _mm_castps_si128(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {} - __forceinline vboolf(TrueTy) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(BUILD_IOS) - __forceinline bool operator [](size_t index) const { return (_mm_movemask_ps(v) >> index) & 1; } - __forceinline int& operator [](size_t index) { return i[index]; } -#else - __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } - __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } -#endif - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); } - __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); } - __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } - - __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } - __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } - __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } - __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } - - __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { -#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) - return _mm_blendv_ps(f, t, m); -#else - return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } - __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } - -#if defined(__aarch64__) - template - __forceinline vboolf4 shuffle(const vboolf4& v) { - return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3))); - } - - template - __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { - return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); - } -#else - template - __forceinline vboolf4 shuffle(const vboolf4& v) { - return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { - return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); - } -#endif - - template - __forceinline vboolf4 shuffle(const vboolf4& v) { - return shuffle(v); - } - -#if defined(__SSE3__) - template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); } - template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); } - template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } -#endif - -#if defined(__SSE4_1__) && !defined(__aarch64__) - template __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } - template __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert(a, b); } - template __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert(a, vboolf4(b)); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; } - __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; } - - __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; } - __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; } - __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; } - - __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } - __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } - __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } -#if defined(__aarch64__) && defined(BUILD_IOS) -__forceinline size_t popcnt(const vboolf4& a) { return _mm_movemask_popcnt_ps(a); } -#else -#if defined(__SSE4_2__) - __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } -#else - __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } -#endif -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; } - __forceinline void set(vboolf4& a, size_t index) { a[index] = -1; } - __forceinline void clear(vboolf4& a, size_t index) { a[index] = 0; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h deleted file mode 100644 index 4f64741b55..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX bool type */ - template<> - struct vboolf<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256 v; - struct { __m128 vl,vh; }; - int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf8& a) { v = a.v; } - __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; } - - __forceinline vboolf(__m256 a) : v(a) {} - __forceinline operator const __m256&() const { return v; } - __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } - __forceinline operator const __m256d() const { return _mm256_castps_pd(v); } - - __forceinline vboolf(int a) - { - assert(a >= 0 && a <= 255); -#if defined (__AVX2__) - const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); - const __m256i b = _mm256_set1_epi32(a); - const __m256i c = _mm256_and_si256(b,mask); - v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask)); -#else - vl = mm_lookupmask_ps[a & 0xF]; - vh = mm_lookupmask_ps[a >> 4]; -#endif - } - - __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} - __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} - __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {} - - __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {} - __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {} - __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} - __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} - - /* return int32 mask */ - __forceinline __m256i mask32() const { - return _mm256_castps_si256(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {} -#if !defined(__aarch64__) - __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} -#else - __forceinline vboolf(TrueTy) : v(_mm256_cmpeq_ps(_mm256_setzero_ps(), _mm256_setzero_ps())) {} -#endif - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } - __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); } - __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); } - __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } - - __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); } - - __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } - __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } - __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } - __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); } - - __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) { - return _mm256_blendv_ps(f, t, mask); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); } - __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); } - - template - __forceinline vboolf8 shuffle(const vboolf8& v) { - return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); - } - - template - __forceinline vboolf8 shuffle4(const vboolf8& v) { - return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) { - return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vboolf8 shuffle(const vboolf8& v) { - return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template - __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) { - return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); } - template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); } - template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } - - template __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); } - template __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a, i); } - template<> __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } - __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); } - - __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } - __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); } - __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; } - - __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } - __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } - __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } - - __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); } - __forceinline size_t popcnt (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; } - __forceinline void set(vboolf8& a, size_t index) { a[index] = -1; } - __forceinline void clear(vboolf8& a, size_t index) { a[index] = 0; } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " - << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h deleted file mode 100644 index 2a52b554c7..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX-512 bool type */ - template<> - struct vboolf<8> - { - typedef vboolf8 Bool; - typedef vint8 Int; - - enum { size = 8 }; // number of SIMD elements - __mmask8 v; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf() {} - __forceinline vboolf(const vboolf8& t) { v = t.v; } - __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; } - - __forceinline vboolf(const __mmask8 &t) { v = t; } - __forceinline operator __mmask8() const { return v; } - - __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; } - __forceinline vboolf(int t) { v = (__mmask8)t; } - __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } - - __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) - : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} - - /* return int8 mask */ - __forceinline __m128i mask8() const { - return _mm_movm_epi8(v); - } - - /* return int32 mask */ - __forceinline __m256i mask32() const { - return _mm256_movm_epi32(v); - } - - /* return int64 mask */ - __forceinline __m512i mask64() const { - return _mm512_movm_epi64(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf(FalseTy) : v(0x00) {} - __forceinline vboolf(TrueTy) : v(0xff) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator [](size_t index) const { - assert(index < 8); return (mm512_mask2int(v) >> index) & 1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); } - __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); } - __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } - - __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } - __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } - __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } - __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); } - - __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) { - return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reduction Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int all (const vboolf8& a) { return a.v == 0xff; } - __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; } - __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; } - - __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } - __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } - __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } - - __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); } - __forceinline size_t popcnt (const vboolf8& a) { return popcnt(a.v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Conversion Operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Get/Set Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } - __forceinline void set(vboolf8& a, size_t index) { assert(index < 8); a |= 1 << index; } - __forceinline void clear(vboolf8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) - { - cout << "<"; - for (size_t i=0; i<8; i++) { - if ((a.v >> i) & 1) cout << "1"; else cout << "0"; - } - return cout << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h deleted file mode 100644 index 1f65b45d7e..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h +++ /dev/null @@ -1,324 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX 64-bit double type */ - template<> - struct vdouble<4> - { - ALIGNED_STRUCT_(32); - - typedef vboold4 Bool; - - enum { size = 4 }; // number of SIMD elements - union { // data - __m256d v; - double i[4]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble() {} - __forceinline vdouble(const vdouble4& t) { v = t.v; } - __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; } - - __forceinline vdouble(const __m256d& t) { v = t; } - __forceinline operator __m256d() const { return v; } - - __forceinline vdouble(double i) { - v = _mm256_set1_pd(i); - } - - __forceinline vdouble(double a, double b, double c, double d) { - v = _mm256_set_pd(d,c,b,a); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {} - __forceinline vdouble(OneTy) : v(_mm256_set1_pd(1)) {} - __forceinline vdouble(StepTy) : v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {} - __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) { - _mm256_stream_pd(ptr, a); - } - - static __forceinline vdouble4 loadu(const double* addr) { - return _mm256_loadu_pd(addr); - } - - static __forceinline vdouble4 load(const vdouble4* addr) { - return _mm256_load_pd((double*)addr); - } - - static __forceinline vdouble4 load(const double* addr) { - return _mm256_load_pd(addr); - } - - static __forceinline void store(double* ptr, const vdouble4& v) { - _mm256_store_pd(ptr, v); - } - - static __forceinline void storeu(double* ptr, const vdouble4& v) { - _mm256_storeu_pd(ptr, v); - } - - static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline double& operator [](size_t index) { assert(index < 4); return i[index]; } - __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a); } - __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); } -#endif - - __forceinline vdouble4 operator +(const vdouble4& a) { return a; } - __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); } - __forceinline vdouble4 operator +(const vdouble4& a, double b) { return a + vdouble4(b); } - __forceinline vdouble4 operator +(double a, const vdouble4& b) { return vdouble4(a) + b; } - - __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); } - __forceinline vdouble4 operator -(const vdouble4& a, double b) { return a - vdouble4(b); } - __forceinline vdouble4 operator -(double a, const vdouble4& b) { return vdouble4(a) - b; } - - __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); } - __forceinline vdouble4 operator *(const vdouble4& a, double b) { return a * vdouble4(b); } - __forceinline vdouble4 operator *(double a, const vdouble4& b) { return vdouble4(a) * b; } - - __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); } - __forceinline vdouble4 operator &(const vdouble4& a, double b) { return a & vdouble4(b); } - __forceinline vdouble4 operator &(double a, const vdouble4& b) { return vdouble4(a) & b; } - - __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); } - __forceinline vdouble4 operator |(const vdouble4& a, double b) { return a | vdouble4(b); } - __forceinline vdouble4 operator |(double a, const vdouble4& b) { return vdouble4(a) | b; } - - __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); } - __forceinline vdouble4 operator ^(const vdouble4& a, double b) { return a ^ vdouble4(b); } - __forceinline vdouble4 operator ^(double a, const vdouble4& b) { return vdouble4(a) ^ b; } - - __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); } - __forceinline vdouble4 min(const vdouble4& a, double b) { return min(a,vdouble4(b)); } - __forceinline vdouble4 min(double a, const vdouble4& b) { return min(vdouble4(a),b); } - - __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); } - __forceinline vdouble4 max(const vdouble4& a, double b) { return max(a,vdouble4(b)); } - __forceinline vdouble4 max(double a, const vdouble4& b) { return max(vdouble4(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__FMA__) - __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); } - __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); } - __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); } - __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); } -#else - __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; } - __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; } - __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;} - __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; } - __forceinline vdouble4& operator +=(vdouble4& a, double b) { return a = a + b; } - - __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; } - __forceinline vdouble4& operator -=(vdouble4& a, double b) { return a = a - b; } - - __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; } - __forceinline vdouble4& operator *=(vdouble4& a, double b) { return a = a * b; } - - __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; } - __forceinline vdouble4& operator &=(vdouble4& a, double b) { return a = a & b; } - - __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; } - __forceinline vdouble4& operator |=(vdouble4& a, double b) { return a = a | b; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); } - __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); } - __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); } - __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); } - __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); } - __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); } -#elif !defined(__aarch64__) - __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } - __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } - __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } - __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } - __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } - __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } -#else - __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); } - __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); } - __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); } - __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); } - __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); } - __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b); } -#endif - - __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); } - __forceinline vboold4 operator ==(double a, const vdouble4& b) { return vdouble4(a) == b; } - - __forceinline vboold4 operator !=(const vdouble4& a, double b) { return a != vdouble4(b); } - __forceinline vboold4 operator !=(double a, const vdouble4& b) { return vdouble4(a) != b; } - - __forceinline vboold4 operator < (const vdouble4& a, double b) { return a < vdouble4(b); } - __forceinline vboold4 operator < (double a, const vdouble4& b) { return vdouble4(a) < b; } - - __forceinline vboold4 operator >=(const vdouble4& a, double b) { return a >= vdouble4(b); } - __forceinline vboold4 operator >=(double a, const vdouble4& b) { return vdouble4(a) >= b; } - - __forceinline vboold4 operator > (const vdouble4& a, double b) { return a > vdouble4(b); } - __forceinline vboold4 operator > (double a, const vdouble4& b) { return vdouble4(a) > b; } - - __forceinline vboold4 operator <=(const vdouble4& a, double b) { return a <= vdouble4(b); } - __forceinline vboold4 operator <=(double a, const vdouble4& b) { return vdouble4(a) <= b; } - - __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; } - __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; } - __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a < b; } - __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; } - __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a > b; } - __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); } - __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); } - __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a < b); } - __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); } - __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a > b); } - __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); } -#endif - - __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) { -#if defined(__AVX512VL__) - return _mm256_mask_blend_pd(m, f, t); -#else - return _mm256_blendv_pd(f, t, m); -#endif - } - - __forceinline void xchg(const vboold4& m, vdouble4& a, vdouble4& b) { - const vdouble4 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboold4 test(const vdouble4& a, const vdouble4& b) { -#if defined(__AVX512VL__) - return _mm256_test_epi64_mask(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); -#else - return _mm256_testz_si256(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vdouble4 shuffle(const vdouble4& v) { - return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); - } - - template - __forceinline vdouble4 shuffle(const vdouble4& v) { - return shuffle(v); - } - - template - __forceinline vdouble4 shuffle2(const vdouble4& v) { - return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0); - } - - __forceinline double toScalar(const vdouble4& v) { - return _mm_cvtsd_f64(_mm256_castpd256_pd128(v)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); } - __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 x = vreduce_min2(y); return min(x, shuffle2<1,0>(x)); } - - __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x,shuffle<1,0>(x)); } - __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 x = vreduce_max2(y); return max(x, shuffle2<1,0>(x)); } - - __forceinline vdouble4 vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); } - __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } - - __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); } - __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } - - __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); } - __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } - - __forceinline double reduce_add(const vdouble4& a) { return toScalar(vreduce_add(a)); } - __forceinline double reduce_min(const vdouble4& a) { return toScalar(vreduce_min(a)); } - __forceinline double reduce_max(const vdouble4& a) { return toScalar(vreduce_max(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v) - { - cout << "<" << v[0]; - for (size_t i=1; i<4; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h deleted file mode 100644 index 4eec7d2f6a..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h +++ /dev/null @@ -1,356 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX-512 64-bit double type */ - template<> - struct vdouble<8> - { - ALIGNED_STRUCT_(64); - - typedef vboold8 Bool; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m512d v; - double i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble() {} - __forceinline vdouble(const vdouble8& t) { v = t.v; } - __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; } - - __forceinline vdouble(const __m512d& t) { v = t; } - __forceinline operator __m512d() const { return v; } - __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); } - - __forceinline vdouble(double i) { - v = _mm512_set1_pd(i); - } - - __forceinline vdouble(double a, double b, double c, double d) { - v = _mm512_set4_pd(d,c,b,a); - } - - __forceinline vdouble(double a0, double a1, double a2, double a3, - double a4, double a5, double a6, double a7) - { - v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {} - __forceinline vdouble(OneTy) : v(_mm512_set1_pd(1)) {} - __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} - __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) { - _mm512_stream_pd((double*)ptr, a); - } - - static __forceinline vdouble8 loadu(const void* addr) { - return _mm512_loadu_pd((double*)addr); - } - - static __forceinline vdouble8 load(const vdouble8* addr) { - return _mm512_load_pd((double*)addr); - } - - static __forceinline vdouble8 load(const double* addr) { - return _mm512_load_pd(addr); - } - - static __forceinline void store(void* ptr, const vdouble8& v) { - _mm512_store_pd(ptr, v); - } - - static __forceinline void storeu(void* ptr, const vdouble8& v) { - _mm512_storeu_pd(ptr, v); - } - - static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) { - _mm512_mask_storeu_pd(ptr, mask, f); - } - - static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) { - _mm512_mask_store_pd(addr, mask, v2); - } - - /* pass by value to avoid compiler generating inefficient code */ - static __forceinline void storeu_compact(const vboold8 mask,void * addr, const vdouble8& reg) { - _mm512_mask_compressstoreu_pd(addr, mask, reg); - } - - static __forceinline vdouble8 compact64bit(const vboold8& mask, vdouble8& v) { - return _mm512_mask_compress_pd(v, mask, v); - } - - static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) { - return _mm512_mask_compress_pd(v, mask, v); - } - - static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) { - return _mm512_mask_compress_pd(a, mask, b); - } - - static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline double& operator [](size_t index) { assert(index < 8); return i[index]; } - __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; } - - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a); } - __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); } - - __forceinline vdouble8 operator +(const vdouble8& a) { return a; } - __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); } - __forceinline vdouble8 operator +(const vdouble8& a, double b) { return a + vdouble8(b); } - __forceinline vdouble8 operator +(double a, const vdouble8& b) { return vdouble8(a) + b; } - - __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); } - __forceinline vdouble8 operator -(const vdouble8& a, double b) { return a - vdouble8(b); } - __forceinline vdouble8 operator -(double a, const vdouble8& b) { return vdouble8(a) - b; } - - __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); } - __forceinline vdouble8 operator *(const vdouble8& a, double b) { return a * vdouble8(b); } - __forceinline vdouble8 operator *(double a, const vdouble8& b) { return vdouble8(a) * b; } - - __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); } - __forceinline vdouble8 operator &(const vdouble8& a, double b) { return a & vdouble8(b); } - __forceinline vdouble8 operator &(double a, const vdouble8& b) { return vdouble8(a) & b; } - - __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); } - __forceinline vdouble8 operator |(const vdouble8& a, double b) { return a | vdouble8(b); } - __forceinline vdouble8 operator |(double a, const vdouble8& b) { return vdouble8(a) | b; } - - __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); } - __forceinline vdouble8 operator ^(const vdouble8& a, double b) { return a ^ vdouble8(b); } - __forceinline vdouble8 operator ^(double a, const vdouble8& b) { return vdouble8(a) ^ b; } - - __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); } - __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); } - - __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); } - __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); } - - __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); } - __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); } - __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); } - - __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); } - __forceinline vdouble8 min(const vdouble8& a, double b) { return min(a,vdouble8(b)); } - __forceinline vdouble8 min(double a, const vdouble8& b) { return min(vdouble8(a),b); } - - __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); } - __forceinline vdouble8 max(const vdouble8& a, double b) { return max(a,vdouble8(b)); } - __forceinline vdouble8 max(double a, const vdouble8& b) { return max(vdouble8(a),b); } - - __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); } - __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); } - - __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); } - __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); } - __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); } - __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); } - __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; } - __forceinline vdouble8& operator +=(vdouble8& a, double b) { return a = a + b; } - - __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; } - __forceinline vdouble8& operator -=(vdouble8& a, double b) { return a = a - b; } - - __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; } - __forceinline vdouble8& operator *=(vdouble8& a, double b) { return a = a * b; } - - __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; } - __forceinline vdouble8& operator &=(vdouble8& a, double b) { return a = a & b; } - - __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; } - __forceinline vdouble8& operator |=(vdouble8& a, double b) { return a = a | b; } - - __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { return a = a << b; } - __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { return a = a >> b; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); } - __forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; } - - __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); } - __forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; } - - __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold8 operator < (const vdouble8& a, double b) { return a < vdouble8(b); } - __forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) < b; } - - __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); } - __forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; } - - __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold8 operator > (const vdouble8& a, double b) { return a > vdouble8(b); } - __forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) > b; } - - __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); } - __forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; } - - __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); } - - __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) { - return _mm512_mask_or_pd(f,m,t,t); - } - - __forceinline void xchg(const vboold8& m, vdouble8& a, vdouble8& b) { - const vdouble8 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboold8 test(const vboold8& m, const vdouble8& a, const vdouble8& b) { - return _mm512_mask_test_epi64_mask(m,_mm512_castpd_si512(a),_mm512_castpd_si512(b)); - } - - __forceinline vboold8 test(const vdouble8& a, const vdouble8& b) { - return _mm512_test_epi64_mask(_mm512_castpd_si512(a),_mm512_castpd_si512(b)); - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vdouble8 shuffle(const vdouble8& v) { - return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); - } - - template - __forceinline vdouble8 shuffle(const vdouble8& v) { - return shuffle(v); - } - - template - __forceinline vdouble8 shuffle(const vdouble8& v) { - return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template - __forceinline vdouble8 shuffle4(const vdouble8& v) { - return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); - } - - template - __forceinline vdouble8 shuffle4(const vdouble8& v) { - return shuffle4(v); - } - - template - __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) { - return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i)); - } - - __forceinline double toScalar(const vdouble8& v) { - return _mm_cvtsd_f64(_mm512_castpd512_pd128(v)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 vreduce_add2(vdouble8 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } - - __forceinline vdouble8 vreduce_min2(vdouble8 x) { return min(x, shuffle<1,0,3,2>(x)); } - __forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } - - __forceinline vdouble8 vreduce_max2(vdouble8 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } - - __forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); } - __forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); } - __forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) { - return _mm512_permutexvar_pd(index, v); - } - - __forceinline vdouble8 reverse(const vdouble8& a) { - return permute(a, vllong8(reverse_step)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v) - { - cout << "<" << v[0]; - for (size_t i=1; i<8; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h deleted file mode 100644 index aed2419b77..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h +++ /dev/null @@ -1,771 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 16-wide AVX-512 float type */ - template<> - struct vfloat<16> - { - ALIGNED_STRUCT_(64); - - typedef vboolf16 Bool; - typedef vint16 Int; - typedef vfloat16 Float; - - enum { size = 16 }; // number of SIMD elements - union { // data - __m512 v; - float f[16]; - int i[16]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat() {} - __forceinline vfloat(const vfloat16& t) { v = t; } - __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; } - - __forceinline vfloat(const __m512& t) { v = t; } - __forceinline operator __m512() const { return v; } - __forceinline operator __m256() const { return _mm512_castps512_ps256(v); } - __forceinline operator __m128() const { return _mm512_castps512_ps128(v); } - - __forceinline vfloat(float f) { - v = _mm512_set1_ps(f); - } - - __forceinline vfloat(float a, float b, float c, float d) { - v = _mm512_set4_ps(a, b, c, d); - } - - __forceinline vfloat(const vfloat4& i) { - v = _mm512_broadcast_f32x4(i); - } - - __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) { - v = _mm512_castps128_ps512(a); - v = _mm512_insertf32x4(v, b, 1); - v = _mm512_insertf32x4(v, c, 2); - v = _mm512_insertf32x4(v, d, 3); - } - - __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) { - v = _mm512_broadcast_f32x4(a); - v = _mm512_mask_broadcast_f32x4(v,mask,b); - } - - __forceinline vfloat(const vfloat8& i) { - v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i))); - } - - __forceinline vfloat(const vfloat8& a, const vfloat8& b) { - v = _mm512_castps256_ps512(a); -#if defined(__AVX512DQ__) - v = _mm512_insertf32x8(v, b, 1); -#else - v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1)); -#endif - } - - /* WARNING: due to f64x4 the mask is considered as an 8bit mask */ - __forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) { - __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a)); - aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b)); - v = _mm512_castpd_ps(aa); - } - - __forceinline explicit vfloat(const vint16& a) { - v = _mm512_cvtepi32_ps(a); - } - - __forceinline explicit vfloat(const vuint16& a) { - v = _mm512_cvtepu32_ps(a); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat(ZeroTy) : v(_mm512_setzero_ps()) {} - __forceinline vfloat(OneTy) : v(_mm512_set1_ps(1.0f)) {} - __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {} - __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {} - __forceinline vfloat(StepTy) : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - __forceinline vfloat(NaNTy) : v(_mm512_set1_ps(nan)) {} - __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); } - static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); } - - static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); } - static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); } - - static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); } - static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); } - - static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); } - static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); } - - static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) { - _mm512_stream_ps((float*)ptr,a); - } - - static __forceinline vfloat16 broadcast(const float* f) { - return _mm512_set1_ps(*f); - } - - static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &v) { - return _mm512_mask_compress_ps(v, mask, v); - } - static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &a, const vfloat16& b) { - return _mm512_mask_compress_ps(a, mask, b); - } - - static __forceinline vfloat16 expand(const vboolf16& mask, const vfloat16& a, vfloat16& b) { - return _mm512_mask_expand_ps(b, mask, a); - } - - static __forceinline vfloat16 loadu_compact(const vboolf16& mask, const void* ptr) { - return _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), mask, (float*)ptr); - } - - static __forceinline void storeu_compact(const vboolf16& mask, float *addr, const vfloat16 reg) { - _mm512_mask_compressstoreu_ps(addr, mask, reg); - } - - static __forceinline void storeu_compact_single(const vboolf16& mask, float * addr, const vfloat16& reg) { - //_mm512_mask_compressstoreu_ps(addr,mask,reg); - *addr = mm512_cvtss_f32(_mm512_mask_compress_ps(reg, mask, reg)); - } - - template - static __forceinline vfloat16 gather(const float* ptr, const vint16& index) { - return _mm512_i32gather_ps(index, ptr, scale); - } - - template - static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) { - vfloat16 r = zero; - return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale); - } - - template - static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) { - _mm512_i32scatter_ps(ptr, index, v, scale); - } - - template - static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) { - _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float& operator [](size_t index) { assert(index < 16); return f[index]; } - __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); } - __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); } - __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); } - - __forceinline vint16 toInt (const vfloat16& a) { return vint16(a); } - __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); } - - __forceinline vfloat16 operator +(const vfloat16& a) { return a; } - __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); } - - __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); } - __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); } - - __forceinline vfloat16 rcp(const vfloat16& a) { -#if defined(__AVX512ER__) - return _mm512_rcp28_ps(a); -#else - const vfloat16 r = _mm512_rcp14_ps(a); - return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); -#endif - } - - __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); } - __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); } - - __forceinline vfloat16 rsqrt(const vfloat16& a) - { -#if defined(__AVX512VL__) - const vfloat16 r = _mm512_rsqrt14_ps(a); - return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r, - _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r))); -#else - return _mm512_rsqrt28_ps(a); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); } - __forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); } - __forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; } - - __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); } - __forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); } - __forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; } - - __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); } - __forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); } - __forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; } - - __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); } - __forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); } - __forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; } - - __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); } - __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); } - __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); - } - - __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { - return _mm512_min_ps(a,b); - } - __forceinline vfloat16 min(const vfloat16& a, float b) { - return _mm512_min_ps(a,vfloat16(b)); - } - __forceinline vfloat16 min(const float& a, const vfloat16& b) { - return _mm512_min_ps(vfloat16(a),b); - } - - __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { - return _mm512_max_ps(a,b); - } - __forceinline vfloat16 max(const vfloat16& a, float b) { - return _mm512_max_ps(a,vfloat16(b)); - } - __forceinline vfloat16 max(const float& a, const vfloat16& b) { - return _mm512_max_ps(vfloat16(a),b); - } - - __forceinline vfloat16 mask_add(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { return _mm512_mask_add_ps (c,mask,a,b); } - __forceinline vfloat16 mask_min(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { - return _mm512_mask_min_ps(c,mask,a,b); - }; - __forceinline vfloat16 mask_max(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { - return _mm512_mask_max_ps(c,mask,a,b); - }; - - __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) { -#if !defined(__AVX512ER__) // SKX - const vint16 ai = _mm512_castps_si512(a); - const vint16 bi = _mm512_castps_si512(b); - const vint16 ci = _mm512_min_epi32(ai,bi); - return _mm512_castsi512_ps(ci); -#else // KNL - return min(a,b); -#endif - } - - __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) { -#if !defined(__AVX512ER__) // SKX - const vint16 ai = _mm512_castps_si512(a); - const vint16 bi = _mm512_castps_si512(b); - const vint16 ci = _mm512_max_epi32(ai,bi); - return _mm512_castsi512_ps(ci); -#else // KNL - return max(a,b); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); } - __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } - __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); } - __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); } - - __forceinline vfloat16 mask_msub(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_ps(a,mask,b,c); } - - __forceinline vfloat16 madd231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(c,b,a); } - __forceinline vfloat16 msub213 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } - __forceinline vfloat16 msub231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(c,b,a); } - __forceinline vfloat16 msubr231(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(c,b,a); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Operators with rounding - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 madd_round_down(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 madd_round_up (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 mul_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 mul_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 add_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 add_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 sub_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 sub_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 div_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 div_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 mask_msub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 mask_msub_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 mask_mul_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 mask_mul_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - __forceinline vfloat16 mask_sub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } - __forceinline vfloat16 mask_sub_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; } - __forceinline vfloat16& operator +=(vfloat16& a, float b) { return a = a + b; } - - __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; } - __forceinline vfloat16& operator -=(vfloat16& a, float b) { return a = a - b; } - - __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; } - __forceinline vfloat16& operator *=(vfloat16& a, float b) { return a = a * b; } - - __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; } - __forceinline vfloat16& operator /=(vfloat16& a, float b) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); } - __forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; } - - __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); } - __forceinline vboolf16 operator !=(float a, const vfloat16& b) { return vfloat16(a) != b; } - - __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 operator < (const vfloat16& a, float b) { return a < vfloat16(b); } - __forceinline vboolf16 operator < (float a, const vfloat16& b) { return vfloat16(a) < b; } - - __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 operator >=(const vfloat16& a, float b) { return a >= vfloat16(b); } - __forceinline vboolf16 operator >=(float a, const vfloat16& b) { return vfloat16(a) >= b; } - - __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 operator > (const vfloat16& a, float b) { return a > vfloat16(b); } - __forceinline vboolf16 operator > (float a, const vfloat16& b) { return vfloat16(a) > b; } - - __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 operator <=(const vfloat16& a, float b) { return a <= vfloat16(b); } - __forceinline vboolf16 operator <=(float a, const vfloat16& b) { return vfloat16(a) <= b; } - - __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); } - - __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) { - return _mm512_mask_blend_ps(s, f, t); - } - - __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) { - return madd(t,b-a,a); - } - - __forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b) - { - vfloat16 c = a; - a = select(m,b,a); - b = select(m,c,b); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 floor(const vfloat16& a) { - return _mm512_floor_ps(a); - } - __forceinline vfloat16 ceil (const vfloat16& a) { - return _mm512_ceil_ps(a); - } - __forceinline vfloat16 round (const vfloat16& a) { - return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - } - __forceinline vint16 floori (const vfloat16& a) { - return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); } - __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); } - - template - __forceinline vfloat16 shuffle(const vfloat16& v) { - return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); - } - - template - __forceinline vfloat16 shuffle(const vfloat16& v) { - return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template - __forceinline vfloat16 shuffle4(const vfloat16& v) { - return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i)); - } - - template - __forceinline vfloat16 shuffle4(const vfloat16& v) { - return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - __forceinline vfloat16 interleave_even(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1)); - } - - __forceinline vfloat16 interleave_odd(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1)); - } - - __forceinline vfloat16 interleave2_even(const vfloat16& a, const vfloat16& b) { - /* mask should be 8-bit but is 16-bit to reuse for interleave_even */ - return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1)); - } - - __forceinline vfloat16 interleave2_odd(const vfloat16& a, const vfloat16& b) { - /* mask should be 8-bit but is 16-bit to reuse for interleave_odd */ - return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1)); - } - - __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e)); - } - - __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e)); - } - - __forceinline vfloat16 permute(vfloat16 v, __m512i index) { - return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v))); - } - - __forceinline vfloat16 reverse(const vfloat16& v) { - return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); - } - - template - __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); - }; - - template - __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) { - return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); - }; - - __forceinline vfloat16 shift_left_1(const vfloat16& a) { - vfloat16 z = zero; - return mask_align_shift_right<15>(0xfffe,z,a,a); - } - - __forceinline vfloat16 shift_right_1(const vfloat16& x) { - return align_shift_right<1>(zero,x); - } - - __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); } - - - template __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); } - - template - vfloat extractN(const vfloat16& v); - - template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } - template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); } - template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); } - template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); } - - template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } - template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); } - - template __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); } - template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } - - template __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); } - template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Transpose - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, - vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) - { -#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL - vfloat16 a0a1_c0c1 = interleave_even(r0, r1); - vfloat16 a2a3_c2c3 = interleave_even(r2, r3); - vfloat16 b0b1_d0d1 = interleave_odd (r0, r1); - vfloat16 b2b3_d2d3 = interleave_odd (r2, r3); - - c0 = interleave2_even(a0a1_c0c1, a2a3_c2c3); - c1 = interleave2_even(b0b1_d0d1, b2b3_d2d3); - c2 = interleave2_odd (a0a1_c0c1, a2a3_c2c3); - c3 = interleave2_odd (b0b1_d0d1, b2b3_d2d3); -#else - vfloat16 a0a2_b0b2 = unpacklo(r0, r2); - vfloat16 c0c2_d0d2 = unpackhi(r0, r2); - vfloat16 a1a3_b1b3 = unpacklo(r1, r3); - vfloat16 c1c3_d1d3 = unpackhi(r1, r3); - - c0 = unpacklo(a0a2_b0b2, a1a3_b1b3); - c1 = unpackhi(a0a2_b0b2, a1a3_b1b3); - c2 = unpacklo(c0c2_d0d2, c1c3_d1d3); - c3 = unpackhi(c0c2_d0d2, c1c3_d1d3); -#endif - } - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, - const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, - const vfloat4& r8, const vfloat4& r9, const vfloat4& r10, const vfloat4& r11, - const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15, - vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) - { - return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15), - c0, c1, c2, c3); - } - - __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, - const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7, - vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, - vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) - { - vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3; - transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3); - - vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7; - transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7); - - c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); - c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); - c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); - c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); - c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); - c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); - c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); - c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); - } - - __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, - const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, - const vfloat8& r8, const vfloat8& r9, const vfloat8& r10, const vfloat8& r11, - const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15, - vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, - vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) - { - return transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11), - vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15), - c0, c1, c2, c3, c4, c5, c6, c7); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 vreduce_add2(vfloat16 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } - __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } - - __forceinline vfloat16 vreduce_min2(vfloat16 x) { return min(x, shuffle<1,0,3,2>(x)); } - __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } - __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vfloat16 vreduce_max2(vfloat16 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } - __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } - - __forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); } - __forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); } - __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); } - - __forceinline size_t select_min(const vfloat16& v) { - return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ))); - } - - __forceinline size_t select_max(const vfloat16& v) { - return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ))); - } - - __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v) - { - const vfloat16 a = select(valid,v,vfloat16(pos_inf)); - const vbool16 valid_min = valid & (a == vreduce_min(a)); - return bsf(movemask(any(valid_min) ? valid_min : valid)); - } - - __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v) - { - const vfloat16 a = select(valid,v,vfloat16(neg_inf)); - const vbool16 valid_max = valid & (a == vreduce_max(a)); - return bsf(movemask(any(valid_max) ? valid_max : valid)); - } - - __forceinline vfloat16 prefix_sum(const vfloat16& a) - { - const vfloat16 z(zero); - vfloat16 v = a; - v = v + align_shift_right<16-1>(v,z); - v = v + align_shift_right<16-2>(v,z); - v = v + align_shift_right<16-4>(v,z); - v = v + align_shift_right<16-8>(v,z); - return v; - } - - __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a) - { - const vfloat16 z(zero); - vfloat16 v = a; - v = v + align_shift_right<1>(z,v); - v = v + align_shift_right<2>(z,v); - v = v + align_shift_right<4>(z,v); - v = v + align_shift_right<8>(z,v); - return v; - } - - __forceinline vfloat16 prefix_min(const vfloat16& a) - { - const vfloat16 z(pos_inf); - vfloat16 v = a; - v = min(v,align_shift_right<16-1>(v,z)); - v = min(v,align_shift_right<16-2>(v,z)); - v = min(v,align_shift_right<16-4>(v,z)); - v = min(v,align_shift_right<16-8>(v,z)); - return v; - } - - __forceinline vfloat16 prefix_max(const vfloat16& a) - { - const vfloat16 z(neg_inf); - vfloat16 v = a; - v = max(v,align_shift_right<16-1>(v,z)); - v = max(v,align_shift_right<16-2>(v,z)); - v = max(v,align_shift_right<16-4>(v,z)); - v = max(v,align_shift_right<16-8>(v,z)); - return v; - } - - - __forceinline vfloat16 reverse_prefix_min(const vfloat16& a) - { - const vfloat16 z(pos_inf); - vfloat16 v = a; - v = min(v,align_shift_right<1>(z,v)); - v = min(v,align_shift_right<2>(z,v)); - v = min(v,align_shift_right<4>(z,v)); - v = min(v,align_shift_right<8>(z,v)); - return v; - } - - __forceinline vfloat16 reverse_prefix_max(const vfloat16& a) - { - const vfloat16 z(neg_inf); - vfloat16 v = a; - v = max(v,align_shift_right<1>(z,v)); - v = max(v,align_shift_right<2>(z,v)); - v = max(v,align_shift_right<4>(z,v)); - v = max(v,align_shift_right<8>(z,v)); - return v; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat16 loadAOS4to16f(const float& x, const float& y, const float& z) - { - vfloat16 f = zero; - f = select(0x1111,vfloat16::broadcast(&x),f); - f = select(0x2222,vfloat16::broadcast(&y),f); - f = select(0x4444,vfloat16::broadcast(&z),f); - return f; - } - - __forceinline vfloat16 loadAOS4to16f(unsigned int index, - const vfloat16& x, - const vfloat16& y, - const vfloat16& z) - { - vfloat16 f = zero; - f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); - f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); - f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); - return f; - } - - __forceinline vfloat16 loadAOS4to16f(unsigned int index, - const vfloat16& x, - const vfloat16& y, - const vfloat16& z, - const vfloat16& fill) - { - vfloat16 f = fill; - f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); - f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); - f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); - return f; - } - - __forceinline vfloat16 rcp_safe(const vfloat16& a) { - return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v) - { - cout << "<" << v[0]; - for (int i=1; i<16; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h deleted file mode 100644 index 5732c0fbc8..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h +++ /dev/null @@ -1,925 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide SSE float type */ - template<> - struct vfloat<4> - { - ALIGNED_STRUCT_(16); - - typedef vboolf4 Bool; - typedef vint4 Int; - typedef vfloat4 Float; - - enum { size = 4 }; // number of SIMD elements - union { __m128 v; float f[4]; int i[4]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat() {} - __forceinline vfloat(const vfloat4& other) { v = other.v; } - __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; } - - __forceinline vfloat(__m128 a) : v(a) {} - __forceinline operator const __m128&() const { return v; } - __forceinline operator __m128&() { return v; } - - __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} - __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} - - __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} -#if defined(__aarch64__) - __forceinline explicit vfloat(const vuint4& x) { - v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v)); - } -#else - __forceinline explicit vfloat(const vuint4& x) { - const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); - const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 - const __m128 af = _mm_cvtepi32_ps(a); - const __m128 bf = _mm_castsi128_ps(b); - v = _mm_add_ps(af,bf); - } -#endif - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat(ZeroTy) : v(_mm_setzero_ps()) {} - __forceinline vfloat(OneTy) : v(_mm_set1_ps(1.0f)) {} - __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {} - __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {} - __forceinline vfloat(StepTy) : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} - __forceinline vfloat(NaNTy) : v(_mm_set1_ps(nan)) {} - __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); } - static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } - - static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); } - static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } - -#if defined(__AVX512VL__) - - static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &v) { - return _mm_mask_compress_ps(v, mask, v); - } - static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &a, const vfloat4& b) { - return _mm_mask_compress_ps(a, mask, b); - } - - static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } - static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); } -#elif defined(__AVX__) - static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } - static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } -#else - static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); } - static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); } -#endif - -#if defined(__AVX__) - static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); } -#else - static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); } -#endif - - static __forceinline vfloat4 load_nt (const float* ptr) { -#if defined (__SSE4_1__) - return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr)); -#else - return _mm_load_ps(ptr); -#endif - } - -#if defined(__aarch64__) - static __forceinline vfloat4 load(const int8_t* ptr) { - return __m128(_mm_load4epi8_f32(((__m128i*)ptr))); - } -#elif defined(__SSE4_1__) - static __forceinline vfloat4 load(const int8_t* ptr) { - return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); - } -#else - static __forceinline vfloat4 load(const int8_t* ptr) { - return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); - } -#endif - -#if defined(__aarch64__) - static __forceinline vfloat4 load(const uint8_t* ptr) { - return __m128(_mm_load4epu8_f32(((__m128i*)ptr))); - } -#elif defined(__SSE4_1__) - static __forceinline vfloat4 load(const uint8_t* ptr) { - return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); - } -#else - static __forceinline vfloat4 load(const uint8_t* ptr) { - //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions - return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); - } -#endif - -#if defined(__aarch64__) - static __forceinline vfloat4 load(const short* ptr) { - return __m128(_mm_load4epi16_f32(((__m128i*)ptr))); - } -#elif defined(__SSE4_1__) - static __forceinline vfloat4 load(const short* ptr) { - return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); - } -#else - static __forceinline vfloat4 load(const short* ptr) { - return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); - } -#endif - - static __forceinline vfloat4 load(const unsigned short* ptr) { - return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); - } - - static __forceinline void store_nt(void* ptr, const vfloat4& v) - { -#if defined (__SSE4_1__) -#if defined(__aarch64__) - _mm_stream_ps((float*)ptr,vreinterpretq_s32_f32(v.v)); -#else - _mm_stream_ps((float*)ptr,v); -#endif -#else - _mm_store_ps((float*)ptr,v); -#endif - } - - template - static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm_i32gather_ps(ptr, index, scale); -#else - return vfloat4( - *(float*)(((int8_t*)ptr)+scale*index[0]), - *(float*)(((int8_t*)ptr)+scale*index[1]), - *(float*)(((int8_t*)ptr)+scale*index[2]), - *(float*)(((int8_t*)ptr)+scale*index[3])); -#endif - } - - template - static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { - vfloat4 r = zero; -#if defined(__AVX512VL__) - return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); -#else - if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); - return r; -#endif - } - - template - static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v) - { -#if defined(__AVX512VL__) - _mm_i32scatter_ps((float*)ptr, index, v, scale); -#else - *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; - *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; - *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; - *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; -#endif - } - - template - static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) - { -#if defined(__AVX512VL__) - _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); -#else - if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; - if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; - if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; - if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; -#endif - } - - static __forceinline void store(const vboolf4& mask, int8_t* ptr, const vint4& ofs, const vfloat4& v) { - scatter<1>(mask,ptr,ofs,v); - } - static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { - scatter<4>(mask,ptr,ofs,v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; } - __forceinline float& operator [](size_t index) { assert(index < 4); return f[index]; } - - friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { -#if defined(__AVX512VL__) - return _mm_mask_blend_ps(m, f, t); -#elif defined(__SSE4_1__) || (defined(__aarch64__)) - return _mm_blendv_ps(f, t, m); -#else - return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); -#endif - } - }; - - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a); } - __forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a); } - __forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a); } - - __forceinline vint4 toInt (const vfloat4& a) { return vint4(a); } - __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } - - __forceinline vfloat4 operator +(const vfloat4& a) { return a; } -#if defined(__aarch64__) - __forceinline vfloat4 operator -(const vfloat4& a) { - return vnegq_f32(a); - } -#else - __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } -#endif - -#if defined(__aarch64__) - __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); } -#else - __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } -#endif - -#if defined(__AVX512VL__) - __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } -#else - __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } -#endif - -#if defined(__aarch64__) - __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a, vreinterpretq_f32_u32(v0x80000000)); } -#else - __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } -#endif - - __forceinline vfloat4 rcp(const vfloat4& a) - { -#if defined(__aarch64__) -#if defined(BUILD_IOS) - return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v)); -#else //BUILD_IOS - __m128 reciprocal = _mm_rcp_ps(a); - reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - // +1 round since NEON's reciprocal estimate instruction has less accuracy than SSE2's rcp. - reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - return (const vfloat4)reciprocal; -#endif // BUILD_IOS -#else - -#if defined(__AVX512VL__) - const vfloat4 r = _mm_rcp14_ps(a); -#else - const vfloat4 r = _mm_rcp_ps(a); -#endif - -#if defined(__AVX2__) - return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); -#else - return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); -#endif - -#endif //defined(__aarch64__) - } - __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } - __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } - - __forceinline vfloat4 rsqrt(const vfloat4& a) - { -#if defined(__aarch64__) - vfloat4 r = _mm_rsqrt_ps(a); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); - return r; -#else - -#if defined(__AVX512VL__) - const vfloat4 r = _mm_rsqrt14_ps(a); -#else - const vfloat4 r = _mm_rsqrt_ps(a); -#endif - -#if defined(__AVX2__) - return _mm_fmadd_ps(_mm_set1_ps(1.5f), r, - _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); -#else - return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), - _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); -#endif - -#endif - } - - __forceinline vboolf4 isnan(const vfloat4& a) { -#if defined(__aarch64__) - const vfloat4 b = _mm_and_ps(a, vreinterpretq_f32_u32(v0x7fffffff)); -#else - const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); -#endif -#if defined(__AVX512VL__) - return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); -#else - return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000))); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); } - __forceinline vfloat4 operator +(const vfloat4& a, float b) { return a + vfloat4(b); } - __forceinline vfloat4 operator +(float a, const vfloat4& b) { return vfloat4(a) + b; } - - __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); } - __forceinline vfloat4 operator -(const vfloat4& a, float b) { return a - vfloat4(b); } - __forceinline vfloat4 operator -(float a, const vfloat4& b) { return vfloat4(a) - b; } - - __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); } - __forceinline vfloat4 operator *(const vfloat4& a, float b) { return a * vfloat4(b); } - __forceinline vfloat4 operator *(float a, const vfloat4& b) { return vfloat4(a) * b; } - - __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); } - __forceinline vfloat4 operator /(const vfloat4& a, float b) { return a/vfloat4(b); } - __forceinline vfloat4 operator /(float a, const vfloat4& b) { return vfloat4(a)/b; } - - __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); } - __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); } - __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); } - __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); } - - __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); } - __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a,vfloat4(b)); } - __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); } - - __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); } - __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } - __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } - -#if defined(__SSE4_1__) || defined(__aarch64__) - - __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_min_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } - - __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_max_epi32(ai,bi); - return _mm_castsi128_ps(ci); - } - - __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_min_epu32(ai,bi); - return _mm_castsi128_ps(ci); - } - - __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_max_epu32(ai,bi); - return _mm_castsi128_ps(ci); - } -#else - __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { - return min(a,b); - } - - __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { - return max(a,b); - } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); } - __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); } - __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } - __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } -#else - -#if defined(__aarch64__) - __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { - return _mm_madd_ps(a, b, c); //a*b+c; - } - __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { - return _mm_msub_ps(a, b, c); //-a*b+c; - } - __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { - return vnegq_f32(vfmaq_f32(c,a, b)); - } -#else - __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } - __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} - __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } -#endif - __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } - -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; } - __forceinline vfloat4& operator +=(vfloat4& a, float b) { return a = a + b; } - - __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; } - __forceinline vfloat4& operator -=(vfloat4& a, float b) { return a = a - b; } - - __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; } - __forceinline vfloat4& operator *=(vfloat4& a, float b) { return a = a * b; } - - __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; } - __forceinline vfloat4& operator /=(vfloat4& a, float b) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); } - __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); } - __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); } - __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); } - __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); } -#else - __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } - __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } - __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } -#if defined(__aarch64__) - __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); } - __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); } -#else - __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } - __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } -#endif - __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } -#endif - - __forceinline vboolf4 operator ==(const vfloat4& a, float b) { return a == vfloat4(b); } - __forceinline vboolf4 operator ==(float a, const vfloat4& b) { return vfloat4(a) == b; } - - __forceinline vboolf4 operator !=(const vfloat4& a, float b) { return a != vfloat4(b); } - __forceinline vboolf4 operator !=(float a, const vfloat4& b) { return vfloat4(a) != b; } - - __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); } - __forceinline vboolf4 operator < (float a, const vfloat4& b) { return vfloat4(a) < b; } - - __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); } - __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; } - - __forceinline vboolf4 operator > (const vfloat4& a, float b) { return a > vfloat4(b); } - __forceinline vboolf4 operator > (float a, const vfloat4& b) { return vfloat4(a) > b; } - - __forceinline vboolf4 operator <=(const vfloat4& a, float b) { return a <= vfloat4(b); } - __forceinline vboolf4 operator <=(float a, const vfloat4& b) { return vfloat4(a) <= b; } - - __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; } - __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; } - __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a < b; } - __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; } - __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a > b; } - __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); } - __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); } - __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a < b); } - __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); } - __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a > b); } - __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); } -#endif - - template - __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) - { -#if defined(__SSE4_1__) - return _mm_blend_ps(f, t, mask); -#else - return select(vboolf4(mask), t, f); -#endif - } - -#if defined(__aarch64__) - template<> __forceinline vfloat4 select<0>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vzero)); - } - template<> __forceinline vfloat4 select<1>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v000F)); - } - template<> __forceinline vfloat4 select<2>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00F0)); - } - template<> __forceinline vfloat4 select<3>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00FF)); - } - template<> __forceinline vfloat4 select<4>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F00)); - } - template<> __forceinline vfloat4 select<5>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F0F)); - } - template<> __forceinline vfloat4 select<6>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FF0)); - } - template<> __forceinline vfloat4 select<7>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FFF)); - } - template<> __forceinline vfloat4 select<8>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF000)); - } - template<> __forceinline vfloat4 select<9>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF00F)); - } - template<> __forceinline vfloat4 select<10>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0F0)); - } - template<> __forceinline vfloat4 select<11>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0FF)); - } - template<> __forceinline vfloat4 select<12>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF00)); - } - template<> __forceinline vfloat4 select<13>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF0F)); - } - template<> __forceinline vfloat4 select<14>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFF0)); - } - template<> __forceinline vfloat4 select<15>(const vfloat4& t, const vfloat4& f) { - return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFFF)); - } -#endif - - __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { - return madd(t,b-a,a); - } - - __forceinline bool isvalid(const vfloat4& v) { - return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE))); - } - - __forceinline bool is_finite(const vfloat4& a) { - return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); - } - - __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { - return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) - __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf - __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf - __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0 - __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn? -#elif defined (__SSE4_1__) - __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } - __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } - __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } - __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // (even) https://www.felixcloutier.com/x86/roundpd -#else - __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); } - __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); } - __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); } - __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); } -#endif - __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } - - __forceinline vint4 floori(const vfloat4& a) { -#if defined(__aarch64__) - return vcvtq_s32_f32(floor(a)); -#elif defined(__SSE4_1__) - return vint4(floor(a)); -#else - return vint4(a-vfloat4(0.5f)); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } - __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } - -#if defined(__aarch64__) - template - __forceinline vfloat4 shuffle(const vfloat4& v) { - return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); - } - template - __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { - return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); - } -#else - template - __forceinline vfloat4 shuffle(const vfloat4& v) { - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { - return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); - } -#endif - -#if defined (__SSSE3__) - __forceinline vfloat4 shuffle8(const vfloat4& a, const vint4& shuf) { - return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); - } -#endif - -#if defined(__aarch64__) - template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0022 )); } - template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v1133)); } - template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0101)); } -#elif defined(__SSE3__) - template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } - template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } - template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } -#endif - - template - __forceinline vfloat4 shuffle(const vfloat4& v) { - return shuffle(v); - } - -#if defined(__aarch64__) - template __forceinline float extract(const vfloat4& a); - template<> __forceinline float extract<0>(const vfloat4& b) { - return b[0]; - } - template<> __forceinline float extract<1>(const vfloat4& b) { - return b[1]; - } - template<> __forceinline float extract<2>(const vfloat4& b) { - return b[2]; - } - template<> __forceinline float extract<3>(const vfloat4& b) { - return b[3]; - } -#elif defined (__SSE4_1__) && !defined(__GNUC__) - template __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); } - template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } -#else - template __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(shuffle(a)); } - template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } -#endif - - -#if defined(__aarch64__) - template __forceinline vfloat4 insert(const vfloat4& a, float b); - template<> __forceinline vfloat4 insert<0>(const vfloat4& a, float b) - { - vfloat4 c = a; - c[0] = b; - return c; - } - template<> __forceinline vfloat4 insert<1>(const vfloat4& a, float b) - { - vfloat4 c = a; - c[1] = b; - return c; - } - template<> __forceinline vfloat4 insert<2>(const vfloat4& a, float b) - { - vfloat4 c = a; - c[2] = b; - return c; - } - template<> __forceinline vfloat4 insert<3>(const vfloat4& a, float b) - { - vfloat4 c = a; - c[3] = b; - return c; - } -#elif defined (__SSE4_1__) - template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } - template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert(a, b); } - template __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert(a, _mm_set_ss(b)); } -#else - template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; } - template __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; } -#endif - -#if defined(__aarch64__) - __forceinline float toScalar(const vfloat4& v) { - return v[0]; - } -#else - __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); } -#endif - __forceinline vfloat4 broadcast4f(const vfloat4& a, size_t k) { - return vfloat4::broadcast(&a[k]); - } - - __forceinline vfloat4 shift_right_1(const vfloat4& x) { - return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); - } - -#if defined (__AVX2__) - __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) { - return _mm_permutevar_ps(a,index); - } - - __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); } - -#endif - -#if defined(__AVX512VL__) - template - __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) { - return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i)); - } -#endif - - - //////////////////////////////////////////////////////////////////////////////// - /// Sorting Network - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat4 sort_ascending(const vfloat4& v) - { - const vfloat4 a0 = v; - const vfloat4 b0 = shuffle<1,0,3,2>(a0); - const vfloat4 c0 = min(a0,b0); - const vfloat4 d0 = max(a0,b0); - const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vfloat4 b1 = shuffle<2,3,0,1>(a1); - const vfloat4 c1 = min(a1,b1); - const vfloat4 d1 = max(a1,b1); - const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vfloat4 b2 = shuffle<0,2,1,3>(a2); - const vfloat4 c2 = min(a2,b2); - const vfloat4 d2 = max(a2,b2); - const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3; - } - - __forceinline vfloat4 sort_descending(const vfloat4& v) - { - const vfloat4 a0 = v; - const vfloat4 b0 = shuffle<1,0,3,2>(a0); - const vfloat4 c0 = max(a0,b0); - const vfloat4 d0 = min(a0,b0); - const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vfloat4 b1 = shuffle<2,3,0,1>(a1); - const vfloat4 c1 = max(a1,b1); - const vfloat4 d1 = min(a1,b1); - const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vfloat4 b2 = shuffle<0,2,1,3>(a2); - const vfloat4 c2 = max(a2,b2); - const vfloat4 d2 = min(a2,b2); - const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Transpose - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3) - { - vfloat4 l02 = unpacklo(r0,r2); - vfloat4 h02 = unpackhi(r0,r2); - vfloat4 l13 = unpacklo(r1,r3); - vfloat4 h13 = unpackhi(r1,r3); - c0 = unpacklo(l02,l13); - c1 = unpackhi(l02,l13); - c2 = unpacklo(h02,h13); - c3 = unpackhi(h02,h13); - } - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2) - { - vfloat4 l02 = unpacklo(r0,r2); - vfloat4 h02 = unpackhi(r0,r2); - vfloat4 l13 = unpacklo(r1,r3); - vfloat4 h13 = unpackhi(r1,r3); - c0 = unpacklo(l02,l13); - c1 = unpackhi(l02,l13); - c2 = unpacklo(h02,h13); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) - __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); } - __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); } - __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); } -#else - __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } - __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } - __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } -#endif - -#if defined(__aarch64__) - __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); } - __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); } - __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); } -#else - __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } - __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } - __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } -#endif - - __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) - { - const vfloat4 a = select(valid,v,vfloat4(pos_inf)); - const vbool4 valid_min = valid & (a == vreduce_min(a)); - return bsf(movemask(any(valid_min) ? valid_min : valid)); - } - __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v) - { - const vfloat4 a = select(valid,v,vfloat4(neg_inf)); - const vbool4 valid_max = valid & (a == vreduce_max(a)); - return bsf(movemask(any(valid_max) ? valid_max : valid)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline float dot(const vfloat4& a, const vfloat4& b) { - return reduce_add(a*b); - } - - __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b) - { - const vfloat4 a0 = a; - const vfloat4 b0 = shuffle<1,2,0,3>(b); - const vfloat4 a1 = shuffle<1,2,0,3>(a); - const vfloat4 b1 = b; - return shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; - } - -} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h deleted file mode 100644 index 3c7e4a8cdc..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h +++ /dev/null @@ -1,847 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX float type */ - template<> - struct vfloat<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { __m256 v; float f[8]; int i[8]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat() {} - __forceinline vfloat(const vfloat8& other) { v = other.v; } - __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; } - - __forceinline vfloat(__m256 a) : v(a) {} - __forceinline operator const __m256&() const { return v; } - __forceinline operator __m256&() { return v; } - - __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} - __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} - - __forceinline explicit vfloat(const int8_t* a) : v(_mm256_loadu_ps((const float*)a)) {} - __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {} - __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {} - __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {} - __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {} - - __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat(ZeroTy) : v(_mm256_setzero_ps()) {} - __forceinline vfloat(OneTy) : v(_mm256_set1_ps(1.0f)) {} - __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {} - __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {} - __forceinline vfloat(StepTy) : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {} - __forceinline vfloat(NaNTy) : v(_mm256_set1_ps(nan)) {} - __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vfloat8 broadcast(const void* a) { - return _mm256_broadcast_ss((float*)a); - } - - static __forceinline vfloat8 broadcast2(const float* a, const float* b) { -#if defined(__INTEL_COMPILER) - const vfloat8 v0 = _mm256_broadcast_ss(a); - const vfloat8 v1 = _mm256_broadcast_ss(b); - return _mm256_blend_ps(v1, v0, 0xf); -#else - return _mm256_set_ps(*b,*b,*b,*b,*a,*a,*a,*a); -#endif - } - - static __forceinline vfloat8 broadcast4f(const vfloat4* ptr) { - return _mm256_broadcast_ps((__m128*)ptr); - } - - static __forceinline vfloat8 load(const int8_t* ptr) { -#if defined(__AVX2__) - return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); -#else - return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); -#endif - } - - static __forceinline vfloat8 load(const uint8_t* ptr) { -#if defined(__AVX2__) - return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); -#else - return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); -#endif - } - - static __forceinline vfloat8 load(const short* ptr) { -#if defined(__AVX2__) - return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); -#else - return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); -#endif - } - - static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); } - static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); } - - static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); } - static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); } - -#if defined(__AVX512VL__) - - static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &v) { - return _mm256_mask_compress_ps(v, mask, v); - } - static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &a, const vfloat8& b) { - return _mm256_mask_compress_ps(a, mask, b); - } - - static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); } - static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); } -#elif defined(__aarch64__) - static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } - static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } -#else - static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } - static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } -#endif - -#if defined(__AVX2__) - static __forceinline vfloat8 load_nt(void* ptr) { - return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr)); - } -#endif - - static __forceinline void store_nt(void* ptr, const vfloat8& v) { - _mm256_stream_ps((float*)ptr,v); - } - - template - static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm256_i32gather_ps(ptr, index ,scale); -#else - return vfloat8( - *(float*)(((int8_t*)ptr)+scale*index[0]), - *(float*)(((int8_t*)ptr)+scale*index[1]), - *(float*)(((int8_t*)ptr)+scale*index[2]), - *(float*)(((int8_t*)ptr)+scale*index[3]), - *(float*)(((int8_t*)ptr)+scale*index[4]), - *(float*)(((int8_t*)ptr)+scale*index[5]), - *(float*)(((int8_t*)ptr)+scale*index[6]), - *(float*)(((int8_t*)ptr)+scale*index[7])); -#endif - } - - template - static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) { - vfloat8 r = zero; -#if defined(__AVX512VL__) - return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale); -#else - if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); - if (likely(mask[4])) r[4] = *(float*)(((int8_t*)ptr)+scale*index[4]); - if (likely(mask[5])) r[5] = *(float*)(((int8_t*)ptr)+scale*index[5]); - if (likely(mask[6])) r[6] = *(float*)(((int8_t*)ptr)+scale*index[6]); - if (likely(mask[7])) r[7] = *(float*)(((int8_t*)ptr)+scale*index[7]); - return r; - #endif - } - - template - static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v) - { -#if defined(__AVX512VL__) - _mm256_i32scatter_ps((float*)ptr, ofs, v, scale); -#else - *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - template - static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v) - { -#if defined(__AVX512VL__) - _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale); -#else - if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - static __forceinline void store(const vboolf8& mask, int8_t* ptr, const vint8& ofs, const vfloat8& v) { - scatter<1>(mask,ptr,ofs,v); - } - static __forceinline void store(const vboolf8& mask, float* ptr, const vint8& ofs, const vfloat8& v) { - scatter<4>(mask,ptr,ofs,v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; } - __forceinline float& operator [](size_t index) { assert(index < 8); return f[index]; } - }; - - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a); } - __forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a); } - - __forceinline vint8 toInt (const vfloat8& a) { return vint8(a); } - __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); } - - __forceinline vfloat8 operator +(const vfloat8& a) { return a; } -#if !defined(__aarch64__) - __forceinline vfloat8 operator -(const vfloat8& a) { - const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - return _mm256_xor_ps(a, mask); - } -#else - __forceinline vfloat8 operator -(const vfloat8& a) { - __m256 res; - res.lo = vnegq_f32(a.v.lo); - res.hi = vnegq_f32(a.v.hi); - return res; -} -#endif - -#if !defined(__aarch64__) -__forceinline vfloat8 abs(const vfloat8& a) { - const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); - return _mm256_and_ps(a, mask); -} -#else -__forceinline vfloat8 abs(const vfloat8& a) { - __m256 res; - res.lo = vabsq_f32(a.v.lo); - res.hi = vabsq_f32(a.v.hi); - return res; -} -#endif - -#if !defined(__aarch64__) - __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); } -#else - __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); } -#endif - __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } - - - static __forceinline vfloat8 rcp(const vfloat8& a) - { -#if defined(BUILD_IOS) && defined(__aarch64__) - // ios devices are faster doing full divide, no need for NR fixup - vfloat8 ret; - const float32x4_t one = vdupq_n_f32(1.0f); - ret.v.lo = vdivq_f32(one, a.v.lo); - ret.v.hi = vdivq_f32(one, a.v.hi); - return ret; -#endif - -#if defined(__AVX512VL__) - const vfloat8 r = _mm256_rcp14_ps(a); -#else - const vfloat8 r = _mm256_rcp_ps(a); -#endif - -#if defined(__AVX2__) //&& !defined(aarch64) - return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f))); -#else - return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a))); -#endif - } - __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } - __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); } - - static __forceinline vfloat8 rsqrt(const vfloat8& a) - { -#if defined(__AVX512VL__) - const vfloat8 r = _mm256_rsqrt14_ps(a); -#else - const vfloat8 r = _mm256_rsqrt_ps(a); -#endif - -#if defined(__AVX2__) - return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r, - _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); -#else - return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), - _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); } - __forceinline vfloat8 operator +(const vfloat8& a, float b) { return a + vfloat8(b); } - __forceinline vfloat8 operator +(float a, const vfloat8& b) { return vfloat8(a) + b; } - - __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); } - __forceinline vfloat8 operator -(const vfloat8& a, float b) { return a - vfloat8(b); } - __forceinline vfloat8 operator -(float a, const vfloat8& b) { return vfloat8(a) - b; } - - __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); } - __forceinline vfloat8 operator *(const vfloat8& a, float b) { return a * vfloat8(b); } - __forceinline vfloat8 operator *(float a, const vfloat8& b) { return vfloat8(a) * b; } - - __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); } - __forceinline vfloat8 operator /(const vfloat8& a, float b) { return a / vfloat8(b); } - __forceinline vfloat8 operator /(float a, const vfloat8& b) { return vfloat8(a) / b; } - - __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); } - __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); } - __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); } - __forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); } - - __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); } - __forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a, vfloat8(b)); } - __forceinline vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); } - - __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); } - __forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a, vfloat8(b)); } - __forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); } - - /* need "static __forceinline for MSVC, otherwise we'll link the wrong version in debug mode */ -#if defined(__AVX2__) - - static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_min_epi32(ai,bi); - return _mm256_castsi256_ps(ci); - } - - static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_max_epi32(ai,bi); - return _mm256_castsi256_ps(ci); - } - - static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_min_epu32(ai,bi); - return _mm256_castsi256_ps(ci); - } - - static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_max_epu32(ai,bi); - return _mm256_castsi256_ps(ci); - } - -#else - - static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { - return asFloat(min(asInt(a),asInt(b))); - } - - static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { - return asFloat(max(asInt(a),asInt(b))); - } - -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Ternary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX2__) - static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); } - static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); } - static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); } - static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); } -#else - static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; } - static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; } - static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;} - static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; } - __forceinline vfloat8& operator +=(vfloat8& a, float b) { return a = a + b; } - - __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; } - __forceinline vfloat8& operator -=(vfloat8& a, float b) { return a = a - b; } - - __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; } - __forceinline vfloat8& operator *=(vfloat8& a, float b) { return a = a * b; } - - __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; } - __forceinline vfloat8& operator /=(vfloat8& a, float b) { return a = a / b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } - static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); } - static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); } - static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); } - static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); } - static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); } - - static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { - return _mm256_mask_blend_ps(m, f, t); - } -#elif !defined(__aarch64__) - __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } - __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } - __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } - __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } - __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } - __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } - - __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { - return _mm256_blendv_ps(f, t, m); - } -#else - __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); } - __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); } - __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); } - __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); } - __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); } - __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); } - - __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { - return _mm256_blendv_ps(f, t, m); - } - -#endif - - template - __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) { - return _mm256_blend_ps(f, t, mask); - } - - __forceinline vboolf8 operator ==(const vfloat8& a, const float& b) { return a == vfloat8(b); } - __forceinline vboolf8 operator ==(const float& a, const vfloat8& b) { return vfloat8(a) == b; } - - __forceinline vboolf8 operator !=(const vfloat8& a, const float& b) { return a != vfloat8(b); } - __forceinline vboolf8 operator !=(const float& a, const vfloat8& b) { return vfloat8(a) != b; } - - __forceinline vboolf8 operator < (const vfloat8& a, const float& b) { return a < vfloat8(b); } - __forceinline vboolf8 operator < (const float& a, const vfloat8& b) { return vfloat8(a) < b; } - - __forceinline vboolf8 operator >=(const vfloat8& a, const float& b) { return a >= vfloat8(b); } - __forceinline vboolf8 operator >=(const float& a, const vfloat8& b) { return vfloat8(a) >= b; } - - __forceinline vboolf8 operator > (const vfloat8& a, const float& b) { return a > vfloat8(b); } - __forceinline vboolf8 operator > (const float& a, const vfloat8& b) { return vfloat8(a) > b; } - - __forceinline vboolf8 operator <=(const vfloat8& a, const float& b) { return a <= vfloat8(b); } - __forceinline vboolf8 operator <=(const float& a, const vfloat8& b) { return vfloat8(a) <= b; } - - __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; } - __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; } - __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a < b; } - __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; } - __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a > b; } - __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; } - -#if defined(__AVX512VL__) - static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } - static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } - static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } - static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } - static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } - static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } -#else - static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); } - static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); } - static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a < b); } - static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); } - static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a > b); } - static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); } -#endif - - __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) { - return madd(t,b-a,a); - } - - __forceinline bool isvalid (const vfloat8& v) { - return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE))); - } - - __forceinline bool is_finite (const vfloat8& a) { - return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); - } - - __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) { - return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Rounding Functions - //////////////////////////////////////////////////////////////////////////////// - -#if !defined(__aarch64__) - __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); } - __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); } - __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); } - __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } -#else - __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); } - __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); } -#endif - - - __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } - __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } - - template - __forceinline vfloat8 shuffle(const vfloat8& v) { - return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); - } - - template - __forceinline vfloat8 shuffle4(const vfloat8& v) { - return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) { - return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vfloat8 shuffle(const vfloat8& v) { - return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template - __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) { - return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); - } - -#if !defined(__aarch64__) - template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } - template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } - template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } -#endif - - __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } - template __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } - template __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } - template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a); } - - __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); } - - __forceinline vfloat8 assign(const vfloat4& a) { return _mm256_castps128_ps256(a); } - -#if defined (__AVX2__) && !defined(__aarch64__) - __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { - return _mm256_permutevar8x32_ps(a, index); - } -#endif - -#if defined(__AVX512VL__) - template - static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) { - return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i)); - } -#endif - -#if defined (__AVX_I__) - template - static __forceinline vint4 convert_to_hf16(const vfloat8& a) { - return _mm256_cvtps_ph(a, mode); - } - - static __forceinline vfloat8 convert_from_hf16(const vint4& a) { - return _mm256_cvtph_ps(a); - } -#endif - - __forceinline vfloat4 broadcast4f(const vfloat8& a, const size_t k) { - return vfloat4::broadcast(&a[k]); - } - - __forceinline vfloat8 broadcast8f(const vfloat8& a, const size_t k) { - return vfloat8::broadcast(&a[k]); - } - -#if defined(__AVX512VL__) - static __forceinline vfloat8 shift_right_1(const vfloat8& x) { - return align_shift_right<1>(zero,x); - } -#else - static __forceinline vfloat8 shift_right_1(const vfloat8& x) { - const vfloat8 t0 = shuffle<1,2,3,0>(x); - const vfloat8 t1 = shuffle4<1,0>(t0); - return _mm256_blend_ps(t0,t1,0x88); - } -#endif - - __forceinline vint8 floori(const vfloat8& a) { - return vint8(floor(a)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Transpose - //////////////////////////////////////////////////////////////////////////////// - - __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) - { - vfloat8 l02 = unpacklo(r0,r2); - vfloat8 h02 = unpackhi(r0,r2); - vfloat8 l13 = unpacklo(r1,r3); - vfloat8 h13 = unpackhi(r1,r3); - c0 = unpacklo(l02,l13); - c1 = unpackhi(l02,l13); - c2 = unpacklo(h02,h13); - c3 = unpackhi(h02,h13); - } - - __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2) - { - vfloat8 l02 = unpacklo(r0,r2); - vfloat8 h02 = unpackhi(r0,r2); - vfloat8 l13 = unpacklo(r1,r3); - vfloat8 h13 = unpackhi(r1,r3); - c0 = unpacklo(l02,l13); - c1 = unpackhi(l02,l13); - c2 = unpacklo(h02,h13); - } - - __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, - vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7) - { - vfloat8 h0,h1,h2,h3; transpose(r0,r1,r2,r3,h0,h1,h2,h3); - vfloat8 h4,h5,h6,h7; transpose(r4,r5,r6,r7,h4,h5,h6,h7); - c0 = shuffle4<0,2>(h0,h4); - c1 = shuffle4<0,2>(h1,h5); - c2 = shuffle4<0,2>(h2,h6); - c3 = shuffle4<0,2>(h3,h7); - c4 = shuffle4<1,3>(h0,h4); - c5 = shuffle4<1,3>(h1,h5); - c6 = shuffle4<1,3>(h2,h6); - c7 = shuffle4<1,3>(h3,h7); - } - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, - vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) - { - transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3); - } - - __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, - vfloat8& c0, vfloat8& c1, vfloat8& c2) - { - transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// -#if !defined(__aarch64__) - __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } - __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); } - __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } - __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); } - __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); } -#else - __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); } - __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); } - __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); } - __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); } - __forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); } - -#endif - __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) - { - const vfloat8 a = select(valid,v,vfloat8(pos_inf)); - const vbool8 valid_min = valid & (a == vreduce_min(a)); - return bsf(movemask(any(valid_min) ? valid_min : valid)); - } - - __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v) - { - const vfloat8 a = select(valid,v,vfloat8(neg_inf)); - const vbool8 valid_max = valid & (a == vreduce_max(a)); - return bsf(movemask(any(valid_max) ? valid_max : valid)); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators (pairs of Vec3fa's) - //////////////////////////////////////////////////////////////////////////////// - - //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { - // return vreduce_add4(a*b); - //} - - __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { - return _mm256_dp_ps(a,b,0x7F); - } - - __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b) - { - const vfloat8 a0 = a; - const vfloat8 b0 = shuffle<1,2,0,3>(b); - const vfloat8 a1 = shuffle<1,2,0,3>(a); - const vfloat8 b1 = b; - return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); - } - - //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); } - //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); } - //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); } - //__forceinline float length (const vfloat<8>& a) { return sqrt(dot(a,a)); } - __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); } - //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); } - //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); } - //__forceinline float area (const vfloat<8>& d) { return 2.0f*halfArea(d); } - //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; } - - //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) { - // const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); - //} - - //////////////////////////////////////////////////////////////////////////////// - /// In Register Sorting - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vfloat8 sort_ascending(const vfloat8& v) - { - const vfloat8 a0 = v; - const vfloat8 b0 = shuffle<1,0,3,2>(a0); - const vfloat8 c0 = min(a0,b0); - const vfloat8 d0 = max(a0,b0); - const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); - const vfloat8 b1 = shuffle<2,3,0,1>(a1); - const vfloat8 c1 = min(a1,b1); - const vfloat8 d1 = max(a1,b1); - const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); - const vfloat8 b2 = shuffle<1,0,3,2>(a2); - const vfloat8 c2 = min(a2,b2); - const vfloat8 d2 = max(a2,b2); - const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); - const vfloat8 b3 = shuffle4<1,0>(a3); - const vfloat8 c3 = min(a3,b3); - const vfloat8 d3 = max(a3,b3); - const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); - const vfloat8 b4 = shuffle<2,3,0,1>(a4); - const vfloat8 c4 = min(a4,b4); - const vfloat8 d4 = max(a4,b4); - const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); - const vfloat8 b5 = shuffle<1,0,3,2>(a5); - const vfloat8 c5 = min(a5,b5); - const vfloat8 d5 = max(a5,b5); - const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); - return a6; - } - - __forceinline vfloat8 sort_descending(const vfloat8& v) - { - const vfloat8 a0 = v; - const vfloat8 b0 = shuffle<1,0,3,2>(a0); - const vfloat8 c0 = max(a0,b0); - const vfloat8 d0 = min(a0,b0); - const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); - const vfloat8 b1 = shuffle<2,3,0,1>(a1); - const vfloat8 c1 = max(a1,b1); - const vfloat8 d1 = min(a1,b1); - const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); - const vfloat8 b2 = shuffle<1,0,3,2>(a2); - const vfloat8 c2 = max(a2,b2); - const vfloat8 d2 = min(a2,b2); - const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); - const vfloat8 b3 = shuffle4<1,0>(a3); - const vfloat8 c3 = max(a3,b3); - const vfloat8 d3 = min(a3,b3); - const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); - const vfloat8 b4 = shuffle<2,3,0,1>(a4); - const vfloat8 c4 = max(a4,b4); - const vfloat8 d4 = min(a4,b4); - const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); - const vfloat8 b5 = shuffle<1,0,3,2>(a5); - const vfloat8 c5 = max(a5,b5); - const vfloat8 d5 = min(a5,b5); - const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); - return a6; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h deleted file mode 100644 index 3249bc2b45..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vint16_avx512.h +++ /dev/null @@ -1,490 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 16-wide AVX-512 integer type */ - template<> - struct vint<16> - { - ALIGNED_STRUCT_(64); - - typedef vboolf16 Bool; - typedef vint16 Int; - typedef vfloat16 Float; - - enum { size = 16 }; // number of SIMD elements - union { // data - __m512i v; - int i[16]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint() {} - __forceinline vint(const vint16& t) { v = t.v; } - __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; } - - __forceinline vint(const __m512i& t) { v = t; } - __forceinline operator __m512i() const { return v; } - __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } - - __forceinline vint(int i) { - v = _mm512_set1_epi32(i); - } - - __forceinline vint(int a, int b, int c, int d) { - v = _mm512_set4_epi32(d,c,b,a); - } - - __forceinline vint(int a0 , int a1 , int a2 , int a3, - int a4 , int a5 , int a6 , int a7, - int a8 , int a9 , int a10, int a11, - int a12, int a13, int a14, int a15) - { - v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); - } - - __forceinline vint(const vint4& i) { - v = _mm512_broadcast_i32x4(i); - } - - __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { - v = _mm512_castsi128_si512(a); - v = _mm512_inserti32x4(v, b, 1); - v = _mm512_inserti32x4(v, c, 2); - v = _mm512_inserti32x4(v, d, 3); - } - - __forceinline vint(const vint8& i) { - v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); - } - - __forceinline vint(const vint8& a, const vint8& b) { - v = _mm512_castsi256_si512(a); - v = _mm512_inserti64x4(v, b, 1); - } - - __forceinline explicit vint(const __m512& f) { - v = _mm512_cvtps_epi32(f); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint(ZeroTy) : v(_mm512_setzero_epi32()) {} - __forceinline vint(OneTy) : v(_mm512_set1_epi32(1)) {} - __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {} - __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {} - __forceinline vint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); } - - static __forceinline vint16 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } - static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } - - static __forceinline vint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } - static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } - - static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } - - static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } - static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); } - - static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); } - static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); } - - static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } - static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); } - - static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } - - /* pass by value to avoid compiler generating inefficient code */ - static __forceinline void storeu_compact(const vboolf16 mask, void* addr, vint16 reg) { - _mm512_mask_compressstoreu_epi32(addr,mask,reg); - } - - static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vint16 reg) { - //_mm512_mask_compressstoreu_epi32(addr,mask,reg); - *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); - } - - static __forceinline vint16 compact64bit(const vboolf16& mask, vint16 &v) { - return _mm512_mask_compress_epi64(v,mask,v); - } - - static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { - return _mm512_mask_compress_epi32(v,mask,v); - } - - static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { - return _mm512_mask_compress_epi32(a,mask,b); - } - - static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { - return _mm512_mask_expand_epi32(b,mask,a); - } - - template - static __forceinline vint16 gather(const int* ptr, const vint16& index) { - return _mm512_i32gather_epi32(index,ptr,scale); - } - - template - static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { - return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); - } - - template - static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { - return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); - } - - template - static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) { - _mm512_i32scatter_epi32((int*)ptr,index,v,scale); - } - - template - static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { - _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); - } - - static __forceinline vint16 broadcast64bit(size_t v) { - return _mm512_set1_epi64(v); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline int& operator [](size_t index) { assert(index < 16); return i[index]; } - __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; } - - __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } - __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); } - - __forceinline vint16 operator +(const vint16& a) { return a; } - __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } - __forceinline vint16 operator +(const vint16& a, int b) { return a + vint16(b); } - __forceinline vint16 operator +(int a, const vint16& b) { return vint16(a) + b; } - - __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } - __forceinline vint16 operator -(const vint16& a, int b) { return a - vint16(b); } - __forceinline vint16 operator -(int a, const vint16& b) { return vint16(a) - b; } - - __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } - __forceinline vint16 operator *(const vint16& a, int b) { return a * vint16(b); } - __forceinline vint16 operator *(int a, const vint16& b) { return vint16(a) * b; } - - __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } - __forceinline vint16 operator &(const vint16& a, int b) { return a & vint16(b); } - __forceinline vint16 operator &(int a, const vint16& b) { return vint16(a) & b; } - - __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } - __forceinline vint16 operator |(const vint16& a, int b) { return a | vint16(b); } - __forceinline vint16 operator |(int a, const vint16& b) { return vint16(a) | b; } - - __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } - __forceinline vint16 operator ^(const vint16& a, int b) { return a ^ vint16(b); } - __forceinline vint16 operator ^(int a, const vint16& b) { return vint16(a) ^ b; } - - __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } - __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } - - __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } - __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } - - __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); } - __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); } - __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } - - __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } - __forceinline vint16 min(const vint16& a, int b) { return min(a,vint16(b)); } - __forceinline vint16 min(int a, const vint16& b) { return min(vint16(a),b); } - - __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } - __forceinline vint16 max(const vint16& a, int b) { return max(a,vint16(b)); } - __forceinline vint16 max(int a, const vint16& b) { return max(vint16(a),b); } - - __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } - __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); } - - __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } - __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } - - __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } - __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; } - __forceinline vint16& operator +=(vint16& a, int b) { return a = a + b; } - - __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; } - __forceinline vint16& operator -=(vint16& a, int b) { return a = a - b; } - - __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; } - __forceinline vint16& operator *=(vint16& a, int b) { return a = a * b; } - - __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; } - __forceinline vint16& operator &=(vint16& a, int b) { return a = a & b; } - - __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; } - __forceinline vint16& operator |=(vint16& a, int b) { return a = a | b; } - - __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; } - __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 operator ==(const vint16& a, int b) { return a == vint16(b); } - __forceinline vboolf16 operator ==(int a, const vint16& b) { return vint16(a) == b; } - - __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 operator !=(const vint16& a, int b) { return a != vint16(b); } - __forceinline vboolf16 operator !=(int a, const vint16& b) { return vint16(a) != b; } - - __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 operator < (const vint16& a, int b) { return a < vint16(b); } - __forceinline vboolf16 operator < (int a, const vint16& b) { return vint16(a) < b; } - - __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 operator >=(const vint16& a, int b) { return a >= vint16(b); } - __forceinline vboolf16 operator >=(int a, const vint16& b) { return vint16(a) >= b; } - - __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 operator > (const vint16& a, int b) { return a > vint16(b); } - __forceinline vboolf16 operator > (int a, const vint16& b) { return vint16(a) > b; } - - __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 operator <=(const vint16& a, int b) { return a <= vint16(b); } - __forceinline vboolf16 operator <=(int a, const vint16& b) { return vint16(a) <= b; } - - __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - - __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } - - - __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) { - return _mm512_mask_or_epi32(f,m,t,t); - } - - __forceinline void xchg(const vboolf16& m, vint16& a, vint16& b) { - const vint16 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboolf16 test(const vboolf16& m, const vint16& a, const vint16& b) { - return _mm512_mask_test_epi32_mask(m,a,b); - } - - __forceinline vboolf16 test(const vint16& a, const vint16& b) { - return _mm512_test_epi32_mask(a,b); - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } - __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } - - template - __forceinline vint16 shuffle(const vint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template - __forceinline vint16 shuffle(const vint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vint16 shuffle4(const vint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template - __forceinline vint16 shuffle4(const vint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) { - return _mm512_alignr_epi32(a, b, i); - }; - - __forceinline int toScalar(const vint16& v) { - return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); - } - - template __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); } - - __forceinline size_t extract64bit(const vint16& v) { - return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); - } - - template - vint extractN(const vint16& v); - - template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v); } - template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); } - template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); } - template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); } - - template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v); } - template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } - - template __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } - template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v); } - - template __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } - template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v); } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16 vreduce_min2(vint16 x) { return min(x, shuffle<1,0,3,2>(x)); } - __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } - __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vint16 vreduce_max2(vint16 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } - __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vint16 vreduce_and2(vint16 x) { return x & shuffle<1,0,3,2>(x); } - __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } - __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } - __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } - - __forceinline vint16 vreduce_or2(vint16 x) { return x | shuffle<1,0,3,2>(x); } - __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } - __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } - __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } - - __forceinline vint16 vreduce_add2(vint16 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } - __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } - - __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); } - __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); } - __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); } - __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint16 conflict(const vint16& index) - { - return _mm512_conflict_epi32(index); - } - - __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index) - { - return _mm512_mask_conflict_epi32(dest,mask,index); - } - - __forceinline vint16 convert_uint32_t(const __m512& f) { - return _mm512_cvtps_epu32(f); - } - - __forceinline vint16 permute(vint16 v, vint16 index) { - return _mm512_permutexvar_epi32(index,v); - } - - __forceinline vint16 reverse(const vint16 &a) { - return permute(a,vint16(reverse_step)); - } - - __forceinline vint16 prefix_sum(const vint16& a) - { - const vint16 z(zero); - vint16 v = a; - v = v + align_shift_right<16-1>(v,z); - v = v + align_shift_right<16-2>(v,z); - v = v + align_shift_right<16-4>(v,z); - v = v + align_shift_right<16-8>(v,z); - return v; - } - - __forceinline vint16 reverse_prefix_sum(const vint16& a) - { - const vint16 z(zero); - vint16 v = a; - v = v + align_shift_right<1>(z,v); - v = v + align_shift_right<2>(z,v); - v = v + align_shift_right<4>(z,v); - v = v + align_shift_right<8>(z,v); - return v; - } - - /* this should use a vbool8 and a vint8_64...*/ - template - __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset) - { -#if defined(__AVX512PF__) - _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v) - { - cout << "<" << v[0]; - for (int i=1; i<16; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h deleted file mode 100644 index 96f105a7c5..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vint4_sse2.h +++ /dev/null @@ -1,681 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../math/math.h" - -namespace embree -{ - /* 4-wide SSE integer type */ - template<> - struct vint<4> - { - ALIGNED_STRUCT_(16); - - typedef vboolf4 Bool; - typedef vint4 Int; - typedef vfloat4 Float; - - enum { size = 4 }; // number of SIMD elements - union { __m128i v; int i[4]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint() {} - __forceinline vint(const vint4& a) { v = a.v; } - __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } - - __forceinline vint(__m128i a) : v(a) {} - __forceinline operator const __m128i&() const { return v; } - __forceinline operator __m128i&() { return v; } - - __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} - __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} - - __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} -#if defined(__AVX512VL__) - __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} -#else - __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} -#endif - - __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint(ZeroTy) : v(_mm_setzero_si128()) {} - __forceinline vint(OneTy) : v(_mm_set_epi32(1, 1, 1, 1)) {} - __forceinline vint(PosInfTy) : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {} - __forceinline vint(NegInfTy) : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {} - __forceinline vint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} - __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {} - - __forceinline vint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } - __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} - - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } - static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } - - static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } - -#if defined(__AVX512VL__) - - static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { - return _mm_mask_compress_epi32(v, mask, v); - } - static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) { - return _mm_mask_compress_epi32(a, mask, b); - } - - static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } - static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } -#elif defined(__AVX__) - static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } - static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } -#else - static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } - static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } -#endif - - -#if defined(__aarch64__) - static __forceinline vint4 load(const uint8_t* ptr) { - return _mm_load4epu8_epi32(((__m128i*)ptr)); - } - static __forceinline vint4 loadu(const uint8_t* ptr) { - return _mm_load4epu8_epi32(((__m128i*)ptr)); - } -#elif defined(__SSE4_1__) - static __forceinline vint4 load(const uint8_t* ptr) { - return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); - } - - static __forceinline vint4 loadu(const uint8_t* ptr) { - return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); - } -#else - - static __forceinline vint4 load(const uint8_t* ptr) { - return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); - } - - static __forceinline vint4 loadu(const uint8_t* ptr) { - return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); - } - -#endif - - static __forceinline vint4 load(const unsigned short* ptr) { -#if defined(__aarch64__) - return __m128i(vmovl_u16(vld1_u16(ptr))); -#elif defined (__SSE4_1__) - return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); -#else - return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); -#endif - } - - static __forceinline void store(uint8_t* ptr, const vint4& v) { -#if defined(__aarch64__) - int32x4_t x = v; - uint16x4_t y = vqmovn_u32(uint32x4_t(x)); - uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); - vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0); -#elif defined(__SSE4_1__) - __m128i x = v; - x = _mm_packus_epi32(x, x); - x = _mm_packus_epi16(x, x); - *(int*)ptr = _mm_cvtsi128_si32(x); -#else - for (size_t i=0;i<4;i++) - ptr[i] = (uint8_t)v[i]; -#endif - } - - static __forceinline void store(unsigned short* ptr, const vint4& v) { -#if defined(__aarch64__) - uint32x4_t x = uint32x4_t(v.v); - uint16x4_t y = vqmovn_u32(x); - vst1_u16(ptr, y); -#else - for (size_t i=0;i<4;i++) - ptr[i] = (unsigned short)v[i]; -#endif - } - - static __forceinline vint4 load_nt(void* ptr) { -#if defined(__aarch64__) || defined(__SSE4_1__) - return _mm_stream_load_si128((__m128i*)ptr); -#else - return _mm_load_si128((__m128i*)ptr); -#endif - } - - static __forceinline void store_nt(void* ptr, const vint4& v) { -#if !defined(__aarch64__) && defined(__SSE4_1__) - _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); -#else - _mm_store_si128((__m128i*)ptr,v); -#endif - } - - template - static __forceinline vint4 gather(const int* ptr, const vint4& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm_i32gather_epi32(ptr, index, scale); -#else - return vint4( - *(int*)(((int8_t*)ptr)+scale*index[0]), - *(int*)(((int8_t*)ptr)+scale*index[1]), - *(int*)(((int8_t*)ptr)+scale*index[2]), - *(int*)(((int8_t*)ptr)+scale*index[3])); -#endif - } - - template - static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) { - vint4 r = zero; -#if defined(__AVX512VL__) - return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); -#else - if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]); - return r; -#endif - } - - template - static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v) - { -#if defined(__AVX512VL__) - _mm_i32scatter_epi32((int*)ptr, index, v, scale); -#else - *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; - *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; - *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; - *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; -#endif - } - - template - static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v) - { -#if defined(__AVX512VL__) - _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); -#else - if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; - if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; - if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; - if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; -#endif - } - -#if defined(__x86_64__) || defined(__aarch64__) - static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; } - __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } - - friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { -#if defined(__AVX512VL__) - return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); -#elif defined(__aarch64__) - return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v)); -#elif defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); -#else - return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); -#endif - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } -#else - __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } -#endif - - __forceinline vint4 operator +(const vint4& a) { return a; } - __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } -#if defined(__aarch64__) - __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); } -#elif defined(__SSSE3__) - __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } - __forceinline vint4 operator +(const vint4& a, int b) { return a + vint4(b); } - __forceinline vint4 operator +(int a, const vint4& b) { return vint4(a) + b; } - - __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); } - __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } - __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } - -#if (defined(__aarch64__)) || defined(__SSE4_1__) - __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } -#else - __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } -#endif - __forceinline vint4 operator *(const vint4& a, int b) { return a * vint4(b); } - __forceinline vint4 operator *(int a, const vint4& b) { return vint4(a) * b; } - - __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); } - __forceinline vint4 operator &(const vint4& a, int b) { return a & vint4(b); } - __forceinline vint4 operator &(int a, const vint4& b) { return vint4(a) & b; } - - __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); } - __forceinline vint4 operator |(const vint4& a, int b) { return a | vint4(b); } - __forceinline vint4 operator |(int a, const vint4& b) { return vint4(a) | b; } - - __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); } - __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } - __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } - - __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); } - __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); } - - __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } - __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } - __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } - __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; } - - __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } - __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } - -#if (defined(__aarch64__)) || defined(__SSE4_1__) - __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } - __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } -#endif - - __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } - __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; } - - __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; } - __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; } - - __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; } - __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } -#else - __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } - __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); } - __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); } - __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); } - __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); } - __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a > b); } -#endif - - __forceinline vboolf4 operator ==(const vint4& a, int b) { return a == vint4(b); } - __forceinline vboolf4 operator ==(int a, const vint4& b) { return vint4(a) == b; } - - __forceinline vboolf4 operator !=(const vint4& a, int b) { return a != vint4(b); } - __forceinline vboolf4 operator !=(int a, const vint4& b) { return vint4(a) != b; } - - __forceinline vboolf4 operator < (const vint4& a, int b) { return a < vint4(b); } - __forceinline vboolf4 operator < (int a, const vint4& b) { return vint4(a) < b; } - - __forceinline vboolf4 operator >=(const vint4& a, int b) { return a >= vint4(b); } - __forceinline vboolf4 operator >=(int a, const vint4& b) { return vint4(a) >= b; } - - __forceinline vboolf4 operator > (const vint4& a, int b) { return a > vint4(b); } - __forceinline vboolf4 operator > (int a, const vint4& b) { return vint4(a) > b; } - - __forceinline vboolf4 operator <=(const vint4& a, int b) { return a <= vint4(b); } - __forceinline vboolf4 operator <=(int a, const vint4& b) { return vint4(a) <= b; } - - __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; } - __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; } - __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a < b; } - __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; } - __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a > b; } - __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); } - __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); } - __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a < b); } - __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); } - __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a > b); } - __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); } -#endif - - template - __forceinline vint4 select(const vint4& t, const vint4& f) { -#if defined(__SSE4_1__) - return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); -#else - return select(vboolf4(mask), t, f); -#endif - } - - -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } - __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } - - __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } - __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); } - -#else - __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); } - __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); } -#endif - - __forceinline vint4 min(const vint4& a, int b) { return min(a,vint4(b)); } - __forceinline vint4 min(int a, const vint4& b) { return min(vint4(a),b); } - __forceinline vint4 max(const vint4& a, int b) { return max(a,vint4(b)); } - __forceinline vint4 max(int a, const vint4& b) { return max(vint4(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } - __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } - -#if defined(__aarch64__) - template - __forceinline vint4 shuffle(const vint4& v) { - return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); - } - template - __forceinline vint4 shuffle(const vint4& a, const vint4& b) { - return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); - } -#else - template - __forceinline vint4 shuffle(const vint4& v) { - return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - template - __forceinline vint4 shuffle(const vint4& a, const vint4& b) { - return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } -#endif -#if defined(__SSE3__) - template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } -#endif - - template - __forceinline vint4 shuffle(const vint4& v) { - return shuffle(v); - } - -#if defined(__aarch64__) - template __forceinline int extract(const vint4& b); - template __forceinline vint4 insert(const vint4& a, const int b); -#elif defined(__SSE4_1__) - template __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } - template __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } -#else - template __forceinline int extract(const vint4& b) { return b[src&3]; } - template __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } -#endif - -#if defined(__aarch64__) - template<> __forceinline int extract<0>(const vint4& b) { - return b.v[0]; - } - template<> __forceinline int extract<1>(const vint4& b) { - return b.v[1]; - } - template<> __forceinline int extract<2>(const vint4& b) { - return b.v[2]; - } - template<> __forceinline int extract<3>(const vint4& b) { - return b.v[3]; - } - template<> __forceinline vint4 insert<0>(const vint4& a, int b) - { - vint4 c = a; - c[0] = b; - return c; - } - template<> __forceinline vint4 insert<1>(const vint4& a, int b) - { - vint4 c = a; - c[1] = b; - return c; - } - template<> __forceinline vint4 insert<2>(const vint4& a, int b) - { - vint4 c = a; - c[2] = b; - return c; - } - template<> __forceinline vint4 insert<3>(const vint4& a, int b) - { - vint4 c = a; - c[3] = b; - return c; - } - - __forceinline int toScalar(const vint4& v) { - return v[0]; - } - - __forceinline size_t toSizeT(const vint4& v) { - uint64x2_t x = uint64x2_t(v.v); - return x[0]; - } -#else - template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } - - __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } - - __forceinline size_t toSizeT(const vint4& v) { -#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround - return toScalar(v); -#elif defined(__ARM_NEON) - // FIXME(LTE): Do we need a swap(i.e. use lane 1)? - return vgetq_lane_u64(*(reinterpret_cast(&v)), 0); -#else - return _mm_cvtsi128_si64(v); -#endif - } -#endif - -#if defined(__AVX512VL__) - - __forceinline vint4 permute(const vint4 &a, const vint4 &index) { - return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index)); - } - - template - __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { - return _mm_alignr_epi32(a, b, i); - } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) || defined(__SSE4_1__) - -#if defined(__aarch64__) - __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); } - __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); } - __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); } - - __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); } - __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); } - __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); } -#else - __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } - __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } - __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } - - __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } - __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } -#endif - - __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } - __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } - - __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - -#else - - __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); } - __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); } - __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; } - -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Sorting networks - //////////////////////////////////////////////////////////////////////////////// - -#if (defined(__aarch64__)) || defined(__SSE4_1__) - - __forceinline vint4 usort_ascending(const vint4& v) - { - const vint4 a0 = v; - const vint4 b0 = shuffle<1,0,3,2>(a0); - const vint4 c0 = umin(a0,b0); - const vint4 d0 = umax(a0,b0); - const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vint4 b1 = shuffle<2,3,0,1>(a1); - const vint4 c1 = umin(a1,b1); - const vint4 d1 = umax(a1,b1); - const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vint4 b2 = shuffle<0,2,1,3>(a2); - const vint4 c2 = umin(a2,b2); - const vint4 d2 = umax(a2,b2); - const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3; - } - - __forceinline vint4 usort_descending(const vint4& v) - { - const vint4 a0 = v; - const vint4 b0 = shuffle<1,0,3,2>(a0); - const vint4 c0 = umax(a0,b0); - const vint4 d0 = umin(a0,b0); - const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vint4 b1 = shuffle<2,3,0,1>(a1); - const vint4 c1 = umax(a1,b1); - const vint4 d1 = umin(a1,b1); - const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vint4 b2 = shuffle<0,2,1,3>(a2); - const vint4 c2 = umax(a2,b2); - const vint4 d2 = umin(a2,b2); - const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3; - } - -#else - - __forceinline vint4 usort_ascending(const vint4& v) - { - const vint4 a0 = v-vint4(0x80000000); - const vint4 b0 = shuffle<1,0,3,2>(a0); - const vint4 c0 = min(a0,b0); - const vint4 d0 = max(a0,b0); - const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vint4 b1 = shuffle<2,3,0,1>(a1); - const vint4 c1 = min(a1,b1); - const vint4 d1 = max(a1,b1); - const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vint4 b2 = shuffle<0,2,1,3>(a2); - const vint4 c2 = min(a2,b2); - const vint4 d2 = max(a2,b2); - const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3+vint4(0x80000000); - } - - __forceinline vint4 usort_descending(const vint4& v) - { - const vint4 a0 = v-vint4(0x80000000); - const vint4 b0 = shuffle<1,0,3,2>(a0); - const vint4 c0 = max(a0,b0); - const vint4 d0 = min(a0,b0); - const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); - const vint4 b1 = shuffle<2,3,0,1>(a1); - const vint4 c1 = max(a1,b1); - const vint4 d1 = min(a1,b1); - const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); - const vint4 b2 = shuffle<0,2,1,3>(a2); - const vint4 c2 = max(a2,b2); - const vint4 d2 = min(a2,b2); - const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); - return a3+vint4(0x80000000); - } - -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; - } -} - diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx.h b/thirdparty/embree-aarch64/common/simd/vint8_avx.h deleted file mode 100644 index 25a771284d..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vint8_avx.h +++ /dev/null @@ -1,464 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX integer type */ - template<> - struct vint<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256i v; - struct { __m128i vl,vh; }; - int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint() {} - __forceinline vint(const vint8& a) { v = a.v; } - __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } - - __forceinline vint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } - - __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} - - __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} - __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} - __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} - __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} - __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} - - __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vint(OneTy) : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {} - __forceinline vint(PosInfTy) : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {} - __forceinline vint(NegInfTy) : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {} - __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} - __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} - __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } - static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } - - static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - - static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } - -#if !defined(__aarch64__) - static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } -#else - static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } -#endif - - static __forceinline void store_nt(void* ptr, const vint8& v) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); - } - - static __forceinline vint8 load(const uint8_t* ptr) { - vint4 il = vint4::load(ptr+0); - vint4 ih = vint4::load(ptr+4); - return vint8(il,ih); - } - - static __forceinline vint8 loadu(const uint8_t* ptr) { - vint4 il = vint4::loadu(ptr+0); - vint4 ih = vint4::loadu(ptr+4); - return vint8(il,ih); - } - - static __forceinline vint8 load(const unsigned short* ptr) { - vint4 il = vint4::load(ptr+0); - vint4 ih = vint4::load(ptr+4); - return vint8(il,ih); - } - - static __forceinline vint8 loadu(const unsigned short* ptr) { - vint4 il = vint4::loadu(ptr+0); - vint4 ih = vint4::loadu(ptr+4); - return vint8(il,ih); - } - - static __forceinline void store(uint8_t* ptr, const vint8& i) { - vint4 il(i.vl); - vint4 ih(i.vh); - vint4::store(ptr + 0,il); - vint4::store(ptr + 4,ih); - } - - static __forceinline void store(unsigned short* ptr, const vint8& v) { - for (size_t i=0;i<8;i++) - ptr[i] = (unsigned short)v[i]; - } - - template - static __forceinline vint8 gather(const int* ptr, const vint8& index) { - return vint8( - *(int*)(((int8_t*)ptr)+scale*index[0]), - *(int*)(((int8_t*)ptr)+scale*index[1]), - *(int*)(((int8_t*)ptr)+scale*index[2]), - *(int*)(((int8_t*)ptr)+scale*index[3]), - *(int*)(((int8_t*)ptr)+scale*index[4]), - *(int*)(((int8_t*)ptr)+scale*index[5]), - *(int*)(((int8_t*)ptr)+scale*index[6]), - *(int*)(((int8_t*)ptr)+scale*index[7])); - } - - template - static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { - vint8 r = zero; - if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]); - if (likely(mask[4])) r[4] = *(int*)(((int8_t*)ptr)+scale*index[4]); - if (likely(mask[5])) r[5] = *(int*)(((int8_t*)ptr)+scale*index[5]); - if (likely(mask[6])) r[6] = *(int*)(((int8_t*)ptr)+scale*index[6]); - if (likely(mask[7])) r[7] = *(int*)(((int8_t*)ptr)+scale*index[7]); - return r; - } - - template - static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) - { - *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; - } - - template - static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) - { - if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; - } - - - static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } - __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } - - __forceinline vint8 operator +(const vint8& a) { return a; } - __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } - __forceinline vint8 abs (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } - __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } - __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } - - __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } - __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } - __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } - - __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); } - __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } - __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } - - __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } - __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } - - __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } - __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } - - __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } - __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } - - __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } - __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); } - - __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } - __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } - __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } - - __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); } - __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } - __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } - - __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); } - __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } - __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } - - __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } - __forceinline vint8 umin(const vint8& a, int b) { return umin(a,vint8(b)); } - __forceinline vint8 umin(int a, const vint8& b) { return umin(vint8(a),b); } - - __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } - __forceinline vint8 umax(const vint8& a, int b) { return umax(a,vint8(b)); } - __forceinline vint8 umax(int a, const vint8& b) { return umax(vint8(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } - __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } - - __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } - __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } - - __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } - __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } - - __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } - __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } - - __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } - __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } - - __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; } - __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), - _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } - __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } - __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } - - __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } - __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } - __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } - - __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)), - _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } - __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } - __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } - - __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } - __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } - __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } - - __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)), - _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } - __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } - __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } - - __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } - __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } - __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } - - __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } - __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } - __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } - __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } - __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } - __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } - - __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } - __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } - __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } - __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } - __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } - __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } - - __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); - } - - __forceinline vint8 notand(const vboolf8& m, const vint8& f) { - return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - - template - __forceinline vint8 shuffle(const vint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template - __forceinline vint8 shuffle4(const vint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vint8 shuffle(const vint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vint8 shuffle(const vint8& a, const vint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } - - __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } - template __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } - template __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } - - __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } - __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } - __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } - __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } - - __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } - __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } - - __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - - //////////////////////////////////////////////////////////////////////////////// - /// Sorting networks - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 usort_ascending(const vint8& v) - { - const vint8 a0 = v; - const vint8 b0 = shuffle<1,0,3,2>(a0); - const vint8 c0 = umin(a0,b0); - const vint8 d0 = umax(a0,b0); - const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); - const vint8 b1 = shuffle<2,3,0,1>(a1); - const vint8 c1 = umin(a1,b1); - const vint8 d1 = umax(a1,b1); - const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); - const vint8 b2 = shuffle<1,0,3,2>(a2); - const vint8 c2 = umin(a2,b2); - const vint8 d2 = umax(a2,b2); - const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); - const vint8 b3 = shuffle4<1,0>(a3); - const vint8 c3 = umin(a3,b3); - const vint8 d3 = umax(a3,b3); - const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); - const vint8 b4 = shuffle<2,3,0,1>(a4); - const vint8 c4 = umin(a4,b4); - const vint8 d4 = umax(a4,b4); - const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); - const vint8 b5 = shuffle<1,0,3,2>(a5); - const vint8 c5 = umin(a5,b5); - const vint8 d5 = umax(a5,b5); - const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); - return a6; - } - - __forceinline vint8 usort_descending(const vint8& v) - { - const vint8 a0 = v; - const vint8 b0 = shuffle<1,0,3,2>(a0); - const vint8 c0 = umax(a0,b0); - const vint8 d0 = umin(a0,b0); - const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); - const vint8 b1 = shuffle<2,3,0,1>(a1); - const vint8 c1 = umax(a1,b1); - const vint8 d1 = umin(a1,b1); - const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); - const vint8 b2 = shuffle<1,0,3,2>(a2); - const vint8 c2 = umax(a2,b2); - const vint8 d2 = umin(a2,b2); - const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); - const vint8 b3 = shuffle4<1,0>(a3); - const vint8 c3 = umax(a3,b3); - const vint8 d3 = umin(a3,b3); - const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); - const vint8 b4 = shuffle<2,3,0,1>(a4); - const vint8 c4 = umax(a4,b4); - const vint8 d4 = umin(a4,b4); - const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); - const vint8 b5 = shuffle<1,0,3,2>(a5); - const vint8 c5 = umax(a5,b5); - const vint8 d5 = umin(a5,b5); - const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); - return a6; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h deleted file mode 100644 index 4937d972cf..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vint8_avx2.h +++ /dev/null @@ -1,512 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX integer type */ - template<> - struct vint<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256i v; - int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint() {} - __forceinline vint(const vint8& a) { v = a.v; } - __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } - - __forceinline vint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } - - __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - - __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} - __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} - __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} - __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} - __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} - - __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} - -#if defined(__AVX512VL__) - __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} -#else - __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vint(OneTy) : v(_mm256_set1_epi32(1)) {} - __forceinline vint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} - __forceinline vint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} - __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} - __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} - __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } - static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } - - static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } - static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } - - static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } - -#if defined(__AVX512VL__) - - static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) { - return _mm256_mask_compress_epi32(v, mask, v); - } - static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) { - return _mm256_mask_compress_epi32(a, mask, b); - } - - static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } - static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } -#else - static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } - static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } -#endif - - static __forceinline vint8 load_nt(void* ptr) { - return _mm256_stream_load_si256((__m256i*)ptr); - } - - static __forceinline void store_nt(void* ptr, const vint8& v) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); - } - - static __forceinline void store(uint8_t* ptr, const vint8& i) - { - for (size_t j=0; j<8; j++) - ptr[j] = i[j]; - } - - static __forceinline void store(unsigned short* ptr, const vint8& v) { - for (size_t i=0;i<8;i++) - ptr[i] = (unsigned short)v[i]; - } - - template - static __forceinline vint8 gather(const int *const ptr, const vint8& index) { - return _mm256_i32gather_epi32(ptr, index, scale); - } - - template - static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) { - vint8 r = zero; -#if defined(__AVX512VL__) - return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#else - return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale); -#endif - } - - template - static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) - { -#if defined(__AVX512VL__) - _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); -#else - *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - template - static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) - { -#if defined(__AVX512VL__) - _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); -#else - if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } - __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); } -#else - static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } -#endif - - __forceinline vint8 operator +(const vint8& a) { return a; } - __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); } - __forceinline vint8 abs (const vint8& a) { return _mm256_abs_epi32(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); } - __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } - __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } - - __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); } - __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } - __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } - - __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); } - __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } - __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } - - __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); } - __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } - __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } - - __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); } - __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } - __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } - - __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); } - __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } - __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } - - __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); } - __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); } - - __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); } - __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); } - - __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); } - __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); } - __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); } - - __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); } - __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); } - __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); } - - __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); } - __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } - __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } - - __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); } - __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } - __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } - - __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); } - __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } - __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } - - __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } - __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } - - __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } - __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } - - __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } - __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } - - __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } - __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } - - __forceinline vint8& operator <<=(vint8& a, const int b) { return a = a << b; } - __forceinline vint8& operator >>=(vint8& a, const int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } - - static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { - return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); - } -#else - static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } - static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } - static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); } - static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } - static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); } - static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } - - static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); - } -#endif - - template - __forceinline vint8 select(const vint8& t, const vint8& f) { - return _mm256_blend_epi32(f, t, mask); - } - - __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } - __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } - - __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } - __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } - - __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } - __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } - - __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } - __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } - - __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } - __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } - - __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } - __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } - - __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } - __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } - __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } - __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } - __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } - __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } - -#if defined(__AVX512VL__) - static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } - static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } - static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } - static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } - static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } - static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } -#else - static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } - static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } - static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } - static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } - static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } - static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); } - __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); } - - template - __forceinline vint8 shuffle(const vint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template - __forceinline vint8 shuffle4(const vint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vint8 shuffle(const vint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vint8 shuffle(const vint8& a, const vint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } - - __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } - - template __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } - template __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } - - __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } - -#if !defined(__aarch64__) - -__forceinline vint8 permute(const vint8& v, const __m256i& index) { - return _mm256_permutevar8x32_epi32(v, index); - } - - __forceinline vint8 shuffle(const vint8& v, const __m256i& index) { - return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); - } - - - - template - static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) { -#if defined(__AVX512VL__) - return _mm256_alignr_epi32(a, b, i); -#else - return _mm256_alignr_epi8(a, b, 4*i); -#endif - } - -#endif - - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } - __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } - __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } - __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } - - __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } - __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } - - __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - - - __forceinline vint8 assign(const vint4& a) { return _mm256_castsi128_si256(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Sorting networks - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vint8 usort_ascending(const vint8& v) - { - const vint8 a0 = v; - const vint8 b0 = shuffle<1,0,3,2>(a0); - const vint8 c0 = umin(a0,b0); - const vint8 d0 = umax(a0,b0); - const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); - const vint8 b1 = shuffle<2,3,0,1>(a1); - const vint8 c1 = umin(a1,b1); - const vint8 d1 = umax(a1,b1); - const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); - const vint8 b2 = shuffle<1,0,3,2>(a2); - const vint8 c2 = umin(a2,b2); - const vint8 d2 = umax(a2,b2); - const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); - const vint8 b3 = shuffle4<1,0>(a3); - const vint8 c3 = umin(a3,b3); - const vint8 d3 = umax(a3,b3); - const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); - const vint8 b4 = shuffle<2,3,0,1>(a4); - const vint8 c4 = umin(a4,b4); - const vint8 d4 = umax(a4,b4); - const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); - const vint8 b5 = shuffle<1,0,3,2>(a5); - const vint8 c5 = umin(a5,b5); - const vint8 d5 = umax(a5,b5); - const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); - return a6; - } - - __forceinline vint8 usort_descending(const vint8& v) - { - const vint8 a0 = v; - const vint8 b0 = shuffle<1,0,3,2>(a0); - const vint8 c0 = umax(a0,b0); - const vint8 d0 = umin(a0,b0); - const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); - const vint8 b1 = shuffle<2,3,0,1>(a1); - const vint8 c1 = umax(a1,b1); - const vint8 d1 = umin(a1,b1); - const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); - const vint8 b2 = shuffle<1,0,3,2>(a2); - const vint8 c2 = umax(a2,b2); - const vint8 d2 = umin(a2,b2); - const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); - const vint8 b3 = shuffle4<1,0>(a3); - const vint8 c3 = umax(a3,b3); - const vint8 d3 = umin(a3,b3); - const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); - const vint8 b4 = shuffle<2,3,0,1>(a4); - const vint8 c4 = umax(a4,b4); - const vint8 d4 = umin(a4,b4); - const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); - const vint8 b5 = shuffle<1,0,3,2>(a5); - const vint8 c5 = umax(a5,b5); - const vint8 d5 = umin(a5,b5); - const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); - return a6; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h deleted file mode 100644 index de3ebc16a7..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h +++ /dev/null @@ -1,358 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 4-wide AVX2 64-bit long long type */ - template<> - struct vllong<4> - { - ALIGNED_STRUCT_(32); - - typedef vboold4 Bool; - - enum { size = 4 }; // number of SIMD elements - union { // data - __m256i v; - long long i[4]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong() {} - __forceinline vllong(const vllong4& t) { v = t.v; } - __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; } - - __forceinline vllong(const __m256i& t) { v = t; } - __forceinline operator __m256i() const { return v; } - __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); } - - - __forceinline vllong(long long i) { - v = _mm256_set1_epi64x(i); - } - - __forceinline vllong(long long a, long long b, long long c, long long d) { - v = _mm256_set_epi64x(d,c,b,a); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vllong(OneTy) : v(_mm256_set1_epi64x(1)) {} - __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {} - __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a)); - } - - static __forceinline vllong4 loadu(const void* addr) - { - return _mm256_loadu_si256((__m256i*)addr); - } - - static __forceinline vllong4 load(const vllong4* addr) { - return _mm256_load_si256((__m256i*)addr); - } - - static __forceinline vllong4 load(const long long* addr) { - return _mm256_load_si256((__m256i*)addr); - } - - static __forceinline void store(void* ptr, const vllong4& v) { - _mm256_store_si256((__m256i*)ptr,v); - } - - static __forceinline void storeu(void* ptr, const vllong4& v) { - _mm256_storeu_si256((__m256i*)ptr,v); - } - - static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) { -#if defined(__AVX512VL__) - _mm256_mask_storeu_epi64(ptr,mask,f); -#else - _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); -#endif - } - - static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) { -#if defined(__AVX512VL__) - _mm256_mask_store_epi64(ptr,mask,f); -#else - _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); -#endif - } - - static __forceinline vllong4 broadcast64bit(size_t v) { - return _mm256_set1_epi64x(v); - } - - static __forceinline size_t extract64bit(const vllong4& v) - { - return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } - __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; } - - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { - #if defined(__AVX512VL__) - return _mm256_mask_blend_epi64(m, f, t); - #else - return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m)); - #endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); } -#else - __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); } -#endif - - __forceinline vllong4 operator +(const vllong4& a) { return a; } - __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); } - __forceinline vllong4 operator +(const vllong4& a, long long b) { return a + vllong4(b); } - __forceinline vllong4 operator +(long long a, const vllong4& b) { return vllong4(a) + b; } - - __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); } - __forceinline vllong4 operator -(const vllong4& a, long long b) { return a - vllong4(b); } - __forceinline vllong4 operator -(long long a, const vllong4& b) { return vllong4(a) - b; } - - /* only low 32bit part */ - __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); } - __forceinline vllong4 operator *(const vllong4& a, long long b) { return a * vllong4(b); } - __forceinline vllong4 operator *(long long a, const vllong4& b) { return vllong4(a) * b; } - - __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); } - __forceinline vllong4 operator &(const vllong4& a, long long b) { return a & vllong4(b); } - __forceinline vllong4 operator &(long long a, const vllong4& b) { return vllong4(a) & b; } - - __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); } - __forceinline vllong4 operator |(const vllong4& a, long long b) { return a | vllong4(b); } - __forceinline vllong4 operator |(long long a, const vllong4& b) { return vllong4(a) | b; } - - __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); } - __forceinline vllong4 operator ^(const vllong4& a, long long b) { return a ^ vllong4(b); } - __forceinline vllong4 operator ^(long long a, const vllong4& b) { return vllong4(a) ^ b; } - - __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); } - //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); } - - __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); } - //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); } - //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); } - - __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); } - - //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); } - //__forceinline vllong4 min(const vllong4& a, long long b) { return min(a,vllong4(b)); } - //__forceinline vllong4 min(long long a, const vllong4& b) { return min(vllong4(a),b); } - - //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); } - //__forceinline vllong4 max(const vllong4& a, long long b) { return max(a,vllong4(b)); } - //__forceinline vllong4 max(long long a, const vllong4& b) { return max(vllong4(a),b); } - -#if defined(__AVX512VL__) - __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); } - __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); } -#else - __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); } - __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; } - __forceinline vllong4& operator +=(vllong4& a, long long b) { return a = a + b; } - - __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; } - __forceinline vllong4& operator -=(vllong4& a, long long b) { return a = a - b; } - - __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; } - __forceinline vllong4& operator *=(vllong4& a, long long b) { return a = a * b; } - - __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; } - __forceinline vllong4& operator &=(vllong4& a, long long b) { return a = a & b; } - - __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; } - __forceinline vllong4& operator |=(vllong4& a, long long b) { return a = a | b; } - - __forceinline vllong4& operator <<=(vllong4& a, long long b) { return a = a << b; } - //__forceinline vllong4& operator >>=(vllong4& a, long long b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } -#else - __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); } - __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); } - __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); } - __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); } - __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); } - __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); } -#endif - - __forceinline vboold4 operator ==(const vllong4& a, long long b) { return a == vllong4(b); } - __forceinline vboold4 operator ==(long long a, const vllong4& b) { return vllong4(a) == b; } - - __forceinline vboold4 operator !=(const vllong4& a, long long b) { return a != vllong4(b); } - __forceinline vboold4 operator !=(long long a, const vllong4& b) { return vllong4(a) != b; } - - __forceinline vboold4 operator > (const vllong4& a, long long b) { return a > vllong4(b); } - __forceinline vboold4 operator > (long long a, const vllong4& b) { return vllong4(a) > b; } - - __forceinline vboold4 operator < (const vllong4& a, long long b) { return a < vllong4(b); } - __forceinline vboold4 operator < (long long a, const vllong4& b) { return vllong4(a) < b; } - - __forceinline vboold4 operator >=(const vllong4& a, long long b) { return a >= vllong4(b); } - __forceinline vboold4 operator >=(long long a, const vllong4& b) { return vllong4(a) >= b; } - - __forceinline vboold4 operator <=(const vllong4& a, long long b) { return a <= vllong4(b); } - __forceinline vboold4 operator <=(long long a, const vllong4& b) { return vllong4(a) <= b; } - - __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; } - __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; } - __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a < b; } - __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; } - __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a > b; } - __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); } - __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); } - __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a < b); } - __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); } - __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a > b); } - __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); } -#endif - - __forceinline void xchg(const vboold4& m, vllong4& a, vllong4& b) { - const vllong4 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboold4 test(const vllong4& a, const vllong4& b) { -#if defined(__AVX512VL__) - return _mm256_test_epi64_mask(a,b); -#else - return _mm256_testz_si256(a,b); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vllong4 shuffle(const vllong4& v) { - return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); - } - - template - __forceinline vllong4 shuffle(const vllong4& v) { - return shuffle(v); - } - - template - __forceinline vllong4 shuffle2(const vllong4& v) { - return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0)); - } - - __forceinline long long toScalar(const vllong4& v) { - return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); - } - -#if defined(__AVX512VL__) - __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { - // workaround for GCC 7.x -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) - return _mm256_permutex2var_epi64(a,index,a); -#else - return _mm256_permutexvar_epi64(index,a); -#endif - } - - __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) { - return _mm256_permutex2var_epi64(a,index,b); - } - -#endif - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - - __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); } - __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } - - __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); } - __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } - - __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); } - __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } - - __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); } - __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); } - __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v) - { - cout << "<" << v[0]; - for (size_t i=1; i<4; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h deleted file mode 100644 index 76dddd8991..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h +++ /dev/null @@ -1,381 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX-512 64-bit long long type */ - template<> - struct vllong<8> - { - ALIGNED_STRUCT_(64); - - typedef vboold8 Bool; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m512i v; - long long i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong() {} - __forceinline vllong(const vllong8& t) { v = t.v; } - __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; } - - __forceinline vllong(const __m512i& t) { v = t; } - __forceinline operator __m512i() const { return v; } - __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } - - __forceinline vllong(long long i) { - v = _mm512_set1_epi64(i); - } - - __forceinline vllong(long long a, long long b, long long c, long long d) { - v = _mm512_set4_epi64(d,c,b,a); - } - - __forceinline vllong(long long a0, long long a1, long long a2, long long a3, - long long a4, long long a5, long long a6, long long a7) - { - v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0); - } - - __forceinline vllong(const vllong<4>& i) { - v = _mm512_broadcast_i64x4(i); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {} - __forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {} - __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {} - __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) { - _mm512_stream_si512((__m512i*)ptr,a); - } - - static __forceinline vllong8 loadu(const void* addr) { - return _mm512_loadu_si512(addr); - } - - static __forceinline vllong8 load(const vllong8* addr) { - return _mm512_load_si512(addr); - } - - static __forceinline vllong8 load(const long long* addr) { - return _mm512_load_si512(addr); - } - - static __forceinline vllong8 load(const uint8_t* ptr) { - return _mm512_cvtepu8_epi64(*(__m128i*)ptr); - } - - static __forceinline void store(void* ptr, const vllong8& v) { - _mm512_store_si512(ptr,v); - } - - static __forceinline void storeu(void* ptr, const vllong8& v) { - _mm512_storeu_si512(ptr,v); - } - - static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) { - _mm512_mask_storeu_epi64(ptr,mask,f); - } - - static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) { - _mm512_mask_store_epi64(addr,mask,v2); - } - - /* pass by value to avoid compiler generating inefficient code */ - static __forceinline void storeu_compact(const vboold8 mask, void* addr, const vllong8& reg) { - _mm512_mask_compressstoreu_epi64(addr,mask,reg); - } - - static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& v) { - return _mm512_mask_compress_epi64(v,mask,v); - } - - static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& dest, const vllong8& source) { - return _mm512_mask_compress_epi64(dest,mask,source); - } - - static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) { - return _mm512_mask_compress_epi64(v,mask,v); - } - - static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) { - return _mm512_mask_compress_epi64(a,mask,b); - } - - static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) { - return _mm512_mask_expand_epi64(b,mask,a); - } - - static __forceinline vllong8 broadcast64bit(size_t v) { - return _mm512_set1_epi64(v); - } - - static __forceinline size_t extract64bit(const vllong8& v) - { - return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline long long& operator [](size_t index) { assert(index < 8); return i[index]; } - __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; } - - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); } - - __forceinline vllong8 operator +(const vllong8& a) { return a; } - __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); } - __forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); } - __forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; } - - __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); } - __forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); } - __forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; } - - __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); } - __forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); } - __forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; } - - __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); } - __forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); } - __forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; } - - __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); } - __forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); } - __forceinline vllong8 operator |(long long a, const vllong8& b) { return vllong8(a) | b; } - - __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); } - __forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); } - __forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; } - - __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); } - __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); } - - __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); } - __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); } - - __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); } - __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); } - __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); } - - __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); } - __forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); } - __forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); } - - __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); } - __forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); } - __forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); } - - __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); } - __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); } - - __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); } - __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong8& operator +=(vllong8& a, const vllong8& b) { return a = a + b; } - __forceinline vllong8& operator +=(vllong8& a, long long b) { return a = a + b; } - - __forceinline vllong8& operator -=(vllong8& a, const vllong8& b) { return a = a - b; } - __forceinline vllong8& operator -=(vllong8& a, long long b) { return a = a - b; } - - __forceinline vllong8& operator *=(vllong8& a, const vllong8& b) { return a = a * b; } - __forceinline vllong8& operator *=(vllong8& a, long long b) { return a = a * b; } - - __forceinline vllong8& operator &=(vllong8& a, const vllong8& b) { return a = a & b; } - __forceinline vllong8& operator &=(vllong8& a, long long b) { return a = a & b; } - - __forceinline vllong8& operator |=(vllong8& a, const vllong8& b) { return a = a | b; } - __forceinline vllong8& operator |=(vllong8& a, long long b) { return a = a | b; } - - __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; } - __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); } - __forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; } - - __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != vllong8(b); } - __forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; } - - __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold8 operator < (const vllong8& a, long long b) { return a < vllong8(b); } - __forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; } - - __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); } - __forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; } - - __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); } - __forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; } - - __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); } - __forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; } - - __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); } - - __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) { - return _mm512_mask_or_epi64(f,m,t,t); - } - - __forceinline void xchg(const vboold8& m, vllong8& a, vllong8& b) { - const vllong8 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboold8 test(const vboold8& m, const vllong8& a, const vllong8& b) { - return _mm512_mask_test_epi64_mask(m,a,b); - } - - __forceinline vboold8 test(const vllong8& a, const vllong8& b) { - return _mm512_test_epi64_mask(a,b); - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vllong8 shuffle(const vllong8& v) { - return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); - } - - template - __forceinline vllong8 shuffle(const vllong8& v) { - return shuffle(v); - } - - template - __forceinline vllong8 shuffle(const vllong8& v) { - return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - - template - __forceinline vllong8 shuffle4(const vllong8& v) { - return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); - } - - template - __forceinline vllong8 shuffle4(const vllong8& v) { - return shuffle4(v); - } - - template - __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) { - return _mm512_alignr_epi64(a, b, i); - }; - - __forceinline long long toScalar(const vllong8& v) { - return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); - } - - __forceinline vllong8 zeroExtend32Bit(const __m512i& a) { - return _mm512_cvtepu32_epi64(_mm512_castsi512_si256(a)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong8 vreduce_min2(vllong8 x) { return min(x, shuffle<1,0,3,2>(x)); } - __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } - - __forceinline vllong8 vreduce_max2(vllong8 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } - - __forceinline vllong8 vreduce_and2(vllong8 x) { return x & shuffle<1,0,3,2>(x); } - __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } - __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); } - - __forceinline vllong8 vreduce_or2(vllong8 x) { return x | shuffle<1,0,3,2>(x); } - __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } - __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); } - - __forceinline vllong8 vreduce_add2(vllong8 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } - - __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); } - __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); } - __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); } - __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); } - __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vllong8 permute(const vllong8& v, const vllong8& index) { - return _mm512_permutexvar_epi64(index,v); - } - - __forceinline vllong8 reverse(const vllong8& a) { - return permute(a,vllong8(reverse_step)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v) - { - cout << "<" << v[0]; - for (size_t i=1; i<8; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h deleted file mode 100644 index 39752611bb..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h +++ /dev/null @@ -1,443 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 16-wide AVX-512 unsigned integer type */ - template<> - struct vuint<16> - { - ALIGNED_STRUCT_(64); - - typedef vboolf16 Bool; - typedef vuint16 UInt; - typedef vfloat16 Float; - - enum { size = 16 }; // number of SIMD elements - union { // data - __m512i v; - unsigned int i[16]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint() {} - __forceinline vuint(const vuint16& t) { v = t.v; } - __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; } - - __forceinline vuint(const __m512i& t) { v = t; } - __forceinline operator __m512i() const { return v; } - __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } - - __forceinline vuint(unsigned int i) { - v = _mm512_set1_epi32(i); - } - - __forceinline vuint(const vuint4& i) { - v = _mm512_broadcast_i32x4(i); - } - - __forceinline vuint(const vuint8& i) { - v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); - } - - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) { - v = _mm512_set4_epi32(d,c,b,a); - } - - __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3, - unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7, - unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11, - unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15) - { - v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); - } - - __forceinline explicit vuint(const __m512& f) { - v = _mm512_cvtps_epu32(f); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {} - __forceinline vuint(OneTy) : v(_mm512_set1_epi32(1)) {} - __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) { - _mm512_stream_si512((__m512i*)ptr,a); - } - - static __forceinline vuint16 loadu(const void* addr) - { - return _mm512_loadu_si512(addr); - } - - static __forceinline vuint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } - static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } - - static __forceinline vuint16 load(const vuint16* addr) { - return _mm512_load_si512(addr); - } - - static __forceinline vuint16 load(const unsigned int* addr) { - return _mm512_load_si512(addr); - } - - static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); } - - - static __forceinline void store(void* ptr, const vuint16& v) { - _mm512_store_si512(ptr,v); - } - - static __forceinline void storeu(void* ptr, const vuint16& v) { - _mm512_storeu_si512(ptr,v); - } - - static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) { - _mm512_mask_storeu_epi32(ptr,mask,f); - } - - static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) { - _mm512_mask_store_epi32(addr,mask,v2); - } - - /* pass by value to avoid compiler generating inefficient code */ - static __forceinline void storeu_compact(const vboolf16 mask, void* addr, const vuint16 reg) { - _mm512_mask_compressstoreu_epi32(addr,mask,reg); - } - - static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vuint16 reg) { - //_mm512_mask_compressstoreu_epi32(addr,mask,reg); - *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); - } - - static __forceinline vuint16 compact64bit(const vboolf16& mask, vuint16& v) { - return _mm512_mask_compress_epi64(v,mask,v); - } - - static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) { - return _mm512_mask_compress_epi32(v,mask,v); - } - - static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) { - return _mm512_mask_compress_epi32(a,mask,b); - } - - static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) { - return _mm512_mask_expand_epi32(b,mask,a); - } - - template - static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) { - return _mm512_i32gather_epi32(index,ptr,scale); - } - - template - static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) { - return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); - } - - template - static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) { - return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); - } - - template - static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) { - _mm512_i32scatter_epi32((int*)ptr,index,v,scale); - } - - template - static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) { - _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); - } - - static __forceinline vuint16 broadcast64bit(size_t v) { - return _mm512_set1_epi64(v); - } - - static __forceinline size_t extract64bit(const vuint16& v) - { - return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline unsigned int& operator [](size_t index) { assert(index < 16); return i[index]; } - __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; } - - __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } - __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } - - __forceinline vuint16 operator +(const vuint16& a) { return a; } - __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); } - __forceinline vuint16 operator +(const vuint16& a, unsigned int b) { return a + vuint16(b); } - __forceinline vuint16 operator +(unsigned int a, const vuint16& b) { return vuint16(a) + b; } - - __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); } - __forceinline vuint16 operator -(const vuint16& a, unsigned int b) { return a - vuint16(b); } - __forceinline vuint16 operator -(unsigned int a, const vuint16& b) { return vuint16(a) - b; } - - __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); } - __forceinline vuint16 operator *(const vuint16& a, unsigned int b) { return a * vuint16(b); } - __forceinline vuint16 operator *(unsigned int a, const vuint16& b) { return vuint16(a) * b; } - - __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); } - __forceinline vuint16 operator &(const vuint16& a, unsigned int b) { return a & vuint16(b); } - __forceinline vuint16 operator &(unsigned int a, const vuint16& b) { return vuint16(a) & b; } - - __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); } - __forceinline vuint16 operator |(const vuint16& a, unsigned int b) { return a | vuint16(b); } - __forceinline vuint16 operator |(unsigned int a, const vuint16& b) { return vuint16(a) | b; } - - __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); } - __forceinline vuint16 operator ^(const vuint16& a, unsigned int b) { return a ^ vuint16(b); } - __forceinline vuint16 operator ^(unsigned int a, const vuint16& b) { return vuint16(a) ^ b; } - - __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); } - __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); } - - __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); } - __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); } - - __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); } - __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); } - __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); } - - __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); } - __forceinline vuint16 min(const vuint16& a, unsigned int b) { return min(a,vuint16(b)); } - __forceinline vuint16 min(unsigned int a, const vuint16& b) { return min(vuint16(a),b); } - - __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); } - __forceinline vuint16 max(const vuint16& a, unsigned int b) { return max(a,vuint16(b)); } - __forceinline vuint16 max(unsigned int a, const vuint16& b) { return max(vuint16(a),b); } - - __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } - __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } - - __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } - __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; } - __forceinline vuint16& operator +=(vuint16& a, unsigned int b) { return a = a + b; } - - __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; } - __forceinline vuint16& operator -=(vuint16& a, unsigned int b) { return a = a - b; } - - __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; } - __forceinline vuint16& operator *=(vuint16& a, unsigned int b) { return a = a * b; } - - __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; } - __forceinline vuint16& operator &=(vuint16& a, unsigned int b) { return a = a & b; } - - __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; } - __forceinline vuint16& operator |=(vuint16& a, unsigned int b) { return a = a | b; } - - __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; } - __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { return a = a >> b; } - - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 operator ==(const vuint16& a, unsigned int b) { return a == vuint16(b); } - __forceinline vboolf16 operator ==(unsigned int a, const vuint16& b) { return vuint16(a) == b; } - - __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 operator !=(const vuint16& a, unsigned int b) { return a != vuint16(b); } - __forceinline vboolf16 operator !=(unsigned int a, const vuint16& b) { return vuint16(a) != b; } - - __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 operator < (const vuint16& a, unsigned int b) { return a < vuint16(b); } - __forceinline vboolf16 operator < (unsigned int a, const vuint16& b) { return vuint16(a) < b; } - - __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 operator >=(const vuint16& a, unsigned int b) { return a >= vuint16(b); } - __forceinline vboolf16 operator >=(unsigned int a, const vuint16& b) { return vuint16(a) >= b; } - - __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 operator > (const vuint16& a, unsigned int b) { return a > vuint16(b); } - __forceinline vboolf16 operator > (unsigned int a, const vuint16& b) { return vuint16(a) > b; } - - __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 operator <=(const vuint16& a, unsigned int b) { return a <= vuint16(b); } - __forceinline vboolf16 operator <=(unsigned int a, const vuint16& b) { return vuint16(a) <= b; } - - __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } - - - __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) { - return _mm512_mask_or_epi32(f,m,t,t); - } - - __forceinline void xchg(const vboolf16& m, vuint16& a, vuint16& b) { - const vuint16 c = a; a = select(m,b,a); b = select(m,c,b); - } - - __forceinline vboolf16 test(const vboolf16& m, const vuint16& a, const vuint16& b) { - return _mm512_mask_test_epi32_mask(m,a,b); - } - - __forceinline vboolf16 test(const vuint16& a, const vuint16& b) { - return _mm512_test_epi32_mask(a,b); - } - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vuint16 shuffle(const vuint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template - __forceinline vuint16 shuffle(const vuint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vuint16 shuffle4(const vuint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i))); - } - - template - __forceinline vuint16 shuffle4(const vuint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) { - return _mm512_alignr_epi32(a, b, i); - }; - - __forceinline unsigned int toScalar(const vuint16& v) { - return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint16 vreduce_min2(vuint16 x) { return min(x, shuffle<1,0,3,2>(x)); } - __forceinline vuint16 vreduce_min4(vuint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } - __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } - __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vuint16 vreduce_max2(vuint16 x) { return max(x, shuffle<1,0,3,2>(x)); } - __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } - __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } - __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } - - __forceinline vuint16 vreduce_and2(vuint16 x) { return x & shuffle<1,0,3,2>(x); } - __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } - __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } - __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } - - __forceinline vuint16 vreduce_or2(vuint16 x) { return x | shuffle<1,0,3,2>(x); } - __forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } - __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } - __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } - - __forceinline vuint16 vreduce_add2(vuint16 x) { return x + shuffle<1,0,3,2>(x); } - __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } - __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } - __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } - - __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); } - __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); } - __forceinline unsigned int reduce_and(const vuint16& v) { return toScalar(vreduce_and(v)); } - __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); } - __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); } - - //////////////////////////////////////////////////////////////////////////////// - /// Memory load and store operations - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint16 permute(vuint16 v, vuint16 index) { - return _mm512_permutexvar_epi32(index,v); - } - - __forceinline vuint16 reverse(const vuint16& a) { - return permute(a,vuint16(reverse_step)); - } - - __forceinline vuint16 prefix_sum(const vuint16& a) - { - const vuint16 z(zero); - vuint16 v = a; - v = v + align_shift_right<16-1>(v,z); - v = v + align_shift_right<16-2>(v,z); - v = v + align_shift_right<16-4>(v,z); - v = v + align_shift_right<16-8>(v,z); - return v; - } - - __forceinline vuint16 reverse_prefix_sum(const vuint16& a) - { - const vuint16 z(zero); - vuint16 v = a; - v = v + align_shift_right<1>(z,v); - v = v + align_shift_right<2>(z,v); - v = v + align_shift_right<4>(z,v); - v = v + align_shift_right<8>(z,v); - return v; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v) - { - cout << "<" << v[0]; - for (int i=1; i<16; i++) cout << ", " << v[i]; - cout << ">"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h deleted file mode 100644 index a3f393ebf2..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h +++ /dev/null @@ -1,499 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../math/math.h" - -namespace embree -{ - /* 4-wide SSE integer type */ - template<> - struct vuint<4> - { - ALIGNED_STRUCT_(16); - - typedef vboolf4 Bool; - typedef vuint4 Int; - typedef vfloat4 Float; - - enum { size = 4 }; // number of SIMD elements - union { __m128i v; unsigned int i[4]; }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint() {} - __forceinline vuint(const vuint4& a) { v = a.v; } - __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; } - - __forceinline vuint(const __m128i a) : v(a) {} - __forceinline operator const __m128i&() const { return v; } - __forceinline operator __m128i&() { return v; } - - - __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {} - -#if defined(__AVX512VL__) - __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {} -#endif - -#if defined(__AVX512VL__) - __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} -#else - __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint(ZeroTy) : v(_mm_setzero_si128()) {} - __forceinline vuint(OneTy) : v(_mm_set1_epi32(1)) {} - __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {} - __forceinline vuint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} - __forceinline vuint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } - __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } - static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } - - static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } - -#if defined(__AVX512VL__) - static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } - static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } -#elif defined(__AVX__) - static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } - static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } -#else - static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } - static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } - - static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } -#endif - -#if defined(__aarch64__) - static __forceinline vuint4 load(const uint8_t* ptr) { - return _mm_load4epu8_epi32(((__m128i*)ptr)); - } - static __forceinline vuint4 loadu(const uint8_t* ptr) { - return _mm_load4epu8_epi32(((__m128i*)ptr)); - } -#elif defined(__SSE4_1__) - static __forceinline vuint4 load(const uint8_t* ptr) { - return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); - } - - static __forceinline vuint4 loadu(const uint8_t* ptr) { - return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); - } - -#endif - - static __forceinline vuint4 load(const unsigned short* ptr) { -#if defined(__aarch64__) - return _mm_load4epu16_epi32(((__m128i*)ptr)); -#elif defined (__SSE4_1__) - return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); -#else - return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); -#endif - } - - static __forceinline void store_uint8(uint8_t* ptr, const vuint4& v) { -#if defined(__aarch64__) - uint32x4_t x = uint32x4_t(v.v); - uint16x4_t y = vqmovn_u32(x); - uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); - vst1_lane_u32((uint32_t *)ptr, uint32x2_t(z), 0); -#elif defined(__SSE4_1__) - __m128i x = v; - x = _mm_packus_epi32(x, x); - x = _mm_packus_epi16(x, x); - *(unsigned*)ptr = _mm_cvtsi128_si32(x); -#else - for (size_t i=0;i<4;i++) - ptr[i] = (uint8_t)v[i]; -#endif - } - - static __forceinline void store_uint8(unsigned short* ptr, const vuint4& v) { -#if defined(__aarch64__) - uint32x4_t x = (uint32x4_t)v.v; - uint16x4_t y = vqmovn_u32(x); - vst1_u16(ptr, y); -#else - for (size_t i=0;i<4;i++) - ptr[i] = (unsigned short)v[i]; -#endif - } - - static __forceinline vuint4 load_nt(void* ptr) { -#if (defined(__aarch64__)) || defined(__SSE4_1__) - return _mm_stream_load_si128((__m128i*)ptr); -#else - return _mm_load_si128((__m128i*)ptr); -#endif - } - - static __forceinline void store_nt(void* ptr, const vuint4& v) { -#if !defined(__aarch64__) && defined(__SSE4_1__) - _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); -#else - _mm_store_si128((__m128i*)ptr,v); -#endif - } - - template - static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm_i32gather_epi32((const int*)ptr, index, scale); -#else - return vuint4( - *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[3])); -#endif - } - - template - static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) { - vuint4 r = zero; -#if defined(__AVX512VL__) - return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); -#else - if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); - return r; -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; } - __forceinline unsigned int& operator [](size_t index) { assert(index < 4); return i[index]; } - - friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { -#if defined(__AVX512VL__) - return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); -#elif defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); -#else - return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); -#endif - } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } -#else - __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } -#endif - - __forceinline vuint4 operator +(const vuint4& a) { return a; } - __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); } - __forceinline vuint4 operator +(const vuint4& a, unsigned int b) { return a + vuint4(b); } - __forceinline vuint4 operator +(unsigned int a, const vuint4& b) { return vuint4(a) + b; } - - __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); } - __forceinline vuint4 operator -(const vuint4& a, unsigned int b) { return a - vuint4(b); } - __forceinline vuint4 operator -(unsigned int a, const vuint4& b) { return vuint4(a) - b; } - -//#if defined(__SSE4_1__) -// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); } -//#else -// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } -//#endif -// __forceinline vuint4 operator *(const vuint4& a, unsigned int b) { return a * vuint4(b); } -// __forceinline vuint4 operator *(unsigned int a, const vuint4& b) { return vuint4(a) * b; } - - __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); } - __forceinline vuint4 operator &(const vuint4& a, unsigned int b) { return a & vuint4(b); } - __forceinline vuint4 operator &(unsigned int a, const vuint4& b) { return vuint4(a) & b; } - - __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); } - __forceinline vuint4 operator |(const vuint4& a, unsigned int b) { return a | vuint4(b); } - __forceinline vuint4 operator |(unsigned int a, const vuint4& b) { return vuint4(a) | b; } - - __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); } - __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); } - __forceinline vuint4 operator ^(unsigned int a, const vuint4& b) { return vuint4(a) ^ b; } - - __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); } - __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); } - - __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); } - __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); } - __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; } - __forceinline vuint4& operator +=(vuint4& a, unsigned int b) { return a = a + b; } - - __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; } - __forceinline vuint4& operator -=(vuint4& a, unsigned int b) { return a = a - b; } - -//#if defined(__SSE4_1__) -// __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; } -// __forceinline vuint4& operator *=(vuint4& a, unsigned int b) { return a = a * b; } -//#endif - - __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; } - __forceinline vuint4& operator &=(vuint4& a, unsigned int b) { return a = a & b; } - - __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; } - __forceinline vuint4& operator |=(vuint4& a, unsigned int b) { return a = a | b; } - - __forceinline vuint4& operator <<=(vuint4& a, unsigned int b) { return a = a << b; } - __forceinline vuint4& operator >>=(vuint4& a, unsigned int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } -#else - __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } - __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } - //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); } - //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a < b); } - //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); } - //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a > b); } -#endif - - __forceinline vboolf4 operator ==(const vuint4& a, unsigned int b) { return a == vuint4(b); } - __forceinline vboolf4 operator ==(unsigned int a, const vuint4& b) { return vuint4(a) == b; } - - __forceinline vboolf4 operator !=(const vuint4& a, unsigned int b) { return a != vuint4(b); } - __forceinline vboolf4 operator !=(unsigned int a, const vuint4& b) { return vuint4(a) != b; } - - //__forceinline vboolf4 operator < (const vuint4& a, unsigned int b) { return a < vuint4(b); } - //__forceinline vboolf4 operator < (unsigned int a, const vuint4& b) { return vuint4(a) < b; } - - //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int b) { return a >= vuint4(b); } - //__forceinline vboolf4 operator >=(unsigned int a, const vuint4& b) { return vuint4(a) >= b; } - - //__forceinline vboolf4 operator > (const vuint4& a, unsigned int b) { return a > vuint4(b); } - //__forceinline vboolf4 operator > (unsigned int a, const vuint4& b) { return vuint4(a) > b; } - - //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int b) { return a <= vuint4(b); } - //__forceinline vboolf4 operator <=(unsigned int a, const vuint4& b) { return vuint4(a) <= b; } - - __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; } - __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; } - //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a < b; } - //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; } - //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a > b; } - //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } - //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } - //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } - //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } - //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); } - __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); } - //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a < b); } - //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); } - //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a > b); } - //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); } -#endif - - template - __forceinline vuint4 select(const vuint4& t, const vuint4& f) { -#if defined(__SSE4_1__) - return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); -#else - return select(vboolf4(mask), t, f); -#endif - } - -/*#if defined(__SSE4_1__) - __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); } - __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); } - -#else - __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); } - __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); } -#endif - - __forceinline vuint4 min(const vuint4& a, unsigned int b) { return min(a,vuint4(b)); } - __forceinline vuint4 min(unsigned int a, const vuint4& b) { return min(vuint4(a),b); } - __forceinline vuint4 max(const vuint4& a, unsigned int b) { return max(a,vuint4(b)); } - __forceinline vuint4 max(unsigned int a, const vuint4& b) { return max(vuint4(a),b); }*/ - - //////////////////////////////////////////////////////////////////////////////// - // Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } - __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } - -#if defined(__aarch64__) - template - __forceinline vuint4 shuffle(const vuint4& v) { - return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); - } - template - __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { - return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); - } -#else - template - __forceinline vuint4 shuffle(const vuint4& v) { - return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); - } - template - __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { - return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } -#endif -#if defined(__SSE3__) - template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } -#endif - - template - __forceinline vuint4 shuffle(const vuint4& v) { - return shuffle(v); - } - -#if defined(__aarch64__) - template __forceinline unsigned int extract(const vuint4& b); - template __forceinline vuint4 insert(const vuint4& a, const unsigned b); -#elif defined(__SSE4_1__) - template __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } - template __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } -#else - template __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; } - template __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } -#endif - -#if defined(__aarch64__) - template<> __forceinline unsigned int extract<0>(const vuint4& b) { - return b[0]; - } - template<> __forceinline unsigned int extract<1>(const vuint4& b) { - return b[1]; - } - template<> __forceinline unsigned int extract<2>(const vuint4& b) { - return b[2]; - } - template<> __forceinline unsigned int extract<3>(const vuint4& b) { - return b[3]; - } - - template<> __forceinline vuint4 insert<0>(const vuint4& a, unsigned b){ - vuint4 c = a; - c[0] = b; - return c; - } - template<> __forceinline vuint4 insert<1>(const vuint4& a, unsigned b){ - vuint4 c = a; - c[1] = b; - return c; - } - template<> __forceinline vuint4 insert<2>(const vuint4& a, unsigned b){ - vuint4 c = a; - c[2] = b; - return c; - } - template<> __forceinline vuint4 insert<3>(const vuint4& a, unsigned b){ - vuint4 c = a; - c[3] = b; - return c; - } - - __forceinline unsigned int toScalar(const vuint4& v) { - return v[0]; - } -#else - template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } - - __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - -#if 0 -#if defined(__SSE4_1__) - - __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } - __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } - __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } - - __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); } - __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); } - __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); } - - __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); } - __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); } - - //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - -#else - - __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); } - __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); } - __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; } - -#endif -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; - } -} - diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h deleted file mode 100644 index d4e86ae92d..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vuint8_avx.h +++ /dev/null @@ -1,379 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX integer type */ - template<> - struct vuint<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vuint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256i v; - struct { __m128i vl,vh; }; - unsigned int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint() {} - __forceinline vuint(const vuint8& a) { v = a.v; } - __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } - - __forceinline vuint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } - - __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} - - __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} - __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} - __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} - - __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} - __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {} - __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} - __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } - static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } - - static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - - static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } - -#if !defined(__aarch64__) - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } -#else - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } -#endif - static __forceinline void store_nt(void* ptr, const vuint8& v) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); - } - - static __forceinline vuint8 load(const uint8_t* ptr) { - vuint4 il = vuint4::load(ptr+0); - vuint4 ih = vuint4::load(ptr+4); - return vuint8(il,ih); - } - - static __forceinline vuint8 loadu(const uint8_t* ptr) { - vuint4 il = vuint4::loadu(ptr+0); - vuint4 ih = vuint4::loadu(ptr+4); - return vuint8(il,ih); - } - - static __forceinline vuint8 load(const unsigned short* ptr) { - vuint4 il = vuint4::load(ptr+0); - vuint4 ih = vuint4::load(ptr+4); - return vuint8(il,ih); - } - - static __forceinline vuint8 loadu(const unsigned short* ptr) { - vuint4 il = vuint4::loadu(ptr+0); - vuint4 ih = vuint4::loadu(ptr+4); - return vuint8(il,ih); - } - - static __forceinline void store(uint8_t* ptr, const vuint8& i) { - vuint4 il(i.vl); - vuint4 ih(i.vh); - vuint4::store(ptr + 0,il); - vuint4::store(ptr + 4,ih); - } - - static __forceinline void store(unsigned short* ptr, const vuint8& v) { - for (size_t i=0;i<8;i++) - ptr[i] = (unsigned short)v[i]; - } - - template - static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { - return vuint8( - *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[3]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[4]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[5]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[6]), - *(unsigned int*)(((int8_t*)ptr)+scale*index[7])); - } - - template - static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) { - vuint8 r = zero; - if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); - if (likely(mask[4])) r[4] = *(unsigned int*)(((int8_t*)ptr)+scale*index[4]); - if (likely(mask[5])) r[5] = *(unsigned int*)(((int8_t*)ptr)+scale*index[5]); - if (likely(mask[6])) r[6] = *(unsigned int*)(((int8_t*)ptr)+scale*index[6]); - if (likely(mask[7])) r[7] = *(unsigned int*)(((int8_t*)ptr)+scale*index[7]); - return r; - } - - template - static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) - { - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; - } - - template - static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) - { - if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; - } - - - static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } - __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } - - __forceinline vuint8 operator +(const vuint8& a) { return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } - __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } - __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } - - __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } - __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } - __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } - - //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); } - //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } - //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } - - __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } - __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } - - __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } - __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } - - __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } - __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } - - __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } - __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srai_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); } - - __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } - __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } - __forceinline vuint8 srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } - - __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } - __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } - __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } - - __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } - __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } - __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } - __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } - - __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } - __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } - - //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } - //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } - - __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } - __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } - - __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } - __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } - - __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; } - __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), - _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } - __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } - __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } - - __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } - __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } - __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } - - //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)), - // _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); } - //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } - //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } - - //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } - //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } - //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } - - //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)), - // _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); } - //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } - //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } - - //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } - //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } - //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } - - __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } - __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } - - __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } - __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } - - __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); - } - - __forceinline vuint8 notand(const vboolf8& m, const vuint8& f) { - return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); - } - - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - - template - __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template - __forceinline vuint8 shuffle4(const vuint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } - - __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } - template __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } - template __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } - - __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } - - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } - //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } - //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } - //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } - - //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } - //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } - - //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h deleted file mode 100644 index b2a965448d..0000000000 --- a/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h +++ /dev/null @@ -1,439 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* 8-wide AVX integer type */ - template<> - struct vuint<8> - { - ALIGNED_STRUCT_(32); - - typedef vboolf8 Bool; - typedef vuint8 Int; - typedef vfloat8 Float; - - enum { size = 8 }; // number of SIMD elements - union { // data - __m256i v; - unsigned int i[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint() {} - __forceinline vuint(const vuint8& a) { v = a.v; } - __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } - - __forceinline vuint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } - - __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} - - __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} - __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} - __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} - __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} - - __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} - -#if defined(__AVX512VL__) - __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} -#else - __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} - __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} - __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} - __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} - __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} - __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - - static __forceinline vuint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vuint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } - static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } - - static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } - static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } - - static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } - -#if defined(__AVX512VL__) - - static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) { - return _mm256_mask_compress_epi32(v, mask, v); - } - static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) { - return _mm256_mask_compress_epi32(a, mask, b); - } - - static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } - static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } -#else - static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } - static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } - - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } -#endif - - static __forceinline vuint8 load_nt(void* ptr) { - return _mm256_stream_load_si256((__m256i*)ptr); - } - - static __forceinline void store_nt(void* ptr, const vuint8& v) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); - } - - static __forceinline void store(uint8_t* ptr, const vuint8& i) - { - for (size_t j=0; j<8; j++) - ptr[j] = i[j]; - } - - static __forceinline void store(unsigned short* ptr, const vuint8& v) { - for (size_t i=0;i<8;i++) - ptr[i] = (unsigned short)v[i]; - } - - template - static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) { - return _mm256_i32gather_epi32((const int*) ptr, index, scale); - } - - template - static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) { - vuint8 r = zero; -#if defined(__AVX512VL__) - return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale); -#else - return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale); -#endif - } - - template - static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) - { -#if defined(__AVX512VL__) - _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); -#else - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[0]) = v[0]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[1]) = v[1]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[2]) = v[2]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[3]) = v[3]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[4]) = v[4]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[5]) = v[5]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[6]) = v[6]; - *(unsigned int*)(((int8_t*)ptr) + scale * ofs[7]) = v[7]; -#endif - } - - template - static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) - { -#if defined(__AVX512VL__) - _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); -#else - if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; -#endif - } - - static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } - __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Unary Operators - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } -#else - __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } -#endif - - __forceinline vuint8 operator +(const vuint8& a) { return a; } - - //////////////////////////////////////////////////////////////////////////////// - /// Binary Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); } - __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } - __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } - - __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); } - __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } - __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } - - //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); } - //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } - //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } - - __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); } - __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } - __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } - - __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); } - __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } - __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } - - __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); } - __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } - __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } - - __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); } - __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } - - __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } - __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); } - - __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); } - __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } - __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); } - - __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); } - __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } - __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); } - - __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } - __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } - __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } - - __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } - __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } - __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } - - //////////////////////////////////////////////////////////////////////////////// - /// Assignment Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } - __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } - - __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } - __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } - - //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } - //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } - - __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } - __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } - - __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } - __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } - - __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; } - __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; } - - //////////////////////////////////////////////////////////////////////////////// - /// Comparison Operators + Select - //////////////////////////////////////////////////////////////////////////////// - -#if defined(__AVX512VL__) - __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { - return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); - } -#else - __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } - __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } - //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); } - //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } - //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); } - //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } - - __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); - } -#endif - - template - __forceinline vuint8 select(const vuint8& t, const vuint8& f) { - return _mm256_blend_epi32(f, t, mask); - } - - __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } - __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } - - __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } - __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } - - //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } - //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } - - //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } - //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } - - //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } - //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } - - //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } - //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } - - __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } - __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } - //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a < b; } - //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; } - //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a > b; } - //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; } - -#if defined(__AVX512VL__) - __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } -#else - __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } - __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } - //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a < b); } - //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); } - //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a > b); } - //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); } -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Movement/Shifting/Shuffling Functions - //////////////////////////////////////////////////////////////////////////////// - - __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); } - __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); } - - template - __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); - } - - template - __forceinline vuint8 shuffle4(const vuint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); - } - - template - __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template - __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); - } - - template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } - - __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } - - template __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } - template __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } - - __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } - -#if !defined(__aarch64__) - - __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { - return _mm256_permutevar8x32_epi32(v, index); - } - - __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { - return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); - } - - template - __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) { -#if defined(__AVX512VL__) - return _mm256_alignr_epi32(a, b, i); -#else - return _mm256_alignr_epi8(a, b, 4*i); -#endif - } - -#endif - - - //////////////////////////////////////////////////////////////////////////////// - /// Reductions - //////////////////////////////////////////////////////////////////////////////// - - //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } - //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } - //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } - - //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } - //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } - //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } - - __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } - __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } - __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } - - //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } - //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } - __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } - - //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } - //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } - - //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } - //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } - - __forceinline vuint8 assign(const vuint4& a) { return _mm256_castsi128_si256(a); } - - //////////////////////////////////////////////////////////////////////////////// - /// Output Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { - return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; - } -} diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp deleted file mode 100644 index 12f143f131..0000000000 --- a/thirdparty/embree-aarch64/common/sys/alloc.cpp +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "alloc.h" -#include "intrinsics.h" -#include "sysinfo.h" -#include "mutex.h" - -//////////////////////////////////////////////////////////////////////////////// -/// All Platforms -//////////////////////////////////////////////////////////////////////////////// - -namespace embree -{ - void* alignedMalloc(size_t size, size_t align) - { - if (size == 0) - return nullptr; - - assert((align & (align-1)) == 0); - void* ptr = _mm_malloc(size,align); - - if (size != 0 && ptr == nullptr) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - - return ptr; - } - - void alignedFree(void* ptr) - { - if (ptr) - _mm_free(ptr); - } - - static bool huge_pages_enabled = false; - static MutexSys os_init_mutex; - - __forceinline bool isHugePageCandidate(const size_t bytes) - { - if (!huge_pages_enabled) - return false; - - /* use huge pages only when memory overhead is low */ - const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1); - return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead - } -} - -//////////////////////////////////////////////////////////////////////////////// -/// Windows Platform -//////////////////////////////////////////////////////////////////////////////// - -#ifdef _WIN32 - -#define WIN32_LEAN_AND_MEAN -#include -#include - -namespace embree -{ - bool win_enable_selockmemoryprivilege (bool verbose) - { - HANDLE hToken; - if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) { - if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; - return false; - } - - TOKEN_PRIVILEGES tp; - tp.PrivilegeCount = 1; - tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - - if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) { - if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; - return false; - } - - SetLastError(ERROR_SUCCESS); - if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) { - if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl; - return false; - } - - if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) { - if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl; - return false; - } - - return true; - } - - bool os_init(bool hugepages, bool verbose) - { - Lock lock(os_init_mutex); - - if (!hugepages) { - huge_pages_enabled = false; - return true; - } - - if (GetLargePageMinimum() != PAGE_SIZE_2M) { - huge_pages_enabled = false; - return false; - } - - huge_pages_enabled = true; - return true; - } - - void* os_malloc(size_t bytes, bool& hugepages) - { - if (bytes == 0) { - hugepages = false; - return nullptr; - } - - /* try direct huge page allocation first */ - if (isHugePageCandidate(bytes)) - { - int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES; - char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); - if (ptr != nullptr) { - hugepages = true; - return ptr; - } - } - - /* fall back to 4k pages */ - int flags = MEM_COMMIT | MEM_RESERVE; - char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); - // -- GODOT start -- - // if (ptr == nullptr) throw std::bad_alloc(); - if (ptr == nullptr) abort(); - // -- GODOT end -- - hugepages = false; - return ptr; - } - - size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) - { - if (hugepages) // decommitting huge pages seems not to work under Windows - return bytesOld; - - const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; - bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); - bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); - if (bytesNew >= bytesOld) - return bytesOld; - - if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - - return bytesNew; - } - - void os_free(void* ptr, size_t bytes, bool hugepages) - { - if (bytes == 0) - return; - - if (!VirtualFree(ptr,0,MEM_RELEASE)) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - } - - void os_advise(void *ptr, size_t bytes) - { - } -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__UNIX__) - -#include -#include -#include -#include -#include - -#if defined(__MACOSX__) -#include -#endif - -namespace embree -{ - bool os_init(bool hugepages, bool verbose) - { - Lock lock(os_init_mutex); - - if (!hugepages) { - huge_pages_enabled = false; - return true; - } - -#if defined(__LINUX__) - - int hugepagesize = 0; - - std::ifstream file; - file.open("/proc/meminfo",std::ios::in); - if (!file.is_open()) { - if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" << std::endl; - huge_pages_enabled = false; - return false; - } - - std::string line; - while (getline(file,line)) - { - std::stringstream sline(line); - while (!sline.eof() && sline.peek() == ' ') sline.ignore(); - std::string tag; getline(sline,tag,' '); - while (!sline.eof() && sline.peek() == ' ') sline.ignore(); - std::string val; getline(sline,val,' '); - while (!sline.eof() && sline.peek() == ' ') sline.ignore(); - std::string unit; getline(sline,unit,' '); - if (tag == "Hugepagesize:" && unit == "kB") { - hugepagesize = std::stoi(val)*1024; - break; - } - } - - if (hugepagesize != PAGE_SIZE_2M) - { - if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl; - huge_pages_enabled = false; - return false; - } -#endif - - huge_pages_enabled = true; - return true; - } - - void* os_malloc(size_t bytes, bool& hugepages) - { - if (bytes == 0) { - hugepages = false; - return nullptr; - } - - /* try direct huge page allocation first */ - if (isHugePageCandidate(bytes)) - { -#if defined(__MACOSX__) - void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); - if (ptr != MAP_FAILED) { - hugepages = true; - return ptr; - } -#elif defined(MAP_HUGETLB) - void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); - if (ptr != MAP_FAILED) { - hugepages = true; - return ptr; - } -#endif - } - - /* fallback to 4k pages */ - void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); - // -- GODOT start -- - // if (ptr == MAP_FAILED) throw std::bad_alloc(); - if (ptr == MAP_FAILED) abort(); - // -- GODOT end -- - hugepages = false; - - /* advise huge page hint for THP */ - os_advise(ptr,bytes); - return ptr; - } - - size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) - { - const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; - bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); - bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); - if (bytesNew >= bytesOld) - return bytesOld; - - if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - - return bytesNew; - } - - void os_free(void* ptr, size_t bytes, bool hugepages) - { - if (bytes == 0) - return; - - /* for hugepages we need to also align the size */ - const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; - bytes = (bytes+pageSize-1) & ~(pageSize-1); - if (munmap(ptr,bytes) == -1) - // -- GODOT start -- - // throw std::bad_alloc(); - abort(); - // -- GODOT end -- - } - - /* hint for transparent huge pages (THP) */ - void os_advise(void* pptr, size_t bytes) - { -#if defined(MADV_HUGEPAGE) - madvise(pptr,bytes,MADV_HUGEPAGE); -#endif - } -} - -#endif diff --git a/thirdparty/embree-aarch64/common/sys/alloc.h b/thirdparty/embree-aarch64/common/sys/alloc.h deleted file mode 100644 index 5898ecda70..0000000000 --- a/thirdparty/embree-aarch64/common/sys/alloc.h +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include -#include - -namespace embree -{ -#define ALIGNED_STRUCT_(align) \ - void* operator new(size_t size) { return alignedMalloc(size,align); } \ - void operator delete(void* ptr) { alignedFree(ptr); } \ - void* operator new[](size_t size) { return alignedMalloc(size,align); } \ - void operator delete[](void* ptr) { alignedFree(ptr); } - -#define ALIGNED_CLASS_(align) \ - public: \ - ALIGNED_STRUCT_(align) \ - private: - - /*! aligned allocation */ - void* alignedMalloc(size_t size, size_t align); - void alignedFree(void* ptr); - - /*! allocator that performs aligned allocations */ - template - struct aligned_allocator - { - typedef T value_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - __forceinline pointer allocate( size_type n ) { - return (pointer) alignedMalloc(n*sizeof(value_type),alignment); - } - - __forceinline void deallocate( pointer p, size_type n ) { - return alignedFree(p); - } - - __forceinline void construct( pointer p, const_reference val ) { - new (p) T(val); - } - - __forceinline void destroy( pointer p ) { - p->~T(); - } - }; - - /*! allocates pages directly from OS */ - bool win_enable_selockmemoryprivilege(bool verbose); - bool os_init(bool hugepages, bool verbose); - void* os_malloc (size_t bytes, bool& hugepages); - size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages); - void os_free (void* ptr, size_t bytes, bool hugepages); - void os_advise (void* ptr, size_t bytes); - - /*! allocator that performs OS allocations */ - template - struct os_allocator - { - typedef T value_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - __forceinline os_allocator () - : hugepages(false) {} - - __forceinline pointer allocate( size_type n ) { - return (pointer) os_malloc(n*sizeof(value_type),hugepages); - } - - __forceinline void deallocate( pointer p, size_type n ) { - return os_free(p,n*sizeof(value_type),hugepages); - } - - __forceinline void construct( pointer p, const_reference val ) { - new (p) T(val); - } - - __forceinline void destroy( pointer p ) { - p->~T(); - } - - bool hugepages; - }; - - /*! allocator for IDs */ - template - struct IDPool - { - typedef T value_type; - - IDPool () - : nextID(0) {} - - T allocate() - { - /* return ID from list */ - if (!IDs.empty()) - { - T id = *IDs.begin(); - IDs.erase(IDs.begin()); - return id; - } - - /* allocate new ID */ - else - { - if (size_t(nextID)+1 > max_id) - return -1; - - return nextID++; - } - } - - /* adds an ID provided by the user */ - bool add(T id) - { - if (id > max_id) - return false; - - /* check if ID should be in IDs set */ - if (id < nextID) { - auto p = IDs.find(id); - if (p == IDs.end()) return false; - IDs.erase(p); - return true; - } - - /* otherwise increase ID set */ - else - { - for (T i=nextID; i IDs; //!< stores deallocated IDs to be reused - T nextID; //!< next ID to use when IDs vector is empty - }; -} - diff --git a/thirdparty/embree-aarch64/common/sys/array.h b/thirdparty/embree-aarch64/common/sys/array.h deleted file mode 100644 index 77722a39f6..0000000000 --- a/thirdparty/embree-aarch64/common/sys/array.h +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include "alloc.h" - -namespace embree -{ - /*! static array with static size */ - template - class array_t - { - public: - - /********************** Iterators ****************************/ - - __forceinline T* begin() const { return items; }; - __forceinline T* end () const { return items+N; }; - - - /********************** Capacity ****************************/ - - __forceinline bool empty () const { return N == 0; } - __forceinline size_t size () const { return N; } - __forceinline size_t max_size () const { return N; } - - - /******************** Element access **************************/ - - __forceinline T& operator[](size_t i) { assert(i < N); return items[i]; } - __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; } - - __forceinline T& at(size_t i) { assert(i < N); return items[i]; } - __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; } - - __forceinline T& front() const { assert(N > 0); return items[0]; }; - __forceinline T& back () const { assert(N > 0); return items[N-1]; }; - - __forceinline T* data() { return items; }; - __forceinline const T* data() const { return items; }; - - private: - T items[N]; - }; - - /*! static array with dynamic size */ - template - class darray_t - { - public: - - __forceinline darray_t () : M(0) {} - - __forceinline darray_t (const T& v) : M(0) { - for (size_t i=0; i 0); return items[0]; }; - __forceinline T& back () const { assert(M > 0); return items[M-1]; }; - - __forceinline T* data() { return items; }; - __forceinline const T* data() const { return items; }; - - private: - size_t M; - T items[N]; - }; - - /*! dynamic sized array that is allocated on the stack */ -#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray Name(N) - template - struct __aligned(64) StackArray - { - __forceinline StackArray (const size_t N) - : N(N) - { - if (N*sizeof(Ty) <= max_stack_bytes) - data = &arr[0]; - else - data = (Ty*) alignedMalloc(N*sizeof(Ty),64); - } - - __forceinline ~StackArray () { - if (data != &arr[0]) alignedFree(data); - } - - __forceinline operator Ty* () { return data; } - __forceinline operator const Ty* () const { return data; } - - __forceinline Ty& operator[](const int i) { assert(i>=0 && i=0 && i - struct __aligned(64) DynamicStackArray - { - __forceinline DynamicStackArray () - : data(&arr[0]) {} - - __forceinline ~DynamicStackArray () - { - if (!isStackAllocated()) - delete[] data; - } - - __forceinline bool isStackAllocated() const { - return data == &arr[0]; - } - - __forceinline size_t size() const - { - if (isStackAllocated()) return max_stack_elements; - else return max_total_elements; - } - - __forceinline void resize(size_t M) - { - assert(M <= max_total_elements); - if (likely(M <= max_stack_elements)) return; - if (likely(!isStackAllocated())) return; - - data = new Ty[max_total_elements]; - - for (size_t i=0; i=0 && ioperator[] (i) = other[i]; - } - - DynamicStackArray& operator= (const DynamicStackArray& other) - { - for (size_t i=0; ioperator[] (i) = other[i]; - - return *this; - } - - private: - Ty arr[max_stack_elements]; - Ty* data; - }; -} diff --git a/thirdparty/embree-aarch64/common/sys/atomic.h b/thirdparty/embree-aarch64/common/sys/atomic.h deleted file mode 100644 index ebfb8552c3..0000000000 --- a/thirdparty/embree-aarch64/common/sys/atomic.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include "intrinsics.h" - -namespace embree -{ -/* compiler memory barriers */ -#if defined(__INTEL_COMPILER) -//#define __memory_barrier() __memory_barrier() -#elif defined(__GNUC__) || defined(__clang__) -# define __memory_barrier() asm volatile("" ::: "memory") -#elif defined(_MSC_VER) -# define __memory_barrier() _ReadWriteBarrier() -#endif - - template - struct atomic : public std::atomic - { - atomic () {} - - atomic (const T& a) - : std::atomic(a) {} - - atomic (const atomic& a) { - this->store(a.load()); - } - - atomic& operator=(const atomic& other) { - this->store(other.load()); - return *this; - } - }; - - template - __forceinline void atomic_min(std::atomic& aref, const T& bref) - { - const T b = bref.load(); - while (true) { - T a = aref.load(); - if (a <= b) break; - if (aref.compare_exchange_strong(a,b)) break; - } - } - - template - __forceinline void atomic_max(std::atomic& aref, const T& bref) - { - const T b = bref.load(); - while (true) { - T a = aref.load(); - if (a >= b) break; - if (aref.compare_exchange_strong(a,b)) break; - } - } -} diff --git a/thirdparty/embree-aarch64/common/sys/barrier.cpp b/thirdparty/embree-aarch64/common/sys/barrier.cpp deleted file mode 100644 index 0061d18db2..0000000000 --- a/thirdparty/embree-aarch64/common/sys/barrier.cpp +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "barrier.h" -#include "condition.h" -#include "regression.h" -#include "thread.h" - -#if defined (__WIN32__) - -#define WIN32_LEAN_AND_MEAN -#include - -namespace embree -{ - struct BarrierSysImplementation - { - __forceinline BarrierSysImplementation (size_t N) - : i(0), enterCount(0), exitCount(0), barrierSize(0) - { - events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr); - events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr); - init(N); - } - - __forceinline ~BarrierSysImplementation () - { - CloseHandle(events[0]); - CloseHandle(events[1]); - } - - __forceinline void init(size_t N) - { - barrierSize = N; - enterCount.store(N); - exitCount.store(N); - } - - __forceinline void wait() - { - /* every thread entering the barrier decrements this count */ - size_t i0 = i; - size_t cnt0 = enterCount--; - - /* all threads except the last one are wait in the barrier */ - if (cnt0 > 1) - { - if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0) - THROW_RUNTIME_ERROR("WaitForSingleObjects failed"); - } - - /* the last thread starts all threads waiting at the barrier */ - else - { - i = 1-i; - enterCount.store(barrierSize); - if (SetEvent(events[i0]) == 0) - THROW_RUNTIME_ERROR("SetEvent failed"); - } - - /* every thread leaving the barrier decrements this count */ - size_t cnt1 = exitCount--; - - /* the last thread that left the barrier resets the event again */ - if (cnt1 == 1) - { - exitCount.store(barrierSize); - if (ResetEvent(events[i0]) == 0) - THROW_RUNTIME_ERROR("ResetEvent failed"); - } - } - - public: - HANDLE events[2]; - atomic i; - atomic enterCount; - atomic exitCount; - size_t barrierSize; - }; -} - -#else - -namespace embree -{ - struct BarrierSysImplementation - { - __forceinline BarrierSysImplementation (size_t N) - : count(0), barrierSize(0) - { - init(N); - } - - __forceinline void init(size_t N) - { - assert(count == 0); - count = 0; - barrierSize = N; - } - - __forceinline void wait() - { - mutex.lock(); - count++; - - if (count == barrierSize) { - count = 0; - cond.notify_all(); - mutex.unlock(); - return; - } - - cond.wait(mutex); - mutex.unlock(); - return; - } - - public: - MutexSys mutex; - ConditionSys cond; - volatile size_t count; - volatile size_t barrierSize; - }; -} - -#endif - -namespace embree -{ - BarrierSys::BarrierSys (size_t N) { - opaque = new BarrierSysImplementation(N); - } - - BarrierSys::~BarrierSys () { - delete (BarrierSysImplementation*) opaque; - } - - void BarrierSys::init(size_t count) { - ((BarrierSysImplementation*) opaque)->init(count); - } - - void BarrierSys::wait() { - ((BarrierSysImplementation*) opaque)->wait(); - } - - LinearBarrierActive::LinearBarrierActive (size_t N) - : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0) - { - if (N == 0) N = getNumberOfLogicalThreads(); - init(N); - } - - LinearBarrierActive::~LinearBarrierActive() - { - delete[] count0; - delete[] count1; - } - - void LinearBarrierActive::init(size_t N) - { - if (threadCount != N) { - threadCount = N; - if (count0) delete[] count0; count0 = new unsigned char[N]; - if (count1) delete[] count1; count1 = new unsigned char[N]; - } - mode = 0; - flag0 = 0; - flag1 = 0; - for (size_t i=0; i threadID; - std::atomic numFailed; - std::vector threadResults; - - barrier_sys_regression_test() - : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0) - { - registerRegressionTest(this); - } - - static void thread_alloc(barrier_sys_regression_test* This) - { - size_t tid = This->threadID++; - for (size_t j=0; j<1000; j++) - { - This->barrier.wait(); - This->threadResults[tid] = tid; - This->barrier.wait(); - } - } - - bool run () - { - threadID.store(0); - numFailed.store(0); - - size_t numThreads = getNumberOfLogicalThreads(); - threadResults.resize(numThreads); - barrier.init(numThreads+1); - - /* create threads */ - std::vector threads; - for (size_t i=0; i cntr; - }; - - /*! fast active barrier that does not require initialization to some number of threads */ - struct BarrierActiveAutoReset - { - public: - BarrierActiveAutoReset () - : cntr0(0), cntr1(0) {} - - void wait (size_t threadCount) - { - cntr0.fetch_add(1); - while (cntr0 != threadCount) pause_cpu(); - cntr1.fetch_add(1); - while (cntr1 != threadCount) pause_cpu(); - cntr0.fetch_add(-1); - while (cntr0 != 0) pause_cpu(); - cntr1.fetch_add(-1); - while (cntr1 != 0) pause_cpu(); - } - - private: - std::atomic cntr0; - std::atomic cntr1; - }; - - class LinearBarrierActive - { - public: - - /*! construction and destruction */ - LinearBarrierActive (size_t threadCount = 0); - ~LinearBarrierActive(); - - private: - /*! class in non-copyable */ - LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement - LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement - - public: - /*! intializes the barrier with some number of threads */ - void init(size_t threadCount); - - /*! thread with threadIndex waits in the barrier */ - void wait (const size_t threadIndex); - - private: - volatile unsigned char* count0; - volatile unsigned char* count1; - volatile unsigned int mode; - volatile unsigned int flag0; - volatile unsigned int flag1; - volatile size_t threadCount; - }; -} - diff --git a/thirdparty/embree-aarch64/common/sys/condition.cpp b/thirdparty/embree-aarch64/common/sys/condition.cpp deleted file mode 100644 index 0e7ca7af39..0000000000 --- a/thirdparty/embree-aarch64/common/sys/condition.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "condition.h" - -#if defined(__WIN32__) && !defined(PTHREADS_WIN32) - -#define WIN32_LEAN_AND_MEAN -#include - -namespace embree -{ - struct ConditionImplementation - { - __forceinline ConditionImplementation () { - InitializeConditionVariable(&cond); - } - - __forceinline ~ConditionImplementation () { - } - - __forceinline void wait(MutexSys& mutex_in) { - SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE); - } - - __forceinline void notify_all() { - WakeAllConditionVariable(&cond); - } - - public: - CONDITION_VARIABLE cond; - }; -} -#endif - -#if defined(__UNIX__) || defined(PTHREADS_WIN32) -#include -namespace embree -{ - struct ConditionImplementation - { - __forceinline ConditionImplementation () { - pthread_cond_init(&cond,nullptr); - } - - __forceinline ~ConditionImplementation() { - pthread_cond_destroy(&cond); - } - - __forceinline void wait(MutexSys& mutex) { - pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex); - } - - __forceinline void notify_all() { - pthread_cond_broadcast(&cond); - } - - public: - pthread_cond_t cond; - }; -} -#endif - -namespace embree -{ - ConditionSys::ConditionSys () { - cond = new ConditionImplementation; - } - - ConditionSys::~ConditionSys() { - delete (ConditionImplementation*) cond; - } - - void ConditionSys::wait(MutexSys& mutex) { - ((ConditionImplementation*) cond)->wait(mutex); - } - - void ConditionSys::notify_all() { - ((ConditionImplementation*) cond)->notify_all(); - } -} diff --git a/thirdparty/embree-aarch64/common/sys/condition.h b/thirdparty/embree-aarch64/common/sys/condition.h deleted file mode 100644 index 7a3a05aa81..0000000000 --- a/thirdparty/embree-aarch64/common/sys/condition.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "mutex.h" - -namespace embree -{ - class ConditionSys - { - public: - ConditionSys(); - ~ConditionSys(); - void wait( class MutexSys& mutex ); - void notify_all(); - - template - __forceinline void wait( class MutexSys& mutex, const Predicate& pred ) - { - while (!pred()) wait(mutex); - } - - private: - ConditionSys (const ConditionSys& other) DELETED; // do not implement - ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement - - protected: - void* cond; - }; -} diff --git a/thirdparty/embree-aarch64/common/sys/filename.cpp b/thirdparty/embree-aarch64/common/sys/filename.cpp deleted file mode 100644 index 86182c1afb..0000000000 --- a/thirdparty/embree-aarch64/common/sys/filename.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "filename.h" -#include "sysinfo.h" - -namespace embree -{ -#ifdef __WIN32__ - const char path_sep = '\\'; -#else - const char path_sep = '/'; -#endif - - /*! create an empty filename */ - FileName::FileName () {} - - /*! create a valid filename from a string */ - FileName::FileName (const char* in) { - filename = in; - for (size_t i=0; i -#endif - -#if defined(__ARM_NEON) -#include "../math/SSE2NEON.h" -#if defined(NEON_AVX2_EMULATION) -#include "../math/AVX2NEON.h" -#endif -#else -#include -#endif - -#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) - #if !defined(_tzcnt_u32) - #define _tzcnt_u32 __tzcnt_u32 - #endif - #if !defined(_tzcnt_u64) - #define _tzcnt_u64 __tzcnt_u64 - #endif -#endif - -#if defined(__aarch64__) -#if !defined(_lzcnt_u32) - #define _lzcnt_u32 __builtin_clz -#endif -#if !defined(_lzcnt_u32) - #define _lzcnt_u32 __builtin_clzll -#endif -#else -#if defined(__LZCNT__) - #if !defined(_lzcnt_u32) - #define _lzcnt_u32 __lzcnt32 - #endif - #if !defined(_lzcnt_u64) - #define _lzcnt_u64 __lzcnt64 - #endif -#endif -#endif - -#if defined(__WIN32__) -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include -#endif - -/* normally defined in pmmintrin.h, but we always need this */ -#if !defined(_MM_SET_DENORMALS_ZERO_MODE) -#define _MM_DENORMALS_ZERO_ON (0x0040) -#define _MM_DENORMALS_ZERO_OFF (0x0000) -#define _MM_DENORMALS_ZERO_MASK (0x0040) -#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) -#endif - -namespace embree -{ - -//////////////////////////////////////////////////////////////////////////////// -/// Windows Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__WIN32__) - - __forceinline size_t read_tsc() - { - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - return (size_t)li.QuadPart; - } - - __forceinline int bsf(int v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _tzcnt_u32(v); -#else - unsigned long r = 0; _BitScanForward(&r,v); return r; -#endif - } - - __forceinline unsigned bsf(unsigned v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _tzcnt_u32(v); -#else - unsigned long r = 0; _BitScanForward(&r,v); return r; -#endif - } - -#if defined(__X86_64__) - __forceinline size_t bsf(size_t v) { -#if defined(__AVX2__) - return _tzcnt_u64(v); -#else - unsigned long r = 0; _BitScanForward64(&r,v); return r; -#endif - } -#endif - - __forceinline int bscf(int& v) - { - int i = bsf(v); - v &= v-1; - return i; - } - - __forceinline unsigned bscf(unsigned& v) - { - unsigned i = bsf(v); - v &= v-1; - return i; - } - -#if defined(__X86_64__) - __forceinline size_t bscf(size_t& v) - { - size_t i = bsf(v); - v &= v-1; - return i; - } -#endif - - __forceinline int bsr(int v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return 31 - _lzcnt_u32(v); -#else - unsigned long r = 0; _BitScanReverse(&r,v); return r; -#endif - } - - __forceinline unsigned bsr(unsigned v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return 31 - _lzcnt_u32(v); -#else - unsigned long r = 0; _BitScanReverse(&r,v); return r; -#endif - } - -#if defined(__X86_64__) - __forceinline size_t bsr(size_t v) { -#if defined(__AVX2__) - return 63 -_lzcnt_u64(v); -#else - unsigned long r = 0; _BitScanReverse64(&r, v); return r; -#endif - } -#endif - - __forceinline int lzcnt(const int x) - { -#if defined(__AVX2__) && !defined(__aarch64__) - return _lzcnt_u32(x); -#else - if (unlikely(x == 0)) return 32; - return 31 - bsr(x); -#endif - } - - __forceinline int btc(int v, int i) { - long r = v; _bittestandcomplement(&r,i); return r; - } - - __forceinline int bts(int v, int i) { - long r = v; _bittestandset(&r,i); return r; - } - - __forceinline int btr(int v, int i) { - long r = v; _bittestandreset(&r,i); return r; - } - -#if defined(__X86_64__) - - __forceinline size_t btc(size_t v, size_t i) { - size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; - } - - __forceinline size_t bts(size_t v, size_t i) { - __int64 r = v; _bittestandset64(&r,i); return r; - } - - __forceinline size_t btr(size_t v, size_t i) { - __int64 r = v; _bittestandreset64(&r,i); return r; - } - -#endif - - __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) { - return _InterlockedCompareExchange((volatile long*)p,v,c); - } - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#else - -#if defined(__i386__) && defined(__PIC__) - - __forceinline void __cpuid(int out[4], int op) - { - asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "0"(op)); - } - - __forceinline void __cpuid_count(int out[4], int op1, int op2) - { - asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) - : "0" (op1), "2" (op2)); - } - -#else - - __forceinline void __cpuid(int out[4], int op) { -#if defined(__ARM_NEON) - if (op == 0) { // Get CPU name - out[0] = 0x41524d20; - out[1] = 0x41524d20; - out[2] = 0x41524d20; - out[3] = 0x41524d20; - } -#else - asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); -#endif - } - -#if !defined(__ARM_NEON) - __forceinline void __cpuid_count(int out[4], int op1, int op2) { - asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); - } -#endif - -#endif - - __forceinline uint64_t read_tsc() { -#if defined(__ARM_NEON) - return 0; // FIXME(LTE): mimic rdtsc -#else - uint32_t high,low; - asm volatile ("rdtsc" : "=d"(high), "=a"(low)); - return (((uint64_t)high) << 32) + (uint64_t)low; -#endif - } - - __forceinline int bsf(int v) { -#if defined(__ARM_NEON) - return __builtin_ctz(v); -#else -#if defined(__AVX2__) - return _tzcnt_u32(v); -#else - int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; -#endif -#endif - } - -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline unsigned bsf(unsigned v) - { -#if defined(__ARM_NEON) - return __builtin_ctz(v); -#else -#if defined(__AVX2__) - return _tzcnt_u32(v); -#else - unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; -#endif -#endif - } -#endif - - __forceinline size_t bsf(size_t v) { -#if defined(__AVX2__) && !defined(__aarch64__) -#if defined(__X86_64__) - return _tzcnt_u64(v); -#else - return _tzcnt_u32(v); -#endif -#elif defined(__ARM_NEON) - return __builtin_ctzl(v); -#else - size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; -#endif - } - - __forceinline int bscf(int& v) - { - int i = bsf(v); - v &= v-1; - return i; - } - -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline unsigned int bscf(unsigned int& v) - { - unsigned int i = bsf(v); - v &= v-1; - return i; - } -#endif - - __forceinline size_t bscf(size_t& v) - { - size_t i = bsf(v); - v &= v-1; - return i; - } - - __forceinline int bsr(int v) { -#if defined(__AVX2__) && !defined(__aarch64__) - return 31 - _lzcnt_u32(v); -#elif defined(__ARM_NEON) - return __builtin_clz(v)^31; -#else - int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; -#endif - } - -#if defined(__X86_64__) || defined(__aarch64__) - __forceinline unsigned bsr(unsigned v) { -#if defined(__AVX2__) - return 31 - _lzcnt_u32(v); -#elif defined(__ARM_NEON) - return __builtin_clz(v)^31; -#else - unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; -#endif - } -#endif - - __forceinline size_t bsr(size_t v) { -#if defined(__AVX2__) && !defined(__aarch64__) -#if defined(__X86_64__) - return 63 - _lzcnt_u64(v); -#else - return 31 - _lzcnt_u32(v); -#endif -#elif defined(__aarch64__) - return (sizeof(v) * 8 - 1) - __builtin_clzl(v); -#else - size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; -#endif - } - - __forceinline int lzcnt(const int x) - { -#if defined(__AVX2__) && !defined(__aarch64__) - return _lzcnt_u32(x); -#else - if (unlikely(x == 0)) return 32; - return 31 - bsr(x); -#endif - } - - __forceinline size_t blsr(size_t v) { -#if defined(__AVX2__) && !defined(__aarch64__) -#if defined(__INTEL_COMPILER) - return _blsr_u64(v); -#else -#if defined(__X86_64__) - return __blsr_u64(v); -#else - return __blsr_u32(v); -#endif -#endif -#else - return v & (v-1); -#endif - } - - __forceinline int btc(int v, int i) { -#if defined(__aarch64__) - // _bittestandcomplement(long *a, long b) { - // unsigned char x = (*a >> b) & 1; - // *a = *a ^ (1 << b); - // return x; - - // We only need `*a` - return (v ^ (1 << i)); -#else - int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; -#endif - } - - __forceinline int bts(int v, int i) { -#if defined(__aarch64__) - // _bittestandset(long *a, long b) { - // unsigned char x = (*a >> b) & 1; - // *a = *a | (1 << b); - // return x; - return (v | (v << i)); -#else - int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; -#endif - } - - __forceinline int btr(int v, int i) { -#if defined(__aarch64__) - // _bittestandreset(long *a, long b) { - // unsigned char x = (*a >> b) & 1; - // *a = *a & ~(1 << b); - // return x; - return (v & ~(v << i)); -#else - int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; -#endif - } - - __forceinline size_t btc(size_t v, size_t i) { -#if defined(__aarch64__) - return (v ^ (1 << i)); -#else - size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; -#endif - } - - __forceinline size_t bts(size_t v, size_t i) { -#if defined(__aarch64__) - return (v | (v << i)); -#else - size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; -#endif - } - - __forceinline size_t btr(size_t v, size_t i) { -#if defined(__ARM_NEON) - return (v & ~(v << i)); -#else - size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; -#endif - } - - __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { - return __sync_val_compare_and_swap(value, comparand, input); - } - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// All Platforms -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__clang__) || defined(__GNUC__) -#if !defined(_mm_undefined_ps) - __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } -#endif -#if !defined(_mm_undefined_si128) - __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); } -#endif -#if !defined(_mm256_undefined_ps) && defined(__AVX__) - __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); } -#endif -#if !defined(_mm256_undefined_si256) && defined(__AVX__) - __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); } -#endif -#if !defined(_mm512_undefined_ps) && defined(__AVX512F__) - __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); } -#endif -#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__) - __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); } -#endif -#endif - -#if defined(__SSE4_2__) || defined(__ARM_NEON) - - __forceinline int popcnt(int in) { - return _mm_popcnt_u32(in); - } - - __forceinline unsigned popcnt(unsigned in) { - return _mm_popcnt_u32(in); - } - -#if defined(__X86_64__) || defined(__ARM_NEON) - __forceinline size_t popcnt(size_t in) { - return _mm_popcnt_u64(in); - } -#endif - -#endif - - __forceinline uint64_t rdtsc() - { - int dummy[4]; - __cpuid(dummy,0); - uint64_t clock = read_tsc(); - __cpuid(dummy,0); - return clock; - } - - __forceinline void pause_cpu(const size_t N = 8) - { - for (size_t i=0; i - -namespace embree -{ - /* opens a shared library */ - lib_t openLibrary(const std::string& file) - { - std::string fullName = file+".dll"; - FileName executable = getExecutableFileName(); - HANDLE handle = LoadLibrary((executable.path() + fullName).c_str()); - return lib_t(handle); - } - - /* returns address of a symbol from the library */ - void* getSymbol(lib_t lib, const std::string& sym) { - return reinterpret_cast(GetProcAddress(HMODULE(lib),sym.c_str())); - } - - /* closes the shared library */ - void closeLibrary(lib_t lib) { - FreeLibrary(HMODULE(lib)); - } -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__UNIX__) - -#include - -namespace embree -{ - /* opens a shared library */ - lib_t openLibrary(const std::string& file) - { -#if defined(__MACOSX__) - std::string fullName = "lib"+file+".dylib"; -#else - std::string fullName = "lib"+file+".so"; -#endif - void* lib = dlopen(fullName.c_str(), RTLD_NOW); - if (lib) return lib_t(lib); - FileName executable = getExecutableFileName(); - lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); - if (lib == nullptr) { - const char* error = dlerror(); - if (error) { - THROW_RUNTIME_ERROR(error); - } else { - THROW_RUNTIME_ERROR("could not load library "+executable.str()); - } - } - return lib_t(lib); - } - - /* returns address of a symbol from the library */ - void* getSymbol(lib_t lib, const std::string& sym) { - return dlsym(lib,sym.c_str()); - } - - /* closes the shared library */ - void closeLibrary(lib_t lib) { - dlclose(lib); - } -} -#endif diff --git a/thirdparty/embree-aarch64/common/sys/library.h b/thirdparty/embree-aarch64/common/sys/library.h deleted file mode 100644 index c2164e9fbe..0000000000 --- a/thirdparty/embree-aarch64/common/sys/library.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" - -namespace embree -{ - /*! type for shared library */ - typedef struct opaque_lib_t* lib_t; - - /*! loads a shared library */ - lib_t openLibrary(const std::string& file); - - /*! returns address of a symbol from the library */ - void* getSymbol(lib_t lib, const std::string& sym); - - /*! unloads a shared library */ - void closeLibrary(lib_t lib); -} diff --git a/thirdparty/embree-aarch64/common/sys/mutex.cpp b/thirdparty/embree-aarch64/common/sys/mutex.cpp deleted file mode 100644 index 11779bc9b9..0000000000 --- a/thirdparty/embree-aarch64/common/sys/mutex.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "mutex.h" -#include "regression.h" - -#if defined(__WIN32__) && !defined(PTHREADS_WIN32) - -#define WIN32_LEAN_AND_MEAN -#include - -namespace embree -{ - MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); } - MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; } - void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); } - bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; } - void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); } -} -#endif - -#if defined(__UNIX__) || defined(PTHREADS_WIN32) -#include -namespace embree -{ - /*! system mutex using pthreads */ - MutexSys::MutexSys() - { - mutex = new pthread_mutex_t; - if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0) - THROW_RUNTIME_ERROR("pthread_mutex_init failed"); - } - - MutexSys::~MutexSys() - { - MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; - assert(ok); - delete (pthread_mutex_t*)mutex; - mutex = nullptr; - } - - void MutexSys::lock() - { - if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) - THROW_RUNTIME_ERROR("pthread_mutex_lock failed"); - } - - bool MutexSys::try_lock() { - return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0; - } - - void MutexSys::unlock() - { - if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0) - THROW_RUNTIME_ERROR("pthread_mutex_unlock failed"); - } -}; -#endif diff --git a/thirdparty/embree-aarch64/common/sys/mutex.h b/thirdparty/embree-aarch64/common/sys/mutex.h deleted file mode 100644 index 1164210f23..0000000000 --- a/thirdparty/embree-aarch64/common/sys/mutex.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include "intrinsics.h" -#include "atomic.h" - -namespace embree -{ - /*! system mutex */ - class MutexSys { - friend struct ConditionImplementation; - public: - MutexSys(); - ~MutexSys(); - - private: - MutexSys (const MutexSys& other) DELETED; // do not implement - MutexSys& operator= (const MutexSys& other) DELETED; // do not implement - - public: - void lock(); - bool try_lock(); - void unlock(); - - protected: - void* mutex; - }; - - /*! spinning mutex */ - class SpinLock - { - public: - - SpinLock () - : flag(false) {} - - __forceinline bool isLocked() { - return flag.load(); - } - - __forceinline void lock() - { - while (true) - { - while (flag.load()) - { - _mm_pause(); - _mm_pause(); - } - - bool expected = false; - if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire)) - break; - } - } - - __forceinline bool try_lock() - { - bool expected = false; - if (flag.load() != expected) { - return false; - } - return flag.compare_exchange_strong(expected,true,std::memory_order_acquire); - } - - __forceinline void unlock() { - flag.store(false,std::memory_order_release); - } - - __forceinline void wait_until_unlocked() - { - while(flag.load()) - { - _mm_pause(); - _mm_pause(); - } - } - - public: - atomic flag; - }; - - /*! safe mutex lock and unlock helper */ - template class Lock { - public: - Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); } - Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {} - ~Lock() { if (locked) mutex.unlock(); } - __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); } - __forceinline bool isLocked() const { return locked; } - protected: - Mutex& mutex; - bool locked; - }; -} diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h deleted file mode 100644 index 737f14aa6e..0000000000 --- a/thirdparty/embree-aarch64/common/sys/platform.h +++ /dev/null @@ -1,387 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#define _CRT_SECURE_NO_WARNINGS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// -/// detect platform -//////////////////////////////////////////////////////////////////////////////// - -/* detect 32 or 64 platform */ -#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) -#define __X86_64__ -#endif - -/* detect Linux platform */ -#if defined(linux) || defined(__linux__) || defined(__LINUX__) -# if !defined(__LINUX__) -# define __LINUX__ -# endif -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -/* detect FreeBSD platform */ -#if defined(__FreeBSD__) || defined(__FREEBSD__) -# if !defined(__FREEBSD__) -# define __FREEBSD__ -# endif -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */ -#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__) -# if !defined(__WIN32__) -# define __WIN32__ -# endif -#endif - -/* detect Cygwin platform */ -#if defined(__CYGWIN__) -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -/* detect MAC OS X platform */ -#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__) -# if !defined(__MACOSX__) -# define __MACOSX__ -# endif -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -/* try to detect other Unix systems */ -#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix) -# if !defined(__UNIX__) -# define __UNIX__ -# endif -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Macros -//////////////////////////////////////////////////////////////////////////////// - -#ifdef __WIN32__ -#define dll_export __declspec(dllexport) -#define dll_import __declspec(dllimport) -#else -#define dll_export __attribute__ ((visibility ("default"))) -#define dll_import -#endif - -#ifdef __WIN32__ -#if !defined(__noinline) -#define __noinline __declspec(noinline) -#endif -//#define __forceinline __forceinline -//#define __restrict __restrict -#if defined(__INTEL_COMPILER) -#define __restrict__ __restrict -#else -#define __restrict__ //__restrict // causes issues with MSVC -#endif -#if !defined(__thread) -// NOTE: Require `-fms-extensions` for clang -#define __thread __declspec(thread) -#endif -#if !defined(__aligned) -#if defined(__MINGW32__) -#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) -#else -#define __aligned(...) __declspec(align(__VA_ARGS__)) -#endif -#endif -//#define __FUNCTION__ __FUNCTION__ -#define debugbreak() __debugbreak() - -#else -#if !defined(__noinline) -#define __noinline __attribute__((noinline)) -#endif -#if !defined(__forceinline) -#define __forceinline inline __attribute__((always_inline)) -#endif -//#define __restrict __restrict -//#define __thread __thread -#if !defined(__aligned) -#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) -#endif -#if !defined(__FUNCTION__) -#define __FUNCTION__ __PRETTY_FUNCTION__ -#endif -#define debugbreak() asm ("int $3") -#endif - -#if defined(__clang__) || defined(__GNUC__) - #define MAYBE_UNUSED __attribute__((unused)) -#else - #define MAYBE_UNUSED -#endif - -#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly - #define DELETED -#else - #define DELETED = delete -#endif - -// -- GODOT start -- -#ifndef likely -// -- GODOT end -- -#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) -#define likely(expr) (expr) -#define unlikely(expr) (expr) -#else -#define likely(expr) __builtin_expect((bool)(expr),true ) -#define unlikely(expr) __builtin_expect((bool)(expr),false) -#endif -// -- GODOT start -- -#endif -// -- GODOT end -- - -//////////////////////////////////////////////////////////////////////////////// -/// Error handling and debugging -//////////////////////////////////////////////////////////////////////////////// - -/* debug printing macros */ -#define STRING(x) #x -#define TOSTRING(x) STRING(x) -#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl -#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl -#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl -#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl -#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl - -#if defined(DEBUG) // only report file and line in debug mode - // -- GODOT start -- - // #define THROW_RUNTIME_ERROR(str) - // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); - #define THROW_RUNTIME_ERROR(str) \ - printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); - // -- GODOT end -- -#else - // -- GODOT start -- - // #define THROW_RUNTIME_ERROR(str) - // throw std::runtime_error(str); - #define THROW_RUNTIME_ERROR(str) \ - abort(); - // -- GODOT end -- -#endif - -#define FATAL(x) THROW_RUNTIME_ERROR(x) -#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; } - -#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") - -//////////////////////////////////////////////////////////////////////////////// -/// Basic types -//////////////////////////////////////////////////////////////////////////////// - -/* default floating-point type */ -namespace embree { - typedef float real; -} - -/* windows does not have ssize_t */ -#if defined(__WIN32__) -#if defined(__X86_64__) || defined(__aarch64__) -typedef int64_t ssize_t; -#else -typedef int32_t ssize_t; -#endif -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Basic utility functions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline std::string toString(long long value) { - return std::to_string(value); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Disable some compiler warnings -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__INTEL_COMPILER) -//#pragma warning(disable:265 ) // floating-point operation result is out of range -//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used -//#pragma warning(disable:869 ) // parameter was never referenced -//#pragma warning(disable:981 ) // operands are evaluated in unspecified order -//#pragma warning(disable:1418) // external function definition with no prior declaration -//#pragma warning(disable:1419) // external declaration in primary source file -//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable -//#pragma warning(disable:94 ) // the size of an array must be greater than zero -//#pragma warning(disable:1599) // declaration hides parameter -//#pragma warning(disable:424 ) // extra ";" ignored -#pragma warning(disable:2196) // routine is both "inline" and "noinline" -//#pragma warning(disable:177 ) // label was declared but never referenced -//#pragma warning(disable:114 ) // function was referenced but not defined -//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function -#pragma warning(disable:15335) // was not vectorized: vectorization possible but seems inefficient -#endif - -#if defined(_MSC_VER) -//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union -#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning) -//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data -#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data -//#pragma warning(disable:4355) // 'this' : used in base member initializer list -//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch -//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch -//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float' -//#pragma warning(disable:4068) // unknown pragma -//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned -//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion) -//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored -#pragma warning(disable:4503) // decorated name length exceeded, name was truncated -#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored -#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used - -# if _MSC_VER < 1910 // prior to Visual studio 2017 (V141) -# pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings -# pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0 -# endif - -#endif - -#if defined(__clang__) && !defined(__INTEL_COMPILER) -//#pragma clang diagnostic ignored "-Wunknown-pragmas" -//#pragma clang diagnostic ignored "-Wunused-variable" -//#pragma clang diagnostic ignored "-Wreorder" -//#pragma clang diagnostic ignored "-Wmicrosoft" -//#pragma clang diagnostic ignored "-Wunused-private-field" -//#pragma clang diagnostic ignored "-Wunused-local-typedef" -//#pragma clang diagnostic ignored "-Wunused-function" -//#pragma clang diagnostic ignored "-Wnarrowing" -//#pragma clang diagnostic ignored "-Wc++11-narrowing" -//#pragma clang diagnostic ignored "-Wdeprecated-register" -//#pragma clang diagnostic ignored "-Wdeprecated-declarations" -#endif - -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) -#pragma GCC diagnostic ignored "-Wpragmas" -//#pragma GCC diagnostic ignored "-Wnarrowing" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -//#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -//#pragma GCC diagnostic ignored "-Warray-bounds" -#pragma GCC diagnostic ignored "-Wattributes" -#pragma GCC diagnostic ignored "-Wmisleading-indentation" -#pragma GCC diagnostic ignored "-Wsign-compare" -#pragma GCC diagnostic ignored "-Wparentheses" -#endif - -#if defined(__clang__) && defined(__WIN32__) -#pragma clang diagnostic ignored "-Wunused-parameter" -#pragma clang diagnostic ignored "-Wmicrosoft-cast" -#pragma clang diagnostic ignored "-Wmicrosoft-enum-value" -#pragma clang diagnostic ignored "-Wmicrosoft-include" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunknown-pragmas" -#endif - -/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */ -#if defined(__WIN32__) && defined(__INTEL_COMPILER) -#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated -#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable: 1478)) // warning: function was declared deprecated -#elif defined(__INTEL_COMPILER) -#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated -#define ENABLE_DEPRECATED_WARNING _Pragma("warning (enable : 1478)") // warning: function was declared deprecated -#elif defined(__clang__) -#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated -#define ENABLE_DEPRECATED_WARNING _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated -#elif defined(__GNUC__) -#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated -#define ENABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated -#elif defined(_MSC_VER) -#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated -#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated -#endif - -/* embree output stream */ -#define embree_ostream std::ostream& -#define embree_cout std::cout -#define embree_cout_uniform std::cout -#define embree_endl std::endl - -//////////////////////////////////////////////////////////////////////////////// -/// Some macros for static profiling -//////////////////////////////////////////////////////////////////////////////// - -#if defined (__GNUC__) -#define IACA_SSC_MARK( MARK_ID ) \ -__asm__ __volatile__ ( \ - "\n\t movl $"#MARK_ID", %%ebx" \ - "\n\t .byte 0x64, 0x67, 0x90" \ - : : : "memory" ); - -#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B"); - -#else -#define IACA_UD_BYTES {__asm _emit 0x0F \ - __asm _emit 0x0B} - -#define IACA_SSC_MARK(x) {__asm mov ebx, x\ - __asm _emit 0x64 \ - __asm _emit 0x67 \ - __asm _emit 0x90 } - -#define IACA_VC64_START __writegsbyte(111, 111); -#define IACA_VC64_END __writegsbyte(222, 222); - -#endif - -#define IACA_START {IACA_UD_BYTES \ - IACA_SSC_MARK(111)} -#define IACA_END {IACA_SSC_MARK(222) \ - IACA_UD_BYTES} - -namespace embree -{ - template - struct OnScopeExitHelper - { - OnScopeExitHelper (const Closure f) : active(true), f(f) {} - ~OnScopeExitHelper() { if (active) f(); } - void deactivate() { active = false; } - bool active; - const Closure f; - }; - - template - OnScopeExitHelper OnScopeExit(const Closure f) { - return OnScopeExitHelper(f); - } - -#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) -#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 -#define ON_SCOPE_EXIT(code) \ - auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;}) - - template - std::unique_ptr make_unique(Ty* ptr) { - return std::unique_ptr(ptr); - } - -} diff --git a/thirdparty/embree-aarch64/common/sys/ref.h b/thirdparty/embree-aarch64/common/sys/ref.h deleted file mode 100644 index 24648e6234..0000000000 --- a/thirdparty/embree-aarch64/common/sys/ref.h +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "atomic.h" - -namespace embree -{ - struct NullTy { - }; - - extern MAYBE_UNUSED NullTy null; - - class RefCount - { - public: - RefCount(int val = 0) : refCounter(val) {} - virtual ~RefCount() {}; - - virtual RefCount* refInc() { refCounter.fetch_add(1); return this; } - virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; } - private: - std::atomic refCounter; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Reference to single object - //////////////////////////////////////////////////////////////////////////////// - - template - class Ref - { - public: - Type* ptr; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Ref() : ptr(nullptr) {} - __forceinline Ref(NullTy) : ptr(nullptr) {} - __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } - __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; } - - __forceinline Ref(Type* const input) : ptr(input) - { - if (ptr) - ptr->refInc(); - } - - __forceinline ~Ref() - { - if (ptr) - ptr->refDec(); - } - - __forceinline Ref& operator =(const Ref& input) - { - if (input.ptr) - input.ptr->refInc(); - if (ptr) - ptr->refDec(); - ptr = input.ptr; - return *this; - } - - __forceinline Ref& operator =(Ref&& input) - { - if (ptr) - ptr->refDec(); - ptr = input.ptr; - input.ptr = nullptr; - return *this; - } - - __forceinline Ref& operator =(Type* const input) - { - if (input) - input->refInc(); - if (ptr) - ptr->refDec(); - ptr = input; - return *this; - } - - __forceinline Ref& operator =(NullTy) - { - if (ptr) - ptr->refDec(); - ptr = nullptr; - return *this; - } - - __forceinline operator bool() const { return ptr != nullptr; } - - __forceinline const Type& operator *() const { return *ptr; } - __forceinline Type& operator *() { return *ptr; } - __forceinline const Type* operator ->() const { return ptr; } - __forceinline Type* operator ->() { return ptr; } - - template - __forceinline Ref cast() { return Ref(static_cast(ptr)); } - template - __forceinline const Ref cast() const { return Ref(static_cast(ptr)); } - - template - __forceinline Ref dynamicCast() { return Ref(dynamic_cast(ptr)); } - template - __forceinline const Ref dynamicCast() const { return Ref(dynamic_cast(ptr)); } - }; - - template __forceinline bool operator < (const Ref& a, const Ref& b) { return a.ptr < b.ptr; } - - template __forceinline bool operator ==(const Ref& a, NullTy ) { return a.ptr == nullptr; } - template __forceinline bool operator ==(NullTy , const Ref& b) { return nullptr == b.ptr; } - template __forceinline bool operator ==(const Ref& a, const Ref& b) { return a.ptr == b.ptr; } - - template __forceinline bool operator !=(const Ref& a, NullTy ) { return a.ptr != nullptr; } - template __forceinline bool operator !=(NullTy , const Ref& b) { return nullptr != b.ptr; } - template __forceinline bool operator !=(const Ref& a, const Ref& b) { return a.ptr != b.ptr; } -} diff --git a/thirdparty/embree-aarch64/common/sys/regression.cpp b/thirdparty/embree-aarch64/common/sys/regression.cpp deleted file mode 100644 index d95ff8dfe0..0000000000 --- a/thirdparty/embree-aarch64/common/sys/regression.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "regression.h" - -namespace embree -{ - /* registerRegressionTest is invoked from static initializers, thus - * we cannot have the regression_tests variable as global static - * variable due to issues with static variable initialization - * order. */ - std::vector& get_regression_tests() - { - static std::vector regression_tests; - return regression_tests; - } - - void registerRegressionTest(RegressionTest* test) - { - get_regression_tests().push_back(test); - } - - RegressionTest* getRegressionTest(size_t index) - { - if (index >= get_regression_tests().size()) - return nullptr; - - return get_regression_tests()[index]; - } -} diff --git a/thirdparty/embree-aarch64/common/sys/regression.h b/thirdparty/embree-aarch64/common/sys/regression.h deleted file mode 100644 index 632f8d92cf..0000000000 --- a/thirdparty/embree-aarch64/common/sys/regression.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" - -#include - -namespace embree -{ - /*! virtual interface for all regression tests */ - struct RegressionTest - { - RegressionTest (std::string name) : name(name) {} - virtual bool run() = 0; - std::string name; - }; - - /*! registers a regression test */ - void registerRegressionTest(RegressionTest* test); - - /*! run all regression tests */ - RegressionTest* getRegressionTest(size_t index); -} diff --git a/thirdparty/embree-aarch64/common/sys/string.cpp b/thirdparty/embree-aarch64/common/sys/string.cpp deleted file mode 100644 index 931244383e..0000000000 --- a/thirdparty/embree-aarch64/common/sys/string.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "string.h" - -#include -#include - -namespace embree -{ - char to_lower(char c) { return char(tolower(int(c))); } - char to_upper(char c) { return char(toupper(int(c))); } - std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } - std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } - - Vec2f string_to_Vec2f ( std::string str ) - { - size_t next = 0; - const float x = std::stof(str,&next); str = str.substr(next+1); - const float y = std::stof(str,&next); - return Vec2f(x,y); - } - - Vec3f string_to_Vec3f ( std::string str ) - { - size_t next = 0; - const float x = std::stof(str,&next); str = str.substr(next+1); - const float y = std::stof(str,&next); str = str.substr(next+1); - const float z = std::stof(str,&next); - return Vec3f(x,y,z); - } - - Vec4f string_to_Vec4f ( std::string str ) - { - size_t next = 0; - const float x = std::stof(str,&next); str = str.substr(next+1); - const float y = std::stof(str,&next); str = str.substr(next+1); - const float z = std::stof(str,&next); str = str.substr(next+1); - const float w = std::stof(str,&next); - return Vec4f(x,y,z,w); - } -} diff --git a/thirdparty/embree-aarch64/common/sys/string.h b/thirdparty/embree-aarch64/common/sys/string.h deleted file mode 100644 index 2e9b0f88c3..0000000000 --- a/thirdparty/embree-aarch64/common/sys/string.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include "../math/vec2.h" -#include "../math/vec3.h" -#include "../math/vec4.h" - -namespace embree -{ - class IOStreamStateRestorer - { - public: - IOStreamStateRestorer(std::ostream& iostream) - : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) { - } - - ~IOStreamStateRestorer() { - iostream.flags(flags); - iostream.precision(precision); - } - - private: - std::ostream& iostream; - std::ios::fmtflags flags; - std::streamsize precision; - }; - - std::string toLowerCase(const std::string& s); - std::string toUpperCase(const std::string& s); - - Vec2f string_to_Vec2f ( std::string str ); - Vec3f string_to_Vec3f ( std::string str ); - Vec4f string_to_Vec4f ( std::string str ); -} diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.cpp b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp deleted file mode 100644 index 1d11436770..0000000000 --- a/thirdparty/embree-aarch64/common/sys/sysinfo.cpp +++ /dev/null @@ -1,676 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "sysinfo.h" -#include "intrinsics.h" -#include "string.h" -#include "ref.h" -#if defined(__FREEBSD__) -#include -#include -typedef cpuset_t cpu_set_t; -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// All Platforms -//////////////////////////////////////////////////////////////////////////////// - -namespace embree -{ - NullTy null; - - std::string getPlatformName() - { -#if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON) - return "Android Linux (aarch64 / arm64)"; -#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__) - return "Android Linux (x64)"; -#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86)) - return "Android Linux (x86)"; -#elif defined(__LINUX__) && !defined(__X86_64__) - return "Linux (32bit)"; -#elif defined(__LINUX__) && defined(__X86_64__) - return "Linux (64bit)"; -#elif defined(__FREEBSD__) && !defined(__X86_64__) - return "FreeBSD (32bit)"; -#elif defined(__FREEBSD__) && defined(__X86_64__) - return "FreeBSD (64bit)"; -#elif defined(__CYGWIN__) && !defined(__X86_64__) - return "Cygwin (32bit)"; -#elif defined(__CYGWIN__) && defined(__X86_64__) - return "Cygwin (64bit)"; -#elif defined(__WIN32__) && !defined(__X86_64__) - return "Windows (32bit)"; -#elif defined(__WIN32__) && defined(__X86_64__) - return "Windows (64bit)"; -#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__) - return "iOS Simulator (x64)"; -#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON) - return "iOS (aarch64 / arm64)"; -#elif defined(__MACOSX__) && !defined(__X86_64__) - return "Mac OS X (32bit)"; -#elif defined(__MACOSX__) && defined(__X86_64__) - return "Mac OS X (64bit)"; -#elif defined(__UNIX__) && defined(__aarch64__) - return "Unix (aarch64)"; -#elif defined(__UNIX__) && !defined(__X86_64__) - return "Unix (32bit)"; -#elif defined(__UNIX__) && defined(__X86_64__) - return "Unix (64bit)"; -#else - return "Unknown"; -#endif - } - - std::string getCompilerName() - { -#if defined(__INTEL_COMPILER) - int icc_mayor = __INTEL_COMPILER / 100 % 100; - int icc_minor = __INTEL_COMPILER % 100; - std::string version = "Intel Compiler "; - version += toString(icc_mayor); - version += "." + toString(icc_minor); -#if defined(__INTEL_COMPILER_UPDATE) - version += "." + toString(__INTEL_COMPILER_UPDATE); -#endif - return version; -#elif defined(__clang__) - return "CLANG " __clang_version__; -#elif defined (__GNUC__) - return "GCC " __VERSION__; -#elif defined(_MSC_VER) - std::string version = toString(_MSC_FULL_VER); - version.insert(4,"."); - version.insert(9,"."); - version.insert(2,"."); - return "Visual C++ Compiler " + version; -#else - return "Unknown Compiler"; -#endif - } - - std::string getCPUVendor() - { - int cpuinfo[4]; - __cpuid (cpuinfo, 0); - int name[4]; - name[0] = cpuinfo[1]; - name[1] = cpuinfo[3]; - name[2] = cpuinfo[2]; - name[3] = 0; - return (char*)name; - } - - CPU getCPUModel() - { - if (getCPUVendor() != "GenuineIntel") - return CPU::UNKNOWN; - - int out[4]; - __cpuid(out, 0); - if (out[0] < 1) return CPU::UNKNOWN; - __cpuid(out, 1); - - /* please see CPUID documentation for these formulas */ - uint32_t family_ID = (out[0] >> 8) & 0x0F; - uint32_t extended_family_ID = (out[0] >> 20) & 0xFF; - - uint32_t model_ID = (out[0] >> 4) & 0x0F; - uint32_t extended_model_ID = (out[0] >> 16) & 0x0F; - - uint32_t DisplayFamily = family_ID; - if (family_ID == 0x0F) - DisplayFamily += extended_family_ID; - - uint32_t DisplayModel = model_ID; - if (family_ID == 0x06 || family_ID == 0x0F) - DisplayModel += extended_model_ID << 4; - - uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0); - - // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel) - if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE; - if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE; - if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE; - if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE; - if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE; - if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE; - if (DisplayFamily_DisplayModel == 0x068E) return CPU::CORE_KABY_LAKE; - if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE; - if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE; - if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE; - if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE; - if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE; - if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE; - if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL; - if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL; - if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL; - if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL; - if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL; - if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL; - if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL; - if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL; - if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE; - if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM; - if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2; - if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2; - if (DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1; - - if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL; - if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING; - - return CPU::UNKNOWN; - } - - std::string stringOfCPUModel(CPU model) - { - switch (model) { - case CPU::XEON_ICE_LAKE : return "Xeon Ice Lake"; - case CPU::CORE_ICE_LAKE : return "Core Ice Lake"; - case CPU::CORE_TIGER_LAKE : return "Core Tiger Lake"; - case CPU::CORE_COMET_LAKE : return "Core Comet Lake"; - case CPU::CORE_CANNON_LAKE : return "Core Cannon Lake"; - case CPU::CORE_KABY_LAKE : return "Core Kaby Lake"; - case CPU::XEON_SKY_LAKE : return "Xeon Sky Lake"; - case CPU::CORE_SKY_LAKE : return "Core Sky Lake"; - case CPU::XEON_PHI_KNIGHTS_MILL : return "Xeon Phi Knights Mill"; - case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing"; - case CPU::XEON_BROADWELL : return "Xeon Broadwell"; - case CPU::CORE_BROADWELL : return "Core Broadwell"; - case CPU::XEON_HASWELL : return "Xeon Haswell"; - case CPU::CORE_HASWELL : return "Core Haswell"; - case CPU::XEON_IVY_BRIDGE : return "Xeon Ivy Bridge"; - case CPU::CORE_IVY_BRIDGE : return "Core Ivy Bridge"; - case CPU::SANDY_BRIDGE : return "Sandy Bridge"; - case CPU::NEHALEM : return "Nehalem"; - case CPU::CORE2 : return "Core2"; - case CPU::CORE1 : return "Core"; - case CPU::ARM : return "Arm"; - case CPU::UNKNOWN : return "Unknown CPU"; - } - return "Unknown CPU (error)"; - } - -#if !defined(__ARM_NEON) - /* constants to access destination registers of CPUID instruction */ - static const int EAX = 0; - static const int EBX = 1; - static const int ECX = 2; - static const int EDX = 3; - - /* cpuid[eax=1].ecx */ - static const int CPU_FEATURE_BIT_SSE3 = 1 << 0; - static const int CPU_FEATURE_BIT_SSSE3 = 1 << 9; - static const int CPU_FEATURE_BIT_FMA3 = 1 << 12; - static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19; - static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20; - //static const int CPU_FEATURE_BIT_MOVBE = 1 << 22; - static const int CPU_FEATURE_BIT_POPCNT = 1 << 23; - //static const int CPU_FEATURE_BIT_XSAVE = 1 << 26; - static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27; - static const int CPU_FEATURE_BIT_AVX = 1 << 28; - static const int CPU_FEATURE_BIT_F16C = 1 << 29; - static const int CPU_FEATURE_BIT_RDRAND = 1 << 30; - - /* cpuid[eax=1].edx */ - static const int CPU_FEATURE_BIT_SSE = 1 << 25; - static const int CPU_FEATURE_BIT_SSE2 = 1 << 26; - - /* cpuid[eax=0x80000001].ecx */ - static const int CPU_FEATURE_BIT_LZCNT = 1 << 5; - - /* cpuid[eax=7,ecx=0].ebx */ - static const int CPU_FEATURE_BIT_BMI1 = 1 << 3; - static const int CPU_FEATURE_BIT_AVX2 = 1 << 5; - static const int CPU_FEATURE_BIT_BMI2 = 1 << 8; - static const int CPU_FEATURE_BIT_AVX512F = 1 << 16; // AVX512F (foundation) - static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17; // AVX512DQ (doubleword and quadword instructions) - static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26; // AVX512PF (prefetch gather/scatter instructions) - static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27; // AVX512ER (exponential and reciprocal instructions) - static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28; // AVX512CD (conflict detection instructions) - static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) - static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) - static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) - - /* cpuid[eax=7,ecx=0].ecx */ - static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) -#endif - -#if !defined(__ARM_NEON) - __noinline int64_t get_xcr0() - { - // https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466 -#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK) - int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 - xcr0 = _xgetbv(0); - return xcr0; -#else - int xcr0 = 0; - __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); - return xcr0; -#endif - } -#endif - - int getCPUFeatures() - { -#if defined(__ARM_NEON) - int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2; -#if defined(NEON_AVX2_EMULATION) - cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42; - cpu_features |= CPU_FEATURE_XMM_ENABLED; - cpu_features |= CPU_FEATURE_YMM_ENABLED; - cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C; - cpu_features |= CPU_FEATURE_POPCNT; - cpu_features |= CPU_FEATURE_AVX; - cpu_features |= CPU_FEATURE_AVX2; - cpu_features |= CPU_FEATURE_FMA3; - cpu_features |= CPU_FEATURE_LZCNT; - cpu_features |= CPU_FEATURE_BMI1; - cpu_features |= CPU_FEATURE_BMI2; - cpu_features |= CPU_FEATURE_NEON_2X; - - - -#endif - return cpu_features; - -#else - /* cache CPU features access */ - static int cpu_features = 0; - if (cpu_features) - return cpu_features; - - /* get number of CPUID leaves */ - int cpuid_leaf0[4]; - __cpuid(cpuid_leaf0, 0x00000000); - unsigned nIds = cpuid_leaf0[EAX]; - - /* get number of extended CPUID leaves */ - int cpuid_leafe[4]; - __cpuid(cpuid_leafe, 0x80000000); - unsigned nExIds = cpuid_leafe[EAX]; - - /* get CPUID leaves for EAX = 1,7, and 0x80000001 */ - int cpuid_leaf_1[4] = { 0,0,0,0 }; - int cpuid_leaf_7[4] = { 0,0,0,0 }; - int cpuid_leaf_e1[4] = { 0,0,0,0 }; - if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001); -#if _WIN32 -#if _MSC_VER && (_MSC_FULL_VER < 160040219) -#else - if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0); -#endif -#else - if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0); -#endif - if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001); - - /* detect if OS saves XMM, YMM, and ZMM states */ - bool xmm_enabled = true; - bool ymm_enabled = false; - bool zmm_enabled = false; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) { - int64_t xcr0 = get_xcr0(); - xmm_enabled = ((xcr0 & 0x02) == 0x02); /* checks if xmm are enabled in XCR0 */ - ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */ - zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */ - } - if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; - if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; - if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; - - if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; - if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; - - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3 ) cpu_features |= CPU_FEATURE_FMA3; - if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT; - if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1; - if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2; - - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL ) cpu_features |= CPU_FEATURE_AVX512VL; - if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; - - return cpu_features; -#endif - } - - std::string stringOfCPUFeatures(int features) - { - std::string str; - if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM "; - if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM "; - if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM "; - if (features & CPU_FEATURE_SSE ) str += "SSE "; - if (features & CPU_FEATURE_SSE2 ) str += "SSE2 "; - if (features & CPU_FEATURE_SSE3 ) str += "SSE3 "; - if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 "; - if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 "; - if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 "; - if (features & CPU_FEATURE_POPCNT) str += "POPCNT "; - if (features & CPU_FEATURE_AVX ) str += "AVX "; - if (features & CPU_FEATURE_F16C ) str += "F16C "; - if (features & CPU_FEATURE_RDRAND) str += "RDRAND "; - if (features & CPU_FEATURE_AVX2 ) str += "AVX2 "; - if (features & CPU_FEATURE_FMA3 ) str += "FMA3 "; - if (features & CPU_FEATURE_LZCNT ) str += "LZCNT "; - if (features & CPU_FEATURE_BMI1 ) str += "BMI1 "; - if (features & CPU_FEATURE_BMI2 ) str += "BMI2 "; - if (features & CPU_FEATURE_AVX512F) str += "AVX512F "; - if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ "; - if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF "; - if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER "; - if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD "; - if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW "; - if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; - if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; - if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; - if (features & CPU_FEATURE_NEON) str += "NEON "; - if (features & CPU_FEATURE_NEON_2X) str += "2xNEON "; - return str; - } - - std::string stringOfISA (int isa) - { - if (isa == SSE) return "SSE"; - if (isa == SSE2) return "SSE2"; - if (isa == SSE3) return "SSE3"; - if (isa == SSSE3) return "SSSE3"; - if (isa == SSE41) return "SSE4.1"; - if (isa == SSE42) return "SSE4.2"; - if (isa == AVX) return "AVX"; - if (isa == AVX2) return "AVX2"; - if (isa == AVX512KNL) return "AVX512KNL"; - if (isa == AVX512SKX) return "AVX512SKX"; - if (isa == NEON) return "NEON"; - if (isa == NEON_2X) return "2xNEON"; - return "UNKNOWN"; - } - - bool hasISA(int features, int isa) { - return (features & isa) == isa; - } - - std::string supportedTargetList (int features) - { - std::string v; - if (hasISA(features,SSE)) v += "SSE "; - if (hasISA(features,SSE2)) v += "SSE2 "; - if (hasISA(features,SSE3)) v += "SSE3 "; - if (hasISA(features,SSSE3)) v += "SSSE3 "; - if (hasISA(features,SSE41)) v += "SSE4.1 "; - if (hasISA(features,SSE42)) v += "SSE4.2 "; - if (hasISA(features,AVX)) v += "AVX "; - if (hasISA(features,AVXI)) v += "AVXI "; - if (hasISA(features,AVX2)) v += "AVX2 "; - if (hasISA(features,AVX512KNL)) v += "AVX512KNL "; - if (hasISA(features,AVX512SKX)) v += "AVX512SKX "; - if (hasISA(features,NEON)) v += "NEON "; - if (hasISA(features,NEON_2X)) v += "2xNEON "; - return v; - } -} - -//////////////////////////////////////////////////////////////////////////////// -/// Windows Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__WIN32__) - -#define WIN32_LEAN_AND_MEAN -#include -#include - -namespace embree -{ - std::string getExecutableFileName() { - char filename[1024]; - if (!GetModuleFileName(nullptr, filename, sizeof(filename))) - return std::string(); - return std::string(filename); - } - - unsigned int getNumberOfLogicalThreads() - { - static int nThreads = -1; - if (nThreads != -1) return nThreads; - - typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); - typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); - HMODULE hlib = LoadLibrary("Kernel32"); - GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); - GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); - - if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) - { - int groups = pGetActiveProcessorGroupCount(); - int totalProcessors = 0; - for (int i = 0; i < groups; i++) - totalProcessors += pGetActiveProcessorCount(i); - nThreads = totalProcessors; - } - else - { - SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - nThreads = sysinfo.dwNumberOfProcessors; - } - assert(nThreads); - return nThreads; - } - - int getTerminalWidth() - { - HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); - if (handle == INVALID_HANDLE_VALUE) return 80; - CONSOLE_SCREEN_BUFFER_INFO info; - memset(&info,0,sizeof(info)); - GetConsoleScreenBufferInfo(handle, &info); - return info.dwSize.X; - } - - double getSeconds() - { - LARGE_INTEGER freq, val; - QueryPerformanceFrequency(&freq); - QueryPerformanceCounter(&val); - return (double)val.QuadPart / (double)freq.QuadPart; - } - - void sleepSeconds(double t) { - Sleep(DWORD(1000.0*t)); - } - - size_t getVirtualMemoryBytes() - { - PROCESS_MEMORY_COUNTERS info; - GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); - return (size_t)info.QuotaPeakPagedPoolUsage; - } - - size_t getResidentMemoryBytes() - { - PROCESS_MEMORY_COUNTERS info; - GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); - return (size_t)info.WorkingSetSize; - } -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Linux Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__LINUX__) - -#include -#include - -namespace embree -{ - std::string getExecutableFileName() - { - std::string pid = "/proc/" + toString(getpid()) + "/exe"; - char buf[4096]; - memset(buf,0,sizeof(buf)); - if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1) - return std::string(); - return std::string(buf); - } - - size_t getVirtualMemoryBytes() - { - size_t virt, resident, shared; - std::ifstream buffer("/proc/self/statm"); - buffer >> virt >> resident >> shared; - return virt*sysconf(_SC_PAGE_SIZE); - } - - size_t getResidentMemoryBytes() - { - size_t virt, resident, shared; - std::ifstream buffer("/proc/self/statm"); - buffer >> virt >> resident >> shared; - return resident*sysconf(_SC_PAGE_SIZE); - } -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// FreeBSD Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined (__FreeBSD__) - -#include - -namespace embree -{ - std::string getExecutableFileName() - { - const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 }; - char buf[4096]; - memset(buf,0,sizeof(buf)); - size_t len = sizeof(buf)-1; - if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1) - return std::string(); - return std::string(buf); - } - - size_t getVirtualMemoryBytes() { - return 0; - } - - size_t getResidentMemoryBytes() { - return 0; - } -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Mac OS X Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__MACOSX__) - -#include - -namespace embree -{ - std::string getExecutableFileName() - { - char buf[4096]; - uint32_t size = sizeof(buf); - if (_NSGetExecutablePath(buf, &size) != 0) - return std::string(); - return std::string(buf); - } - - size_t getVirtualMemoryBytes() { - return 0; - } - - size_t getResidentMemoryBytes() { - return 0; - } -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__UNIX__) - -#include -#include -#include -#include - -namespace embree -{ - unsigned int getNumberOfLogicalThreads() - { - static int nThreads = -1; - if (nThreads != -1) return nThreads; - -#if defined(__MACOSX__) || defined(__ANDROID__) - nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container - assert(nThreads); -#else - cpu_set_t set; - if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) - nThreads = CPU_COUNT(&set); -#endif - - assert(nThreads); - return nThreads; - } - - int getTerminalWidth() - { - struct winsize info; - if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; - return info.ws_col; - } - - double getSeconds() { - struct timeval tp; gettimeofday(&tp,nullptr); - return double(tp.tv_sec) + double(tp.tv_usec)/1E6; - } - - void sleepSeconds(double t) { - usleep(1000000.0*t); - } -} -#endif - diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.h b/thirdparty/embree-aarch64/common/sys/sysinfo.h deleted file mode 100644 index 8e313a59b3..0000000000 --- a/thirdparty/embree-aarch64/common/sys/sysinfo.h +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#define CACHELINE_SIZE 64 - -#if !defined(PAGE_SIZE) - #define PAGE_SIZE 4096 -#endif - -#define PAGE_SIZE_2M (2*1024*1024) -#define PAGE_SIZE_4K (4*1024) - -#include "platform.h" - -/* define isa namespace and ISA bitvector */ -#if defined (__AVX512VL__) -# define isa avx512skx -# define ISA AVX512SKX -# define ISA_STR "AVX512SKX" -#elif defined (__AVX512F__) -# define isa avx512knl -# define ISA AVX512KNL -# define ISA_STR "AVX512KNL" -#elif defined (__AVX2__) -# define isa avx2 -# define ISA AVX2 -# define ISA_STR "AVX2" -#elif defined(__AVXI__) -# define isa avxi -# define ISA AVXI -# define ISA_STR "AVXI" -#elif defined(__AVX__) -# define isa avx -# define ISA AVX -# define ISA_STR "AVX" -#elif defined (__SSE4_2__) -# define isa sse42 -# define ISA SSE42 -# define ISA_STR "SSE4.2" -//#elif defined (__SSE4_1__) // we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11 -//# define isa sse41 -//# define ISA SSE41 -//# define ISA_STR "SSE4.1" -//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC -//# define isa ssse3 -//# define ISA SSSE3 -//# define ISA_STR "SSSE3" -//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang -//# define isa sse3 -//# define ISA SSE3 -//# define ISA_STR "SSE3" -#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) -# define isa sse2 -# define ISA SSE2 -# define ISA_STR "SSE2" -#elif defined(__SSE__) -# define isa sse -# define ISA SSE -# define ISA_STR "SSE" -#elif defined(__ARM_NEON) -// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment. -#define isa sse2 -#define ISA NEON -#define ISA_STR "NEON" -#else -#error Unknown ISA -#endif - -namespace embree -{ - enum class CPU - { - XEON_ICE_LAKE, - CORE_ICE_LAKE, - CORE_TIGER_LAKE, - CORE_COMET_LAKE, - CORE_CANNON_LAKE, - CORE_KABY_LAKE, - XEON_SKY_LAKE, - CORE_SKY_LAKE, - XEON_PHI_KNIGHTS_MILL, - XEON_PHI_KNIGHTS_LANDING, - XEON_BROADWELL, - CORE_BROADWELL, - XEON_HASWELL, - CORE_HASWELL, - XEON_IVY_BRIDGE, - CORE_IVY_BRIDGE, - SANDY_BRIDGE, - NEHALEM, - CORE2, - CORE1, - ARM, - UNKNOWN, - }; - - /*! get the full path to the running executable */ - std::string getExecutableFileName(); - - /*! return platform name */ - std::string getPlatformName(); - - /*! get the full name of the compiler */ - std::string getCompilerName(); - - /*! return the name of the CPU */ - std::string getCPUVendor(); - - /*! get microprocessor model */ - CPU getCPUModel(); - - /*! converts CPU model into string */ - std::string stringOfCPUModel(CPU model); - - /*! CPU features */ - static const int CPU_FEATURE_SSE = 1 << 0; - static const int CPU_FEATURE_SSE2 = 1 << 1; - static const int CPU_FEATURE_SSE3 = 1 << 2; - static const int CPU_FEATURE_SSSE3 = 1 << 3; - static const int CPU_FEATURE_SSE41 = 1 << 4; - static const int CPU_FEATURE_SSE42 = 1 << 5; - static const int CPU_FEATURE_POPCNT = 1 << 6; - static const int CPU_FEATURE_AVX = 1 << 7; - static const int CPU_FEATURE_F16C = 1 << 8; - static const int CPU_FEATURE_RDRAND = 1 << 9; - static const int CPU_FEATURE_AVX2 = 1 << 10; - static const int CPU_FEATURE_FMA3 = 1 << 11; - static const int CPU_FEATURE_LZCNT = 1 << 12; - static const int CPU_FEATURE_BMI1 = 1 << 13; - static const int CPU_FEATURE_BMI2 = 1 << 14; - static const int CPU_FEATURE_AVX512F = 1 << 16; - static const int CPU_FEATURE_AVX512DQ = 1 << 17; - static const int CPU_FEATURE_AVX512PF = 1 << 18; - static const int CPU_FEATURE_AVX512ER = 1 << 19; - static const int CPU_FEATURE_AVX512CD = 1 << 20; - static const int CPU_FEATURE_AVX512BW = 1 << 21; - static const int CPU_FEATURE_AVX512VL = 1 << 22; - static const int CPU_FEATURE_AVX512IFMA = 1 << 23; - static const int CPU_FEATURE_AVX512VBMI = 1 << 24; - static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; - static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; - static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; - static const int CPU_FEATURE_NEON = 1 << 28; - static const int CPU_FEATURE_NEON_2X = 1 << 29; - - /*! get CPU features */ - int getCPUFeatures(); - - /*! convert CPU features into a string */ - std::string stringOfCPUFeatures(int features); - - /*! creates a string of all supported targets that are supported */ - std::string supportedTargetList (int isa); - - /*! ISAs */ - static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; - static const int SSE2 = SSE | CPU_FEATURE_SSE2; - static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; - static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; - static const int SSE41 = SSSE3 | CPU_FEATURE_SSE41; - static const int SSE42 = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT; - static const int AVX = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED; - static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND; - static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; - static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED; - static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; - static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2; - static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2; - - /*! converts ISA bitvector into a string */ - std::string stringOfISA(int features); - - /*! return the number of logical threads of the system */ - unsigned int getNumberOfLogicalThreads(); - - /*! returns the size of the terminal window in characters */ - int getTerminalWidth(); - - /*! returns performance counter in seconds */ - double getSeconds(); - - /*! sleeps the specified number of seconds */ - void sleepSeconds(double t); - - /*! returns virtual address space occupied by process */ - size_t getVirtualMemoryBytes(); - - /*! returns resident memory required by process */ - size_t getResidentMemoryBytes(); -} diff --git a/thirdparty/embree-aarch64/common/sys/thread.cpp b/thirdparty/embree-aarch64/common/sys/thread.cpp deleted file mode 100644 index f9ea5b7d96..0000000000 --- a/thirdparty/embree-aarch64/common/sys/thread.cpp +++ /dev/null @@ -1,429 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "thread.h" -#include "sysinfo.h" -#include "string.h" - -#include -#if defined(__ARM_NEON) -#include "../math/SSE2NEON.h" -#else -#include -#endif - -#if defined(PTHREADS_WIN32) -#pragma comment (lib, "pthreadVC.lib") -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Windows Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__WIN32__) - -#define WIN32_LEAN_AND_MEAN -#include - -namespace embree -{ - /*! set the affinity of a given thread */ - void setAffinity(HANDLE thread, ssize_t affinity) - { - typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); - typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); - typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY); - typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER); - HMODULE hlib = LoadLibrary("Kernel32"); - GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); - GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); - SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); - SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx"); - if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) - { - int groups = pGetActiveProcessorGroupCount(); - int totalProcessors = 0, group = 0, number = 0; - for (int i = 0; i affinity) { - group = i; - number = (int)affinity - totalProcessors; - break; - } - totalProcessors += processors; - } - - GROUP_AFFINITY groupAffinity; - groupAffinity.Group = (WORD)group; - groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); - groupAffinity.Reserved[0] = 0; - groupAffinity.Reserved[1] = 0; - groupAffinity.Reserved[2] = 0; - if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) - WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning - - PROCESSOR_NUMBER processorNumber; - processorNumber.Group = group; - processorNumber.Number = number; - processorNumber.Reserved = 0; - if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) - WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning - } - else - { - if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) - WARNING("SetThreadAffinityMask failed"); // on purpose only a warning - if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1) - WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning - } - } - - /*! set affinity of the calling thread */ - void setAffinity(ssize_t affinity) { - setAffinity(GetCurrentThread(), affinity); - } - - struct ThreadStartupData - { - public: - ThreadStartupData (thread_func f, void* arg) - : f(f), arg(arg) {} - public: - thread_func f; - void* arg; - }; - - DWORD WINAPI threadStartup(LPVOID ptr) - { - ThreadStartupData* parg = (ThreadStartupData*) ptr; - _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); - parg->f(parg->arg); - delete parg; - parg = nullptr; - return 0; - } - -#if !defined(PTHREADS_WIN32) - - /*! creates a hardware thread running on specific core */ - thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) - { - HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr); - if (thread == nullptr) FATAL("CreateThread failed"); - if (threadID >= 0) setAffinity(thread, threadID); - return thread_t(thread); - } - - /*! the thread calling this function gets yielded */ - void yield() { - SwitchToThread(); - } - - /*! waits until the given thread has terminated */ - void join(thread_t tid) { - WaitForSingleObject(HANDLE(tid), INFINITE); - CloseHandle(HANDLE(tid)); - } - - /*! creates thread local storage */ - tls_t createTls() { - return tls_t(size_t(TlsAlloc())); - } - - /*! set the thread local storage pointer */ - void setTls(tls_t tls, void* const ptr) { - TlsSetValue(DWORD(size_t(tls)), ptr); - } - - /*! return the thread local storage pointer */ - void* getTls(tls_t tls) { - return TlsGetValue(DWORD(size_t(tls))); - } - - /*! destroys thread local storage identifier */ - void destroyTls(tls_t tls) { - TlsFree(DWORD(size_t(tls))); - } -#endif -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Linux Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__LINUX__) - -#include -#include -#include - -#if defined(__ANDROID__) -#include -#endif - -namespace embree -{ - static MutexSys mutex; - static std::vector threadIDs; - -#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target - /* changes thread ID mapping such that we first fill up all thread on one core */ - size_t mapThreadID(size_t threadID) - { - Lock lock(mutex); - - if (threadIDs.size() == 0) - { - /* parse thread/CPU topology */ - for (size_t cpuID=0;;cpuID++) - { - std::fstream fs; - std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list"); - fs.open (cpu.c_str(), std::fstream::in); - if (fs.fail()) break; - - int i; - while (fs >> i) - { - if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) - threadIDs.push_back(i); - if (fs.peek() == ',') - fs.ignore(); - } - fs.close(); - } - -#if 0 - for (size_t i=0;i " << threadIDs[i] << std::endl; -#endif - - /* verify the mapping and do not use it if the mapping has errors */ - for (size_t i=0;i - -namespace embree -{ - /*! set affinity of the calling thread */ - void setAffinity(ssize_t affinity) - { - cpuset_t cset; - CPU_ZERO(&cset); - CPU_SET(affinity, &cset); - - pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); - } -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// MacOSX Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__MACOSX__) - -#include -#include -#include - -namespace embree -{ - /*! set affinity of the calling thread */ - void setAffinity(ssize_t affinity) - { - thread_affinity_policy ap; - ap.affinity_tag = affinity; - if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS) - WARNING("setting thread affinity failed"); // on purpose only a warning - } -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Unix Platform -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__UNIX__) || defined(PTHREADS_WIN32) - -#include -#include - -#if defined(__USE_NUMA__) -#include -#endif - -namespace embree -{ - struct ThreadStartupData - { - public: - ThreadStartupData (thread_func f, void* arg, int affinity) - : f(f), arg(arg), affinity(affinity) {} - public: - thread_func f; - void* arg; - ssize_t affinity; - }; - - static void* threadStartup(ThreadStartupData* parg) - { - _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); - - /*! Mac OS X does not support setting affinity at thread creation time */ -#if defined(__MACOSX__) - if (parg->affinity >= 0) - setAffinity(parg->affinity); -#endif - - parg->f(parg->arg); - delete parg; - parg = nullptr; - return nullptr; - } - - /*! creates a hardware thread running on specific core */ - thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) - { - /* set stack size */ - pthread_attr_t attr; - pthread_attr_init(&attr); - if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size); - - /* create thread */ - pthread_t* tid = new pthread_t; - if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { - pthread_attr_destroy(&attr); - delete tid; - FATAL("pthread_create failed"); - } - pthread_attr_destroy(&attr); - - /* set affinity */ -#if defined(__LINUX__) && !defined(__ANDROID__) - if (threadID >= 0) { - cpu_set_t cset; - CPU_ZERO(&cset); - threadID = mapThreadID(threadID); - CPU_SET(threadID, &cset); - pthread_setaffinity_np(*tid, sizeof(cset), &cset); - } -#elif defined(__FreeBSD__) - if (threadID >= 0) { - cpuset_t cset; - CPU_ZERO(&cset); - CPU_SET(threadID, &cset); - pthread_setaffinity_np(*tid, sizeof(cset), &cset); - } -#endif - - return thread_t(tid); - } - - /*! the thread calling this function gets yielded */ - void yield() { - sched_yield(); - } - - /*! waits until the given thread has terminated */ - void join(thread_t tid) { - if (pthread_join(*(pthread_t*)tid, nullptr) != 0) - FATAL("pthread_join failed"); - delete (pthread_t*)tid; - } - - /*! creates thread local storage */ - tls_t createTls() - { - pthread_key_t* key = new pthread_key_t; - if (pthread_key_create(key,nullptr) != 0) { - delete key; - FATAL("pthread_key_create failed"); - } - - return tls_t(key); - } - - /*! return the thread local storage pointer */ - void* getTls(tls_t tls) - { - assert(tls); - return pthread_getspecific(*(pthread_key_t*)tls); - } - - /*! set the thread local storage pointer */ - void setTls(tls_t tls, void* const ptr) - { - assert(tls); - if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) - FATAL("pthread_setspecific failed"); - } - - /*! destroys thread local storage identifier */ - void destroyTls(tls_t tls) - { - assert(tls); - if (pthread_key_delete(*(pthread_key_t*)tls) != 0) - FATAL("pthread_key_delete failed"); - delete (pthread_key_t*)tls; - } -} - -#endif diff --git a/thirdparty/embree-aarch64/common/sys/thread.h b/thirdparty/embree-aarch64/common/sys/thread.h deleted file mode 100644 index 45da6e6a70..0000000000 --- a/thirdparty/embree-aarch64/common/sys/thread.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform.h" -#include "mutex.h" -#include "alloc.h" -#include "vector.h" -#include - -namespace embree -{ - /*! type for thread */ - typedef struct opaque_thread_t* thread_t; - - /*! signature of thread start function */ - typedef void (*thread_func)(void*); - - /*! creates a hardware thread running on specific logical thread */ - thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1); - - /*! set affinity of the calling thread */ - void setAffinity(ssize_t affinity); - - /*! the thread calling this function gets yielded */ - void yield(); - - /*! waits until the given thread has terminated */ - void join(thread_t tid); - - /*! type for handle to thread local storage */ - typedef struct opaque_tls_t* tls_t; - - /*! creates thread local storage */ - tls_t createTls(); - - /*! set the thread local storage pointer */ - void setTls(tls_t tls, void* const ptr); - - /*! return the thread local storage pointer */ - void* getTls(tls_t tls); - - /*! destroys thread local storage identifier */ - void destroyTls(tls_t tls); -} diff --git a/thirdparty/embree-aarch64/common/sys/vector.h b/thirdparty/embree-aarch64/common/sys/vector.h deleted file mode 100644 index e41794de7c..0000000000 --- a/thirdparty/embree-aarch64/common/sys/vector.h +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "alloc.h" -#include - -namespace embree -{ - template - class vector_t - { - public: - typedef T value_type; - typedef T* iterator; - typedef const T* const_iterator; - - __forceinline vector_t () - : size_active(0), size_alloced(0), items(nullptr) {} - - __forceinline explicit vector_t (size_t sz) - : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } - - template - __forceinline explicit vector_t (M alloc, size_t sz) - : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } - - __forceinline ~vector_t() { - clear(); - } - - __forceinline vector_t (const vector_t& other) - { - size_active = other.size_active; - size_alloced = other.size_alloced; - items = alloc.allocate(size_alloced); - for (size_t i=0; i 0); return items[0]; }; - __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }; - - __forceinline T* data() { return items; }; - __forceinline const T* data() const { return items; }; - - - /******************** Modifiers **************************/ - - __forceinline void push_back(const T& nt) - { - const T v = nt; // need local copy as input reference could point to this vector - internal_resize(size_active,internal_grow_size(size_active+1)); - ::new (&items[size_active++]) T(v); - } - - __forceinline void pop_back() - { - assert(!empty()); - size_active--; - alloc.destroy(&items[size_active]); - } - - __forceinline void clear() - { - /* destroy elements */ - for (size_t i=0; i - using vector = vector_t>; - - /*! vector class that performs aligned allocations */ - template - using avector = vector_t::value> >; - - /*! vector class that performs OS allocations */ - template - using ovector = vector_t >; -} diff --git a/thirdparty/embree-aarch64/common/tasking/taskscheduler.h b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h deleted file mode 100644 index 9940e068d0..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskscheduler.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#if defined(TASKING_INTERNAL) -# include "taskschedulerinternal.h" -#elif defined(TASKING_GCD) && defined(BUILD_IOS) -# include "taskschedulergcd.h" -#elif defined(TASKING_TBB) -# include "taskschedulertbb.h" -#elif defined(TASKING_PPL) -# include "taskschedulerppl.h" -#else -# error "no tasking system enabled" -#endif - diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h deleted file mode 100644 index d31f8bb478..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once - -#include "../sys/platform.h" -#include "../sys/alloc.h" -#include "../sys/barrier.h" -#include "../sys/thread.h" -#include "../sys/mutex.h" -#include "../sys/condition.h" -#include "../sys/ref.h" - -#include - -namespace embree -{ - struct TaskScheduler - { - /*! initializes the task scheduler */ - static void create(size_t numThreads, bool set_affinity, bool start_threads); - - /*! destroys the task scheduler again */ - static void destroy() {} - - /* returns the ID of the current thread */ - static __forceinline size_t threadID() - { - return threadIndex(); - } - - /* returns the index (0..threadCount-1) of the current thread */ - static __forceinline size_t threadIndex() - { - currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads; - return currentThreadIndex; - } - - /* returns the total number of threads */ - static __forceinline size_t threadCount() - { - return GCDNumThreads; - } - - private: - static size_t GCDNumThreads; - static size_t currentThreadIndex; - - }; - -}; - diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp deleted file mode 100644 index ebf656d1a0..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp +++ /dev/null @@ -1,426 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "taskschedulerinternal.h" -#include "../math/math.h" -#include "../sys/sysinfo.h" -#include - -namespace embree -{ - RTC_NAMESPACE_BEGIN - - static MutexSys g_mutex; - size_t TaskScheduler::g_numThreads = 0; - __thread TaskScheduler* TaskScheduler::g_instance = nullptr; - std::vector> g_instance_vector; - __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr; - TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr; - - template - __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body) - { - while (true) - { - /*! some rounds that yield */ - for (size_t i=0; i<32; i++) - { - /*! some spinning rounds */ - const size_t threadCount = thread.threadCount(); - for (size_t j=0; j<1024; j+=threadCount) - { - if (!pred()) return; - if (thread.scheduler->steal_from_other_threads(thread)) { - i=j=0; - body(); - } - } - yield(); - } - } - } - - /*! run this task */ - void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible - { - /* try to run if not already stolen */ - if (try_switch_state(INITIALIZED,DONE)) - { - Task* prevTask = thread.task; - thread.task = this; - // -- GODOT start -- - // try { - // if (thread.scheduler->cancellingException == nullptr) - closure->execute(); - // } catch (...) { - // if (thread.scheduler->cancellingException == nullptr) - // thread.scheduler->cancellingException = std::current_exception(); - // } - // -- GODOT end -- - thread.task = prevTask; - add_dependencies(-1); - } - - /* steal until all dependencies have completed */ - steal_loop(thread, - [&] () { return dependencies>0; }, - [&] () { while (thread.tasks.execute_local_internal(thread,this)); }); - - /* now signal our parent task that we are finished */ - if (parent) - parent->add_dependencies(-1); - } - - /*! run this task */ - dll_export void TaskScheduler::Task::run (Thread& thread) { - run_internal(thread); - } - - bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent) - { - /* stop if we run out of local tasks or reach the waiting task */ - if (right == 0 || &tasks[right-1] == parent) - return false; - - /* execute task */ - size_t oldRight = right; - tasks[right-1].run_internal(thread); - if (right != oldRight) { - THROW_RUNTIME_ERROR("you have to wait for spawned subtasks"); - } - - /* pop task and closure from stack */ - right--; - if (tasks[right].stackPtr != size_t(-1)) - stackPtr = tasks[right].stackPtr; - - /* also move left pointer */ - if (left >= right) left.store(right.load()); - - return right != 0; - } - - dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) { - return execute_local_internal(thread,parent); - } - - bool TaskScheduler::TaskQueue::steal(Thread& thread) - { - size_t l = left; - size_t r = right; - if (l < r) - { - l = left++; - if (l >= r) - return false; - } - else - return false; - - if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right])) - return false; - - thread.tasks.right++; - return true; - } - - /* we steal from the left */ - size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft() - { - if (left >= right) return 0; - return tasks[left].N; - } - - void threadPoolFunction(std::pair* pair) - { - TaskScheduler::ThreadPool* pool = pair->first; - size_t threadIndex = pair->second; - delete pair; - pool->thread_loop(threadIndex); - } - - TaskScheduler::ThreadPool::ThreadPool(bool set_affinity) - : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {} - - dll_export void TaskScheduler::ThreadPool::startThreads() - { - if (running) return; - setNumThreads(numThreads,true); - } - - void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads) - { - Lock lock(g_mutex); - assert(newNumThreads); - newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads()); - - // We are observing a few % gain by increasing number threads by 2 on aarch64. -#if defined(__aarch64__) && defined(BUILD_IOS) - numThreads = newNumThreads*2; -#else - numThreads = newNumThreads; -#endif - numThreads = newNumThreads; - if (!startThreads && !running) return; - running = true; - size_t numThreadsActive = numThreadsRunning; - - mutex.lock(); - numThreadsRunning = newNumThreads; - mutex.unlock(); - condition.notify_all(); - - /* start new threads */ - for (size_t t=numThreadsActive; t(this,t); - threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1)); - } - - /* stop some threads if we reduce the number of threads */ - for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) { - if (t == 0) continue; - embree::join(threads.back()); - threads.pop_back(); - } - } - - TaskScheduler::ThreadPool::~ThreadPool() - { - /* leave all taskschedulers */ - mutex.lock(); - numThreadsRunning = 0; - mutex.unlock(); - condition.notify_all(); - - /* wait for threads to terminate */ - for (size_t i=0; i& scheduler) - { - mutex.lock(); - schedulers.push_back(scheduler); - mutex.unlock(); - condition.notify_all(); - } - - dll_export void TaskScheduler::ThreadPool::remove(const Ref& scheduler) - { - Lock lock(mutex); - for (std::list >::iterator it = schedulers.begin(); it != schedulers.end(); it++) { - if (scheduler == *it) { - schedulers.erase(it); - return; - } - } - } - - void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex) - { - while (globalThreadIndex < numThreadsRunning) - { - Ref scheduler = NULL; - ssize_t threadIndex = -1; - { - Lock lock(mutex); - condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); }); - if (globalThreadIndex >= numThreadsRunning) break; - scheduler = schedulers.front(); - threadIndex = scheduler->allocThreadIndex(); - } - scheduler->thread_loop(threadIndex); - } - } - - TaskScheduler::TaskScheduler() - : threadCounter(0), anyTasksRunning(0), hasRootTask(false) - { - threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x. - for (size_t i=0; ithreadIndex; - else return 0; - } - - dll_export size_t TaskScheduler::threadIndex() - { - Thread* thread = TaskScheduler::thread(); - if (thread) return thread->threadIndex; - else return 0; - } - - dll_export size_t TaskScheduler::threadCount() { - return threadPool->size(); - } - - dll_export TaskScheduler* TaskScheduler::instance() - { - if (g_instance == NULL) { - Lock lock(g_mutex); - g_instance = new TaskScheduler; - g_instance_vector.push_back(g_instance); - } - return g_instance; - } - - void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads) - { - if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity); - threadPool->setNumThreads(numThreads,start_threads); - } - - void TaskScheduler::destroy() { - delete threadPool; threadPool = nullptr; - } - - dll_export ssize_t TaskScheduler::allocThreadIndex() - { - size_t threadIndex = threadCounter++; - assert(threadIndex < threadLocal.size()); - return threadIndex; - } - - void TaskScheduler::join() - { - mutex.lock(); - size_t threadIndex = allocThreadIndex(); - condition.wait(mutex, [&] () { return hasRootTask.load(); }); - mutex.unlock(); - // -- GODOT start -- - // std::exception_ptr except = thread_loop(threadIndex); - // if (except != nullptr) std::rethrow_exception(except); - thread_loop(threadIndex); - // -- GODOT end -- - } - - void TaskScheduler::reset() { - hasRootTask = false; - } - - void TaskScheduler::wait_for_threads(size_t threadCount) - { - while (threadCounter < threadCount-1) - pause_cpu(); - } - - dll_export TaskScheduler::Thread* TaskScheduler::thread() { - return thread_local_thread; - } - - dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread) - { - Thread* old = thread_local_thread; - thread_local_thread = thread; - return old; - } - - dll_export bool TaskScheduler::wait() - { - Thread* thread = TaskScheduler::thread(); - if (thread == nullptr) return true; - while (thread->tasks.execute_local_internal(*thread,thread->task)) {}; - return thread->scheduler->cancellingException == nullptr; - } - -// -- GODOT start -- -// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) - void TaskScheduler::thread_loop(size_t threadIndex) -// -- GODOT end -- - { - /* allocate thread structure */ - std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation - Thread& thread = *mthread; - threadLocal[threadIndex].store(&thread); - Thread* oldThread = swapThread(&thread); - - /* main thread loop */ - while (anyTasksRunning) - { - steal_loop(thread, - [&] () { return anyTasksRunning > 0; }, - [&] () { - anyTasksRunning++; - while (thread.tasks.execute_local_internal(thread,nullptr)); - anyTasksRunning--; - }); - } - threadLocal[threadIndex].store(nullptr); - swapThread(oldThread); - - /* remember exception to throw */ - // -- GODOT start -- - // std::exception_ptr except = nullptr; - // if (cancellingException != nullptr) except = cancellingException; - // -- GODOT end -- - /* wait for all threads to terminate */ - threadCounter--; -#if defined(__WIN32__) - size_t loopIndex = 1; -#endif -#define LOOP_YIELD_THRESHOLD (4096) - while (threadCounter > 0) { -#if defined(__WIN32__) - if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0) - yield(); - else - _mm_pause(); - loopIndex++; -#else - yield(); -#endif - } - // -- GODOT start -- - // return except; - return; - // -- GODOT end -- - } - - bool TaskScheduler::steal_from_other_threads(Thread& thread) - { - const size_t threadIndex = thread.threadIndex; - const size_t threadCount = this->threadCounter; - - for (size_t i=1; i= threadCount) otherThreadIndex -= threadCount; - - Thread* othread = threadLocal[otherThreadIndex].load(); - if (!othread) - continue; - - if (othread->tasks.steal(thread)) - return true; - } - - return false; - } - - dll_export void TaskScheduler::startThreads() { - threadPool->startThreads(); - } - - dll_export void TaskScheduler::addScheduler(const Ref& scheduler) { - threadPool->add(scheduler); - } - - dll_export void TaskScheduler::removeScheduler(const Ref& scheduler) { - threadPool->remove(scheduler); - } - - RTC_NAMESPACE_END -} diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h deleted file mode 100644 index 8bd70b2b8c..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h +++ /dev/null @@ -1,386 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/alloc.h" -#include "../sys/barrier.h" -#include "../sys/thread.h" -#include "../sys/mutex.h" -#include "../sys/condition.h" -#include "../sys/ref.h" -#include "../sys/atomic.h" -#include "../math/range.h" -#include "../../include/embree3/rtcore.h" - -#include - -namespace embree -{ - - /* The tasking system exports some symbols to be used by the tutorials. Thus we - hide is also in the API namespace when requested. */ - RTC_NAMESPACE_BEGIN - - struct TaskScheduler : public RefCount - { - ALIGNED_STRUCT_(64); - friend class Device; - - static const size_t TASK_STACK_SIZE = 4*1024; //!< task structure stack - static const size_t CLOSURE_STACK_SIZE = 512*1024; //!< stack for task closures - - struct Thread; - - /*! virtual interface for all tasks */ - struct TaskFunction { - virtual void execute() = 0; - }; - - /*! builds a task interface from a closure */ - template - struct ClosureTaskFunction : public TaskFunction - { - Closure closure; - __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {} - void execute() { closure(); }; - }; - - struct __aligned(64) Task - { - /*! states a task can be in */ - enum { DONE, INITIALIZED }; - - /*! switch from one state to another */ - __forceinline void switch_state(int from, int to) - { - __memory_barrier(); - MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to); - assert(success); - } - - /*! try to switch from one state to another */ - __forceinline bool try_switch_state(int from, int to) { - __memory_barrier(); - return state.compare_exchange_strong(from,to); - } - - /*! increment/decrement dependency counter */ - void add_dependencies(int n) { - dependencies+=n; - } - - /*! initialize all tasks to DONE state by default */ - __forceinline Task() - : state(DONE) {} - - /*! construction of new task */ - __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N) - : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N) - { - if (parent) parent->add_dependencies(+1); - switch_state(DONE,INITIALIZED); - } - - /*! construction of stolen task, stealing thread will decrement initial dependency */ - __forceinline Task (TaskFunction* closure, Task* parent) - : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1) - { - switch_state(DONE,INITIALIZED); - } - - /*! try to steal this task */ - bool try_steal(Task& child) - { - if (!stealable) return false; - if (!try_switch_state(INITIALIZED,DONE)) return false; - new (&child) Task(closure, this); - return true; - } - - /*! run this task */ - dll_export void run(Thread& thread); - - void run_internal(Thread& thread); - - public: - std::atomic state; //!< state this task is in - std::atomic dependencies; //!< dependencies to wait for - std::atomic stealable; //!< true if task can be stolen - TaskFunction* closure; //!< the closure to execute - Task* parent; //!< parent task to signal when we are finished - size_t stackPtr; //!< stack location where closure is stored - size_t N; //!< approximative size of task - }; - - struct TaskQueue - { - TaskQueue () - : left(0), right(0), stackPtr(0) {} - - __forceinline void* alloc(size_t bytes, size_t align = 64) - { - size_t ofs = bytes + ((align - stackPtr) & (align-1)); - if (stackPtr + ofs > CLOSURE_STACK_SIZE) - // -- GODOT start -- - // throw std::runtime_error("closure stack overflow"); - abort(); - // -- GODOT end -- - stackPtr += ofs; - return &stack[stackPtr-bytes]; - } - - template - __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) - { - if (right >= TASK_STACK_SIZE) - // -- GODOT start -- - // throw std::runtime_error("task stack overflow"); - abort(); - // -- GODOT end -- - - /* allocate new task on right side of stack */ - size_t oldStackPtr = stackPtr; - TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction))) ClosureTaskFunction(closure); - /* gcc 8 or later fails to compile without explicit .load() */ - new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size); - right++; - - /* also move left pointer */ - if (left >= right-1) left = right-1; - } - - dll_export bool execute_local(Thread& thread, Task* parent); - bool execute_local_internal(Thread& thread, Task* parent); - bool steal(Thread& thread); - size_t getTaskSizeAtLeft(); - - bool empty() { return right == 0; } - - public: - - /* task stack */ - Task tasks[TASK_STACK_SIZE]; - __aligned(64) std::atomic left; //!< threads steal from left - __aligned(64) std::atomic right; //!< new tasks are added to the right - - /* closure stack */ - __aligned(64) char stack[CLOSURE_STACK_SIZE]; - size_t stackPtr; - }; - - /*! thread local structure for each thread */ - struct Thread - { - ALIGNED_STRUCT_(64); - - Thread (size_t threadIndex, const Ref& scheduler) - : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {} - - __forceinline size_t threadCount() { - return scheduler->threadCounter; - } - - size_t threadIndex; //!< ID of this thread - TaskQueue tasks; //!< local task queue - Task* task; //!< current active task - Ref scheduler; //!< pointer to task scheduler - }; - - /*! pool of worker threads */ - struct ThreadPool - { - ThreadPool (bool set_affinity); - ~ThreadPool (); - - /*! starts the threads */ - dll_export void startThreads(); - - /*! sets number of threads to use */ - void setNumThreads(size_t numThreads, bool startThreads = false); - - /*! adds a task scheduler object for scheduling */ - dll_export void add(const Ref& scheduler); - - /*! remove the task scheduler object again */ - dll_export void remove(const Ref& scheduler); - - /*! returns number of threads of the thread pool */ - size_t size() const { return numThreads; } - - /*! main loop for all threads */ - void thread_loop(size_t threadIndex); - - private: - std::atomic numThreads; - std::atomic numThreadsRunning; - bool set_affinity; - std::atomic running; - std::vector threads; - - private: - MutexSys mutex; - ConditionSys condition; - std::list > schedulers; - }; - - TaskScheduler (); - ~TaskScheduler (); - - /*! initializes the task scheduler */ - static void create(size_t numThreads, bool set_affinity, bool start_threads); - - /*! destroys the task scheduler again */ - static void destroy(); - - /*! lets new worker threads join the tasking system */ - void join(); - void reset(); - - /*! let a worker thread allocate a thread index */ - dll_export ssize_t allocThreadIndex(); - - /*! wait for some number of threads available (threadCount includes main thread) */ - void wait_for_threads(size_t threadCount); - - /*! thread loop for all worker threads */ - // -- GODOT start -- - // std::exception_ptr thread_loop(size_t threadIndex); - void thread_loop(size_t threadIndex); - // -- GODOT end -- - - /*! steals a task from a different thread */ - bool steal_from_other_threads(Thread& thread); - - template - static void steal_loop(Thread& thread, const Predicate& pred, const Body& body); - - /* spawn a new task at the top of the threads task stack */ - template - void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true) - { - if (useThreadPool) startThreads(); - - size_t threadIndex = allocThreadIndex(); - std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation - Thread& thread = *mthread; - assert(threadLocal[threadIndex].load() == nullptr); - threadLocal[threadIndex] = &thread; - Thread* oldThread = swapThread(&thread); - thread.tasks.push_right(thread,size,closure); - { - Lock lock(mutex); - anyTasksRunning++; - hasRootTask = true; - condition.notify_all(); - } - - if (useThreadPool) addScheduler(this); - - while (thread.tasks.execute_local(thread,nullptr)); - anyTasksRunning--; - if (useThreadPool) removeScheduler(this); - - threadLocal[threadIndex] = nullptr; - swapThread(oldThread); - - /* remember exception to throw */ - std::exception_ptr except = nullptr; - if (cancellingException != nullptr) except = cancellingException; - - /* wait for all threads to terminate */ - threadCounter--; - while (threadCounter > 0) yield(); - cancellingException = nullptr; - - /* re-throw proper exception */ - if (except != nullptr) - std::rethrow_exception(except); - } - - /* spawn a new task at the top of the threads task stack */ - template - static __forceinline void spawn(size_t size, const Closure& closure) - { - Thread* thread = TaskScheduler::thread(); - if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure); - else instance()->spawn_root(closure,size); - } - - /* spawn a new task at the top of the threads task stack */ - template - static __forceinline void spawn(const Closure& closure) { - spawn(1,closure); - } - - /* spawn a new task set */ - template - static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure) - { - spawn(end-begin, [=]() - { - if (end-begin <= blockSize) { - return closure(range(begin,end)); - } - const Index center = (begin+end)/2; - spawn(begin,center,blockSize,closure); - spawn(center,end ,blockSize,closure); - wait(); - }); - } - - /* work on spawned subtasks and wait until all have finished */ - dll_export static bool wait(); - - /* returns the ID of the current thread */ - dll_export static size_t threadID(); - - /* returns the index (0..threadCount-1) of the current thread */ - dll_export static size_t threadIndex(); - - /* returns the total number of threads */ - dll_export static size_t threadCount(); - - private: - - /* returns the thread local task list of this worker thread */ - dll_export static Thread* thread(); - - /* sets the thread local task list of this worker thread */ - dll_export static Thread* swapThread(Thread* thread); - - /*! returns the taskscheduler object to be used by the master thread */ - dll_export static TaskScheduler* instance(); - - /*! starts the threads */ - dll_export static void startThreads(); - - /*! adds a task scheduler object for scheduling */ - dll_export static void addScheduler(const Ref& scheduler); - - /*! remove the task scheduler object again */ - dll_export static void removeScheduler(const Ref& scheduler); - - private: - std::vector> threadLocal; - std::atomic threadCounter; - std::atomic anyTasksRunning; - std::atomic hasRootTask; - std::exception_ptr cancellingException; - MutexSys mutex; - ConditionSys condition; - - private: - static size_t g_numThreads; - static __thread TaskScheduler* g_instance; - static __thread Thread* thread_local_thread; - static ThreadPool* threadPool; - }; - - RTC_NAMESPACE_END - -#if defined(RTC_NAMESPACE) - using RTC_NAMESPACE::TaskScheduler; -#endif -} diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h deleted file mode 100644 index 776f98cdac..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/alloc.h" -#include "../sys/barrier.h" -#include "../sys/thread.h" -#include "../sys/mutex.h" -#include "../sys/condition.h" -#include "../sys/ref.h" - -#if !defined(__WIN32__) -#error PPL tasking system only available under windows -#endif - -#include - -namespace embree -{ - struct TaskScheduler - { - /*! initializes the task scheduler */ - static void create(size_t numThreads, bool set_affinity, bool start_threads); - - /*! destroys the task scheduler again */ - static void destroy(); - - /* returns the ID of the current thread */ - static __forceinline size_t threadID() { - return GetCurrentThreadId(); - } - - /* returns the index (0..threadCount-1) of the current thread */ - /* FIXME: threadIndex is NOT supported by PPL! */ - static __forceinline size_t threadIndex() { - return 0; - } - - /* returns the total number of threads */ - static __forceinline size_t threadCount() { - return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1; - } - }; -}; diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h deleted file mode 100644 index 98dba26871..0000000000 --- a/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../sys/platform.h" -#include "../sys/alloc.h" -#include "../sys/barrier.h" -#include "../sys/thread.h" -#include "../sys/mutex.h" -#include "../sys/condition.h" -#include "../sys/ref.h" - -#if defined(__WIN32__) -# define NOMINMAX -#endif - -// We need to define these to avoid implicit linkage against -// tbb_debug.lib under Windows. When removing these lines debug build -// under Windows fails. -#define __TBB_NO_IMPLICIT_LINKAGE 1 -#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 -#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 -#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1 -#include "tbb/tbb.h" -#include "tbb/parallel_sort.h" - -namespace embree -{ - struct TaskScheduler - { - /*! initializes the task scheduler */ - static void create(size_t numThreads, bool set_affinity, bool start_threads); - - /*! destroys the task scheduler again */ - static void destroy(); - - /* returns the ID of the current thread */ - static __forceinline size_t threadID() - { - return threadIndex(); - } - - /* returns the index (0..threadCount-1) of the current thread */ - static __forceinline size_t threadIndex() - { -#if TBB_INTERFACE_VERSION >= 9100 - return tbb::this_task_arena::current_thread_index(); -#elif TBB_INTERFACE_VERSION >= 9000 - return tbb::task_arena::current_thread_index(); -#else - return 0; -#endif - } - - /* returns the total number of threads */ - static __forceinline size_t threadCount() { -#if TBB_INTERFACE_VERSION >= 9100 - return tbb::this_task_arena::max_concurrency(); -#else - return tbb::task_scheduler_init::default_num_threads(); -#endif - } - - }; - -}; diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore.h b/thirdparty/embree-aarch64/include/embree3/rtcore.h deleted file mode 100644 index 5830bb5880..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore.h +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "rtcore_config.h" -#include "rtcore_common.h" -#include "rtcore_device.h" -#include "rtcore_buffer.h" -#include "rtcore_ray.h" -#include "rtcore_geometry.h" -#include "rtcore_scene.h" -#include "rtcore_builder.h" -#include "rtcore_quaternion.h" diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h b/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h deleted file mode 100644 index 400b604aa5..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "rtcore_device.h" - -RTC_NAMESPACE_BEGIN - -/* Types of buffers */ -enum RTCBufferType -{ - RTC_BUFFER_TYPE_INDEX = 0, - RTC_BUFFER_TYPE_VERTEX = 1, - RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE = 2, - RTC_BUFFER_TYPE_NORMAL = 3, - RTC_BUFFER_TYPE_TANGENT = 4, - RTC_BUFFER_TYPE_NORMAL_DERIVATIVE = 5, - - RTC_BUFFER_TYPE_GRID = 8, - - RTC_BUFFER_TYPE_FACE = 16, - RTC_BUFFER_TYPE_LEVEL = 17, - RTC_BUFFER_TYPE_EDGE_CREASE_INDEX = 18, - RTC_BUFFER_TYPE_EDGE_CREASE_WEIGHT = 19, - RTC_BUFFER_TYPE_VERTEX_CREASE_INDEX = 20, - RTC_BUFFER_TYPE_VERTEX_CREASE_WEIGHT = 21, - RTC_BUFFER_TYPE_HOLE = 22, - - RTC_BUFFER_TYPE_FLAGS = 32 -}; - -/* Opaque buffer type */ -typedef struct RTCBufferTy* RTCBuffer; - -/* Creates a new buffer. */ -RTC_API RTCBuffer rtcNewBuffer(RTCDevice device, size_t byteSize); - -/* Creates a new shared buffer. */ -RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice device, void* ptr, size_t byteSize); - -/* Returns a pointer to the buffer data. */ -RTC_API void* rtcGetBufferData(RTCBuffer buffer); - -/* Retains the buffer (increments the reference count). */ -RTC_API void rtcRetainBuffer(RTCBuffer buffer); - -/* Releases the buffer (decrements the reference count). */ -RTC_API void rtcReleaseBuffer(RTCBuffer buffer); - -RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h b/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h deleted file mode 100644 index d62a7f72cc..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "rtcore_scene.h" - -RTC_NAMESPACE_BEGIN - -/* Opaque BVH type */ -typedef struct RTCBVHTy* RTCBVH; - -/* Input build primitives for the builder */ -struct RTC_ALIGN(32) RTCBuildPrimitive -{ - float lower_x, lower_y, lower_z; - unsigned int geomID; - float upper_x, upper_y, upper_z; - unsigned int primID; -}; - -/* Opaque thread local allocator type */ -typedef struct RTCThreadLocalAllocatorTy* RTCThreadLocalAllocator; - -/* Callback to create a node */ -typedef void* (*RTCCreateNodeFunction) (RTCThreadLocalAllocator allocator, unsigned int childCount, void* userPtr); - -/* Callback to set the pointer to all children */ -typedef void (*RTCSetNodeChildrenFunction) (void* nodePtr, void** children, unsigned int childCount, void* userPtr); - -/* Callback to set the bounds of all children */ -typedef void (*RTCSetNodeBoundsFunction) (void* nodePtr, const struct RTCBounds** bounds, unsigned int childCount, void* userPtr); - -/* Callback to create a leaf node */ -typedef void* (*RTCCreateLeafFunction) (RTCThreadLocalAllocator allocator, const struct RTCBuildPrimitive* primitives, size_t primitiveCount, void* userPtr); - -/* Callback to split a build primitive */ -typedef void (*RTCSplitPrimitiveFunction) (const struct RTCBuildPrimitive* primitive, unsigned int dimension, float position, struct RTCBounds* leftBounds, struct RTCBounds* rightBounds, void* userPtr); - -/* Build flags */ -enum RTCBuildFlags -{ - RTC_BUILD_FLAG_NONE = 0, - RTC_BUILD_FLAG_DYNAMIC = (1 << 0), -}; - -enum RTCBuildConstants -{ - RTC_BUILD_MAX_PRIMITIVES_PER_LEAF = 32 -}; - -/* Input for builders */ -struct RTCBuildArguments -{ - size_t byteSize; - - enum RTCBuildQuality buildQuality; - enum RTCBuildFlags buildFlags; - unsigned int maxBranchingFactor; - unsigned int maxDepth; - unsigned int sahBlockSize; - unsigned int minLeafSize; - unsigned int maxLeafSize; - float traversalCost; - float intersectionCost; - - RTCBVH bvh; - struct RTCBuildPrimitive* primitives; - size_t primitiveCount; - size_t primitiveArrayCapacity; - - RTCCreateNodeFunction createNode; - RTCSetNodeChildrenFunction setNodeChildren; - RTCSetNodeBoundsFunction setNodeBounds; - RTCCreateLeafFunction createLeaf; - RTCSplitPrimitiveFunction splitPrimitive; - RTCProgressMonitorFunction buildProgress; - void* userPtr; -}; - -/* Returns the default build settings. */ -RTC_FORCEINLINE struct RTCBuildArguments rtcDefaultBuildArguments() -{ - struct RTCBuildArguments args; - args.byteSize = sizeof(args); - args.buildQuality = RTC_BUILD_QUALITY_MEDIUM; - args.buildFlags = RTC_BUILD_FLAG_NONE; - args.maxBranchingFactor = 2; - args.maxDepth = 32; - args.sahBlockSize = 1; - args.minLeafSize = 1; - args.maxLeafSize = RTC_BUILD_MAX_PRIMITIVES_PER_LEAF; - args.traversalCost = 1.0f; - args.intersectionCost = 1.0f; - args.bvh = NULL; - args.primitives = NULL; - args.primitiveCount = 0; - args.primitiveArrayCapacity = 0; - args.createNode = NULL; - args.setNodeChildren = NULL; - args.setNodeBounds = NULL; - args.createLeaf = NULL; - args.splitPrimitive = NULL; - args.buildProgress = NULL; - args.userPtr = NULL; - return args; -} - -/* Creates a new BVH. */ -RTC_API RTCBVH rtcNewBVH(RTCDevice device); - -/* Builds a BVH. */ -RTC_API void* rtcBuildBVH(const struct RTCBuildArguments* args); - -/* Allocates memory using the thread local allocator. */ -RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator allocator, size_t bytes, size_t align); - -/* Retains the BVH (increments reference count). */ -RTC_API void rtcRetainBVH(RTCBVH bvh); - -/* Releases the BVH (decrements reference count). */ -RTC_API void rtcReleaseBVH(RTCBVH bvh); - -RTC_NAMESPACE_END - diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_common.h b/thirdparty/embree-aarch64/include/embree3/rtcore_common.h deleted file mode 100644 index 890e06faa3..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore_common.h +++ /dev/null @@ -1,326 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include - -#include "rtcore_config.h" - -RTC_NAMESPACE_BEGIN - -#if defined(_WIN32) -#if defined(_M_X64) -typedef long long ssize_t; -#else -typedef int ssize_t; -#endif -#endif - -#if defined(_WIN32) && !defined(__MINGW32__) -# define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) -#else -# define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__))) -#endif - -#if !defined (RTC_DEPRECATED) -#ifdef __GNUC__ - #define RTC_DEPRECATED __attribute__((deprecated)) -#elif defined(_MSC_VER) - #define RTC_DEPRECATED __declspec(deprecated) -#else - #define RTC_DEPRECATED -#endif -#endif - -#if defined(_WIN32) -# define RTC_FORCEINLINE __forceinline -#else -# define RTC_FORCEINLINE inline __attribute__((always_inline)) -#endif - -/* Invalid geometry ID */ -#define RTC_INVALID_GEOMETRY_ID ((unsigned int)-1) - -/* Maximum number of time steps */ -#define RTC_MAX_TIME_STEP_COUNT 129 - -/* Formats of buffers and other data structures */ -enum RTCFormat -{ - RTC_FORMAT_UNDEFINED = 0, - - /* 8-bit unsigned integer */ - RTC_FORMAT_UCHAR = 0x1001, - RTC_FORMAT_UCHAR2, - RTC_FORMAT_UCHAR3, - RTC_FORMAT_UCHAR4, - - /* 8-bit signed integer */ - RTC_FORMAT_CHAR = 0x2001, - RTC_FORMAT_CHAR2, - RTC_FORMAT_CHAR3, - RTC_FORMAT_CHAR4, - - /* 16-bit unsigned integer */ - RTC_FORMAT_USHORT = 0x3001, - RTC_FORMAT_USHORT2, - RTC_FORMAT_USHORT3, - RTC_FORMAT_USHORT4, - - /* 16-bit signed integer */ - RTC_FORMAT_SHORT = 0x4001, - RTC_FORMAT_SHORT2, - RTC_FORMAT_SHORT3, - RTC_FORMAT_SHORT4, - - /* 32-bit unsigned integer */ - RTC_FORMAT_UINT = 0x5001, - RTC_FORMAT_UINT2, - RTC_FORMAT_UINT3, - RTC_FORMAT_UINT4, - - /* 32-bit signed integer */ - RTC_FORMAT_INT = 0x6001, - RTC_FORMAT_INT2, - RTC_FORMAT_INT3, - RTC_FORMAT_INT4, - - /* 64-bit unsigned integer */ - RTC_FORMAT_ULLONG = 0x7001, - RTC_FORMAT_ULLONG2, - RTC_FORMAT_ULLONG3, - RTC_FORMAT_ULLONG4, - - /* 64-bit signed integer */ - RTC_FORMAT_LLONG = 0x8001, - RTC_FORMAT_LLONG2, - RTC_FORMAT_LLONG3, - RTC_FORMAT_LLONG4, - - /* 32-bit float */ - RTC_FORMAT_FLOAT = 0x9001, - RTC_FORMAT_FLOAT2, - RTC_FORMAT_FLOAT3, - RTC_FORMAT_FLOAT4, - RTC_FORMAT_FLOAT5, - RTC_FORMAT_FLOAT6, - RTC_FORMAT_FLOAT7, - RTC_FORMAT_FLOAT8, - RTC_FORMAT_FLOAT9, - RTC_FORMAT_FLOAT10, - RTC_FORMAT_FLOAT11, - RTC_FORMAT_FLOAT12, - RTC_FORMAT_FLOAT13, - RTC_FORMAT_FLOAT14, - RTC_FORMAT_FLOAT15, - RTC_FORMAT_FLOAT16, - - /* 32-bit float matrix (row-major order) */ - RTC_FORMAT_FLOAT2X2_ROW_MAJOR = 0x9122, - RTC_FORMAT_FLOAT2X3_ROW_MAJOR = 0x9123, - RTC_FORMAT_FLOAT2X4_ROW_MAJOR = 0x9124, - RTC_FORMAT_FLOAT3X2_ROW_MAJOR = 0x9132, - RTC_FORMAT_FLOAT3X3_ROW_MAJOR = 0x9133, - RTC_FORMAT_FLOAT3X4_ROW_MAJOR = 0x9134, - RTC_FORMAT_FLOAT4X2_ROW_MAJOR = 0x9142, - RTC_FORMAT_FLOAT4X3_ROW_MAJOR = 0x9143, - RTC_FORMAT_FLOAT4X4_ROW_MAJOR = 0x9144, - - /* 32-bit float matrix (column-major order) */ - RTC_FORMAT_FLOAT2X2_COLUMN_MAJOR = 0x9222, - RTC_FORMAT_FLOAT2X3_COLUMN_MAJOR = 0x9223, - RTC_FORMAT_FLOAT2X4_COLUMN_MAJOR = 0x9224, - RTC_FORMAT_FLOAT3X2_COLUMN_MAJOR = 0x9232, - RTC_FORMAT_FLOAT3X3_COLUMN_MAJOR = 0x9233, - RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR = 0x9234, - RTC_FORMAT_FLOAT4X2_COLUMN_MAJOR = 0x9242, - RTC_FORMAT_FLOAT4X3_COLUMN_MAJOR = 0x9243, - RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR = 0x9244, - - /* special 12-byte format for grids */ - RTC_FORMAT_GRID = 0xA001 -}; - -/* Build quality levels */ -enum RTCBuildQuality -{ - RTC_BUILD_QUALITY_LOW = 0, - RTC_BUILD_QUALITY_MEDIUM = 1, - RTC_BUILD_QUALITY_HIGH = 2, - RTC_BUILD_QUALITY_REFIT = 3, -}; - -/* Axis-aligned bounding box representation */ -struct RTC_ALIGN(16) RTCBounds -{ - float lower_x, lower_y, lower_z, align0; - float upper_x, upper_y, upper_z, align1; -}; - -/* Linear axis-aligned bounding box representation */ -struct RTC_ALIGN(16) RTCLinearBounds -{ - struct RTCBounds bounds0; - struct RTCBounds bounds1; -}; - -/* Intersection context flags */ -enum RTCIntersectContextFlags -{ - RTC_INTERSECT_CONTEXT_FLAG_NONE = 0, - RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT = (0 << 0), // optimize for incoherent rays - RTC_INTERSECT_CONTEXT_FLAG_COHERENT = (1 << 0) // optimize for coherent rays -}; - -/* Arguments for RTCFilterFunctionN */ -struct RTCFilterFunctionNArguments -{ - int* valid; - void* geometryUserPtr; - struct RTCIntersectContext* context; - struct RTCRayN* ray; - struct RTCHitN* hit; - unsigned int N; -}; - -/* Filter callback function */ -typedef void (*RTCFilterFunctionN)(const struct RTCFilterFunctionNArguments* args); - -/* Intersection context passed to intersect/occluded calls */ -struct RTCIntersectContext -{ - enum RTCIntersectContextFlags flags; // intersection flags - RTCFilterFunctionN filter; // filter function to execute - -#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 - unsigned int instStackSize; // Number of instances currently on the stack. -#endif - unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // The current stack of instance ids. - -#if RTC_MIN_WIDTH - float minWidthDistanceFactor; // curve radius is set to this factor times distance to ray origin -#endif -}; - -/* Initializes an intersection context. */ -RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context) -{ - unsigned l = 0; - context->flags = RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; - context->filter = NULL; - -#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 - context->instStackSize = 0; -#endif - for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) - context->instID[l] = RTC_INVALID_GEOMETRY_ID; - -#if RTC_MIN_WIDTH - context->minWidthDistanceFactor = 0.0f; -#endif -} - -/* Point query structure for closest point query */ -struct RTC_ALIGN(16) RTCPointQuery -{ - float x; // x coordinate of the query point - float y; // y coordinate of the query point - float z; // z coordinate of the query point - float time; // time of the point query - float radius; // radius of the point query -}; - -/* Structure of a packet of 4 query points */ -struct RTC_ALIGN(16) RTCPointQuery4 -{ - float x[4]; // x coordinate of the query point - float y[4]; // y coordinate of the query point - float z[4]; // z coordinate of the query point - float time[4]; // time of the point query - float radius[4]; // radius of the point query -}; - -/* Structure of a packet of 8 query points */ -struct RTC_ALIGN(32) RTCPointQuery8 -{ - float x[8]; // x coordinate of the query point - float y[8]; // y coordinate of the query point - float z[8]; // z coordinate of the query point - float time[8]; // time of the point query - float radius[8]; // radius ofr the point query -}; - -/* Structure of a packet of 16 query points */ -struct RTC_ALIGN(64) RTCPointQuery16 -{ - float x[16]; // x coordinate of the query point - float y[16]; // y coordinate of the query point - float z[16]; // z coordinate of the query point - float time[16]; // time of the point quey - float radius[16]; // radius of the point query -}; - -struct RTCPointQueryN; - -struct RTC_ALIGN(16) RTCPointQueryContext -{ - // accumulated 4x4 column major matrices from world space to instance space. - // undefined if size == 0. - float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; - - // accumulated 4x4 column major matrices from instance space to world space. - // undefined if size == 0. - float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; - - // instance ids. - unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; - - // number of instances currently on the stack. - unsigned int instStackSize; -}; - -/* Initializes an intersection context. */ -RTC_FORCEINLINE void rtcInitPointQueryContext(struct RTCPointQueryContext* context) -{ - context->instStackSize = 0; - context->instID[0] = RTC_INVALID_GEOMETRY_ID; -} - -struct RTC_ALIGN(16) RTCPointQueryFunctionArguments -{ - // The (world space) query object that was passed as an argument of rtcPointQuery. The - // radius of the query can be decreased inside the callback to shrink the - // search domain. Increasing the radius or modifying the time or position of - // the query results in undefined behaviour. - struct RTCPointQuery* query; - - // Used for user input/output data. Will not be read or modified internally. - void* userPtr; - - // primitive and geometry ID of primitive - unsigned int primID; - unsigned int geomID; - - // the context with transformation and instance ID stack - struct RTCPointQueryContext* context; - - // If the current instance transform M (= context->world2inst[context->instStackSize]) - // is a similarity matrix, i.e there is a constant factor similarityScale such that, - // for all x,y: dist(Mx, My) = similarityScale * dist(x, y), - // The similarity scale is 0, if the current instance transform is not a - // similarity transform and vice versa. The similarity scale allows to compute - // distance information in instance space and scale the distances into world - // space by dividing with the similarity scale, for example, to update the - // query radius. If the current instance transform is not a similarity - // transform (similarityScale = 0), the distance computation has to be - // performed in world space to ensure correctness. if there is no instance - // transform (context->instStackSize == 0), the similarity scale is 1. - float similarityScale; -}; - -typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args); - -RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_config.h b/thirdparty/embree-aarch64/include/embree3/rtcore_config.h deleted file mode 100644 index 337d4e9487..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore_config.h +++ /dev/null @@ -1,57 +0,0 @@ - -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#define RTC_VERSION_MAJOR 3 -#define RTC_VERSION_MINOR 12 -#define RTC_VERSION_PATCH 1 -#define RTC_VERSION 31201 -#define RTC_VERSION_STRING "3.12.1" - -#define RTC_MAX_INSTANCE_LEVEL_COUNT 1 - -#define EMBREE_MIN_WIDTH 0 -#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH - -#define EMBREE_STATIC_LIB -/* #undef EMBREE_API_NAMESPACE */ - -#if defined(EMBREE_API_NAMESPACE) -# define RTC_NAMESPACE -# define RTC_NAMESPACE_BEGIN namespace { -# define RTC_NAMESPACE_END } -# define RTC_NAMESPACE_USE using namespace ; -# define RTC_API_EXTERN_C -# undef EMBREE_API_NAMESPACE -#else -# define RTC_NAMESPACE_BEGIN -# define RTC_NAMESPACE_END -# define RTC_NAMESPACE_USE -# if defined(__cplusplus) -# define RTC_API_EXTERN_C extern "C" -# else -# define RTC_API_EXTERN_C -# endif -#endif - -#if defined(ISPC) -# define RTC_API_IMPORT extern "C" unmasked -# define RTC_API_EXPORT extern "C" unmasked -#elif defined(EMBREE_STATIC_LIB) -# define RTC_API_IMPORT RTC_API_EXTERN_C -# define RTC_API_EXPORT RTC_API_EXTERN_C -#elif defined(_WIN32) -# define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport) -# define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport) -#else -# define RTC_API_IMPORT RTC_API_EXTERN_C -# define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default"))) -#endif - -#if defined(RTC_EXPORT_API) -# define RTC_API RTC_API_EXPORT -#else -# define RTC_API RTC_API_IMPORT -#endif diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_device.h b/thirdparty/embree-aarch64/include/embree3/rtcore_device.h deleted file mode 100644 index 594e2b755d..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore_device.h +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "rtcore_common.h" - -RTC_NAMESPACE_BEGIN - -/* Opaque device type */ -typedef struct RTCDeviceTy* RTCDevice; - -/* Creates a new Embree device. */ -RTC_API RTCDevice rtcNewDevice(const char* config); - -/* Retains the Embree device (increments the reference count). */ -RTC_API void rtcRetainDevice(RTCDevice device); - -/* Releases an Embree device (decrements the reference count). */ -RTC_API void rtcReleaseDevice(RTCDevice device); - -/* Device properties */ -enum RTCDeviceProperty -{ - RTC_DEVICE_PROPERTY_VERSION = 0, - RTC_DEVICE_PROPERTY_VERSION_MAJOR = 1, - RTC_DEVICE_PROPERTY_VERSION_MINOR = 2, - RTC_DEVICE_PROPERTY_VERSION_PATCH = 3, - - RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED = 32, - RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED = 33, - RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED = 34, - RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED = 35, - - RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED = 63, - RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED = 64, - RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED = 65, - RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED = 66, - RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED = 67, - RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED = 68, - - RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED = 96, - RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED = 97, - RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED = 98, - RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED = 99, - RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED = 100, - RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED = 101, - - RTC_DEVICE_PROPERTY_TASKING_SYSTEM = 128, - RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED = 129, - RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED = 130 -}; - -/* Gets a device property. */ -RTC_API ssize_t rtcGetDeviceProperty(RTCDevice device, enum RTCDeviceProperty prop); - -/* Sets a device property. */ -RTC_API void rtcSetDeviceProperty(RTCDevice device, const enum RTCDeviceProperty prop, ssize_t value); - -/* Error codes */ -enum RTCError -{ - RTC_ERROR_NONE = 0, - RTC_ERROR_UNKNOWN = 1, - RTC_ERROR_INVALID_ARGUMENT = 2, - RTC_ERROR_INVALID_OPERATION = 3, - RTC_ERROR_OUT_OF_MEMORY = 4, - RTC_ERROR_UNSUPPORTED_CPU = 5, - RTC_ERROR_CANCELLED = 6 -}; - -/* Returns the error code. */ -RTC_API enum RTCError rtcGetDeviceError(RTCDevice device); - -/* Error callback function */ -typedef void (*RTCErrorFunction)(void* userPtr, enum RTCError code, const char* str); - -/* Sets the error callback function. */ -RTC_API void rtcSetDeviceErrorFunction(RTCDevice device, RTCErrorFunction error, void* userPtr); - -/* Memory monitor callback function */ -typedef bool (*RTCMemoryMonitorFunction)(void* ptr, ssize_t bytes, bool post); - -/* Sets the memory monitor callback function. */ -RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice device, RTCMemoryMonitorFunction memoryMonitor, void* userPtr); - -RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h b/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h deleted file mode 100644 index c70f1b0e5c..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h +++ /dev/null @@ -1,383 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "rtcore_buffer.h" -#include "rtcore_quaternion.h" - -RTC_NAMESPACE_BEGIN - -/* Opaque scene type */ -typedef struct RTCSceneTy* RTCScene; - -/* Opaque geometry type */ -typedef struct RTCGeometryTy* RTCGeometry; - -/* Types of geometries */ -enum RTCGeometryType -{ - RTC_GEOMETRY_TYPE_TRIANGLE = 0, // triangle mesh - RTC_GEOMETRY_TYPE_QUAD = 1, // quad (triangle pair) mesh - RTC_GEOMETRY_TYPE_GRID = 2, // grid mesh - - RTC_GEOMETRY_TYPE_SUBDIVISION = 8, // Catmull-Clark subdivision surface - - RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE = 15, // Cone linear curves - discontinuous at edge boundaries - RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE = 16, // Round (rounded cone like) linear curves - RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE = 17, // flat (ribbon-like) linear curves - - RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE = 24, // round (tube-like) Bezier curves - RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE = 25, // flat (ribbon-like) Bezier curves - RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE = 26, // flat normal-oriented Bezier curves - - RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE = 32, // round (tube-like) B-spline curves - RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE = 33, // flat (ribbon-like) B-spline curves - RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE = 34, // flat normal-oriented B-spline curves - - RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE = 40, // round (tube-like) Hermite curves - RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE = 41, // flat (ribbon-like) Hermite curves - RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE = 42, // flat normal-oriented Hermite curves - - RTC_GEOMETRY_TYPE_SPHERE_POINT = 50, - RTC_GEOMETRY_TYPE_DISC_POINT = 51, - RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT = 52, - - RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE = 58, // round (tube-like) Catmull-Rom curves - RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE = 59, // flat (ribbon-like) Catmull-Rom curves - RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE = 60, // flat normal-oriented Catmull-Rom curves - - RTC_GEOMETRY_TYPE_USER = 120, // user-defined geometry - RTC_GEOMETRY_TYPE_INSTANCE = 121 // scene instance -}; - -/* Interpolation modes for subdivision surfaces */ -enum RTCSubdivisionMode -{ - RTC_SUBDIVISION_MODE_NO_BOUNDARY = 0, - RTC_SUBDIVISION_MODE_SMOOTH_BOUNDARY = 1, - RTC_SUBDIVISION_MODE_PIN_CORNERS = 2, - RTC_SUBDIVISION_MODE_PIN_BOUNDARY = 3, - RTC_SUBDIVISION_MODE_PIN_ALL = 4, -}; - -/* Curve segment flags */ -enum RTCCurveFlags -{ - RTC_CURVE_FLAG_NEIGHBOR_LEFT = (1 << 0), // left segments exists - RTC_CURVE_FLAG_NEIGHBOR_RIGHT = (1 << 1) // right segment exists -}; - -/* Arguments for RTCBoundsFunction */ -struct RTCBoundsFunctionArguments -{ - void* geometryUserPtr; - unsigned int primID; - unsigned int timeStep; - struct RTCBounds* bounds_o; -}; - -/* Bounding callback function */ -typedef void (*RTCBoundsFunction)(const struct RTCBoundsFunctionArguments* args); - -/* Arguments for RTCIntersectFunctionN */ -struct RTCIntersectFunctionNArguments -{ - int* valid; - void* geometryUserPtr; - unsigned int primID; - struct RTCIntersectContext* context; - struct RTCRayHitN* rayhit; - unsigned int N; - unsigned int geomID; -}; - -/* Intersection callback function */ -typedef void (*RTCIntersectFunctionN)(const struct RTCIntersectFunctionNArguments* args); - -/* Arguments for RTCOccludedFunctionN */ -struct RTCOccludedFunctionNArguments -{ - int* valid; - void* geometryUserPtr; - unsigned int primID; - struct RTCIntersectContext* context; - struct RTCRayN* ray; - unsigned int N; - unsigned int geomID; -}; - -/* Occlusion callback function */ -typedef void (*RTCOccludedFunctionN)(const struct RTCOccludedFunctionNArguments* args); - -/* Arguments for RTCDisplacementFunctionN */ -struct RTCDisplacementFunctionNArguments -{ - void* geometryUserPtr; - RTCGeometry geometry; - unsigned int primID; - unsigned int timeStep; - const float* u; - const float* v; - const float* Ng_x; - const float* Ng_y; - const float* Ng_z; - float* P_x; - float* P_y; - float* P_z; - unsigned int N; -}; - -/* Displacement mapping callback function */ -typedef void (*RTCDisplacementFunctionN)(const struct RTCDisplacementFunctionNArguments* args); - -/* Creates a new geometry of specified type. */ -RTC_API RTCGeometry rtcNewGeometry(RTCDevice device, enum RTCGeometryType type); - -/* Retains the geometry (increments the reference count). */ -RTC_API void rtcRetainGeometry(RTCGeometry geometry); - -/* Releases the geometry (decrements the reference count) */ -RTC_API void rtcReleaseGeometry(RTCGeometry geometry); - -/* Commits the geometry. */ -RTC_API void rtcCommitGeometry(RTCGeometry geometry); - - -/* Enables the geometry. */ -RTC_API void rtcEnableGeometry(RTCGeometry geometry); - -/* Disables the geometry. */ -RTC_API void rtcDisableGeometry(RTCGeometry geometry); - - -/* Sets the number of motion blur time steps of the geometry. */ -RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry geometry, unsigned int timeStepCount); - -/* Sets the motion blur time range of the geometry. */ -RTC_API void rtcSetGeometryTimeRange(RTCGeometry geometry, float startTime, float endTime); - -/* Sets the number of vertex attributes of the geometry. */ -RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry geometry, unsigned int vertexAttributeCount); - -/* Sets the ray mask of the geometry. */ -RTC_API void rtcSetGeometryMask(RTCGeometry geometry, unsigned int mask); - -/* Sets the build quality of the geometry. */ -RTC_API void rtcSetGeometryBuildQuality(RTCGeometry geometry, enum RTCBuildQuality quality); - -/* Sets the maximal curve or point radius scale allowed by min-width feature. */ -RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry geometry, float maxRadiusScale); - - -/* Sets a geometry buffer. */ -RTC_API void rtcSetGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, RTCBuffer buffer, size_t byteOffset, size_t byteStride, size_t itemCount); - -/* Sets a shared geometry buffer. */ -RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount); - -/* Creates and sets a new geometry buffer. */ -RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, size_t byteStride, size_t itemCount); - -/* Returns the pointer to the data of a buffer. */ -RTC_API void* rtcGetGeometryBufferData(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot); - -/* Updates a geometry buffer. */ -RTC_API void rtcUpdateGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot); - - -/* Sets the intersection filter callback function of the geometry. */ -RTC_API void rtcSetGeometryIntersectFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter); - -/* Sets the occlusion filter callback function of the geometry. */ -RTC_API void rtcSetGeometryOccludedFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter); - -/* Sets the user-defined data pointer of the geometry. */ -RTC_API void rtcSetGeometryUserData(RTCGeometry geometry, void* ptr); - -/* Gets the user-defined data pointer of the geometry. */ -RTC_API void* rtcGetGeometryUserData(RTCGeometry geometry); - -/* Set the point query callback function of a geometry. */ -RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry geometry, RTCPointQueryFunction pointQuery); - -/* Sets the number of primitives of a user geometry. */ -RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry geometry, unsigned int userPrimitiveCount); - -/* Sets the bounding callback function to calculate bounding boxes for user primitives. */ -RTC_API void rtcSetGeometryBoundsFunction(RTCGeometry geometry, RTCBoundsFunction bounds, void* userPtr); - -/* Set the intersect callback function of a user geometry. */ -RTC_API void rtcSetGeometryIntersectFunction(RTCGeometry geometry, RTCIntersectFunctionN intersect); - -/* Set the occlusion callback function of a user geometry. */ -RTC_API void rtcSetGeometryOccludedFunction(RTCGeometry geometry, RTCOccludedFunctionN occluded); - -/* Invokes the intersection filter from the intersection callback function. */ -RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs); - -/* Invokes the occlusion filter from the occlusion callback function. */ -RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs); - - -/* Sets the instanced scene of an instance geometry. */ -RTC_API void rtcSetGeometryInstancedScene(RTCGeometry geometry, RTCScene scene); - -/* Sets the transformation of an instance for the specified time step. */ -RTC_API void rtcSetGeometryTransform(RTCGeometry geometry, unsigned int timeStep, enum RTCFormat format, const void* xfm); - -/* Sets the transformation quaternion of an instance for the specified time step. */ -RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry geometry, unsigned int timeStep, const struct RTCQuaternionDecomposition* qd); - -/* Returns the interpolated transformation of an instance for the specified time. */ -RTC_API void rtcGetGeometryTransform(RTCGeometry geometry, float time, enum RTCFormat format, void* xfm); - - -/* Sets the uniform tessellation rate of the geometry. */ -RTC_API void rtcSetGeometryTessellationRate(RTCGeometry geometry, float tessellationRate); - -/* Sets the number of topologies of a subdivision surface. */ -RTC_API void rtcSetGeometryTopologyCount(RTCGeometry geometry, unsigned int topologyCount); - -/* Sets the subdivision interpolation mode. */ -RTC_API void rtcSetGeometrySubdivisionMode(RTCGeometry geometry, unsigned int topologyID, enum RTCSubdivisionMode mode); - -/* Binds a vertex attribute to a topology of the geometry. */ -RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry geometry, unsigned int vertexAttributeID, unsigned int topologyID); - -/* Sets the displacement callback function of a subdivision surface. */ -RTC_API void rtcSetGeometryDisplacementFunction(RTCGeometry geometry, RTCDisplacementFunctionN displacement); - -/* Returns the first half edge of a face. */ -RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry geometry, unsigned int faceID); - -/* Returns the face the half edge belongs to. */ -RTC_API unsigned int rtcGetGeometryFace(RTCGeometry geometry, unsigned int edgeID); - -/* Returns next half edge. */ -RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry geometry, unsigned int edgeID); - -/* Returns previous half edge. */ -RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry geometry, unsigned int edgeID); - -/* Returns opposite half edge. */ -RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry geometry, unsigned int topologyID, unsigned int edgeID); - - -/* Arguments for rtcInterpolate */ -struct RTCInterpolateArguments -{ - RTCGeometry geometry; - unsigned int primID; - float u; - float v; - enum RTCBufferType bufferType; - unsigned int bufferSlot; - float* P; - float* dPdu; - float* dPdv; - float* ddPdudu; - float* ddPdvdv; - float* ddPdudv; - unsigned int valueCount; -}; - -/* Interpolates vertex data to some u/v location and optionally calculates all derivatives. */ -RTC_API void rtcInterpolate(const struct RTCInterpolateArguments* args); - -/* Interpolates vertex data to some u/v location. */ -RTC_FORCEINLINE void rtcInterpolate0(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, float* P, unsigned int valueCount) -{ - struct RTCInterpolateArguments args; - args.geometry = geometry; - args.primID = primID; - args.u = u; - args.v = v; - args.bufferType = bufferType; - args.bufferSlot = bufferSlot; - args.P = P; - args.dPdu = NULL; - args.dPdv = NULL; - args.ddPdudu = NULL; - args.ddPdvdv = NULL; - args.ddPdudv = NULL; - args.valueCount = valueCount; - rtcInterpolate(&args); -} - -/* Interpolates vertex data to some u/v location and calculates first order derivatives. */ -RTC_FORCEINLINE void rtcInterpolate1(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, - float* P, float* dPdu, float* dPdv, unsigned int valueCount) -{ - struct RTCInterpolateArguments args; - args.geometry = geometry; - args.primID = primID; - args.u = u; - args.v = v; - args.bufferType = bufferType; - args.bufferSlot = bufferSlot; - args.P = P; - args.dPdu = dPdu; - args.dPdv = dPdv; - args.ddPdudu = NULL; - args.ddPdvdv = NULL; - args.ddPdudv = NULL; - args.valueCount = valueCount; - rtcInterpolate(&args); -} - -/* Interpolates vertex data to some u/v location and calculates first and second order derivatives. */ -RTC_FORCEINLINE void rtcInterpolate2(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, - float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, unsigned int valueCount) -{ - struct RTCInterpolateArguments args; - args.geometry = geometry; - args.primID = primID; - args.u = u; - args.v = v; - args.bufferType = bufferType; - args.bufferSlot = bufferSlot; - args.P = P; - args.dPdu = dPdu; - args.dPdv = dPdv; - args.ddPdudu = ddPdudu; - args.ddPdvdv = ddPdvdv; - args.ddPdudv = ddPdudv; - args.valueCount = valueCount; - rtcInterpolate(&args); -} - -/* Arguments for rtcInterpolateN */ -struct RTCInterpolateNArguments -{ - RTCGeometry geometry; - const void* valid; - const unsigned int* primIDs; - const float* u; - const float* v; - unsigned int N; - enum RTCBufferType bufferType; - unsigned int bufferSlot; - float* P; - float* dPdu; - float* dPdv; - float* ddPdudu; - float* ddPdvdv; - float* ddPdudv; - unsigned int valueCount; -}; - -/* Interpolates vertex data to an array of u/v locations. */ -RTC_API void rtcInterpolateN(const struct RTCInterpolateNArguments* args); - -/* RTCGrid primitive for grid mesh */ -struct RTCGrid -{ - unsigned int startVertexID; - unsigned int stride; - unsigned short width,height; // max is a 32k x 32k grid -}; - -RTC_NAMESPACE_END - - diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h b/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h deleted file mode 100644 index 449cdedfdc..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "rtcore_common.h" - -RTC_NAMESPACE_BEGIN - -/* - * Structure for transformation respresentation as a matrix decomposition using - * a quaternion - */ -struct RTC_ALIGN(16) RTCQuaternionDecomposition -{ - float scale_x; - float scale_y; - float scale_z; - float skew_xy; - float skew_xz; - float skew_yz; - float shift_x; - float shift_y; - float shift_z; - float quaternion_r; - float quaternion_i; - float quaternion_j; - float quaternion_k; - float translation_x; - float translation_y; - float translation_z; -}; - -RTC_FORCEINLINE void rtcInitQuaternionDecomposition(struct RTCQuaternionDecomposition* qdecomp) -{ - qdecomp->scale_x = 1.f; - qdecomp->scale_y = 1.f; - qdecomp->scale_z = 1.f; - qdecomp->skew_xy = 0.f; - qdecomp->skew_xz = 0.f; - qdecomp->skew_yz = 0.f; - qdecomp->shift_x = 0.f; - qdecomp->shift_y = 0.f; - qdecomp->shift_z = 0.f; - qdecomp->quaternion_r = 1.f; - qdecomp->quaternion_i = 0.f; - qdecomp->quaternion_j = 0.f; - qdecomp->quaternion_k = 0.f; - qdecomp->translation_x = 0.f; - qdecomp->translation_y = 0.f; - qdecomp->translation_z = 0.f; -} - -RTC_FORCEINLINE void rtcQuaternionDecompositionSetQuaternion( - struct RTCQuaternionDecomposition* qdecomp, - float r, float i, float j, float k) -{ - qdecomp->quaternion_r = r; - qdecomp->quaternion_i = i; - qdecomp->quaternion_j = j; - qdecomp->quaternion_k = k; -} - -RTC_FORCEINLINE void rtcQuaternionDecompositionSetScale( - struct RTCQuaternionDecomposition* qdecomp, - float scale_x, float scale_y, float scale_z) -{ - qdecomp->scale_x = scale_x; - qdecomp->scale_y = scale_y; - qdecomp->scale_z = scale_z; -} - -RTC_FORCEINLINE void rtcQuaternionDecompositionSetSkew( - struct RTCQuaternionDecomposition* qdecomp, - float skew_xy, float skew_xz, float skew_yz) -{ - qdecomp->skew_xy = skew_xy; - qdecomp->skew_xz = skew_xz; - qdecomp->skew_yz = skew_yz; -} - -RTC_FORCEINLINE void rtcQuaternionDecompositionSetShift( - struct RTCQuaternionDecomposition* qdecomp, - float shift_x, float shift_y, float shift_z) -{ - qdecomp->shift_x = shift_x; - qdecomp->shift_y = shift_y; - qdecomp->shift_z = shift_z; -} - -RTC_FORCEINLINE void rtcQuaternionDecompositionSetTranslation( - struct RTCQuaternionDecomposition* qdecomp, - float translation_x, float translation_y, float translation_z) -{ - qdecomp->translation_x = translation_x; - qdecomp->translation_y = translation_y; - qdecomp->translation_z = translation_z; -} - -RTC_NAMESPACE_END - diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h b/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h deleted file mode 100644 index 1ae3309ef1..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "rtcore_common.h" - -RTC_NAMESPACE_BEGIN - -/* Ray structure for a single ray */ -struct RTC_ALIGN(16) RTCRay -{ - float org_x; // x coordinate of ray origin - float org_y; // y coordinate of ray origin - float org_z; // z coordinate of ray origin - float tnear; // start of ray segment - - float dir_x; // x coordinate of ray direction - float dir_y; // y coordinate of ray direction - float dir_z; // z coordinate of ray direction - float time; // time of this ray for motion blur - - float tfar; // end of ray segment (set to hit distance) - unsigned int mask; // ray mask - unsigned int id; // ray ID - unsigned int flags; // ray flags -}; - -/* Hit structure for a single ray */ -struct RTC_ALIGN(16) RTCHit -{ - float Ng_x; // x coordinate of geometry normal - float Ng_y; // y coordinate of geometry normal - float Ng_z; // z coordinate of geometry normal - - float u; // barycentric u coordinate of hit - float v; // barycentric v coordinate of hit - - unsigned int primID; // primitive ID - unsigned int geomID; // geometry ID - unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID -}; - -/* Combined ray/hit structure for a single ray */ -struct RTCRayHit -{ - struct RTCRay ray; - struct RTCHit hit; -}; - -/* Ray structure for a packet of 4 rays */ -struct RTC_ALIGN(16) RTCRay4 -{ - float org_x[4]; - float org_y[4]; - float org_z[4]; - float tnear[4]; - - float dir_x[4]; - float dir_y[4]; - float dir_z[4]; - float time[4]; - - float tfar[4]; - unsigned int mask[4]; - unsigned int id[4]; - unsigned int flags[4]; -}; - -/* Hit structure for a packet of 4 rays */ -struct RTC_ALIGN(16) RTCHit4 -{ - float Ng_x[4]; - float Ng_y[4]; - float Ng_z[4]; - - float u[4]; - float v[4]; - - unsigned int primID[4]; - unsigned int geomID[4]; - unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][4]; -}; - -/* Combined ray/hit structure for a packet of 4 rays */ -struct RTCRayHit4 -{ - struct RTCRay4 ray; - struct RTCHit4 hit; -}; - -/* Ray structure for a packet of 8 rays */ -struct RTC_ALIGN(32) RTCRay8 -{ - float org_x[8]; - float org_y[8]; - float org_z[8]; - float tnear[8]; - - float dir_x[8]; - float dir_y[8]; - float dir_z[8]; - float time[8]; - - float tfar[8]; - unsigned int mask[8]; - unsigned int id[8]; - unsigned int flags[8]; -}; - -/* Hit structure for a packet of 8 rays */ -struct RTC_ALIGN(32) RTCHit8 -{ - float Ng_x[8]; - float Ng_y[8]; - float Ng_z[8]; - - float u[8]; - float v[8]; - - unsigned int primID[8]; - unsigned int geomID[8]; - unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][8]; -}; - -/* Combined ray/hit structure for a packet of 8 rays */ -struct RTCRayHit8 -{ - struct RTCRay8 ray; - struct RTCHit8 hit; -}; - -/* Ray structure for a packet of 16 rays */ -struct RTC_ALIGN(64) RTCRay16 -{ - float org_x[16]; - float org_y[16]; - float org_z[16]; - float tnear[16]; - - float dir_x[16]; - float dir_y[16]; - float dir_z[16]; - float time[16]; - - float tfar[16]; - unsigned int mask[16]; - unsigned int id[16]; - unsigned int flags[16]; -}; - -/* Hit structure for a packet of 16 rays */ -struct RTC_ALIGN(64) RTCHit16 -{ - float Ng_x[16]; - float Ng_y[16]; - float Ng_z[16]; - - float u[16]; - float v[16]; - - unsigned int primID[16]; - unsigned int geomID[16]; - unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; -}; - -/* Combined ray/hit structure for a packet of 16 rays */ -struct RTCRayHit16 -{ - struct RTCRay16 ray; - struct RTCHit16 hit; -}; - -/* Ray structure for a packet/stream of N rays in pointer SOA layout */ -struct RTCRayNp -{ - float* org_x; - float* org_y; - float* org_z; - float* tnear; - - float* dir_x; - float* dir_y; - float* dir_z; - float* time; - - float* tfar; - unsigned int* mask; - unsigned int* id; - unsigned int* flags; -}; - -/* Hit structure for a packet/stream of N rays in pointer SOA layout */ -struct RTCHitNp -{ - float* Ng_x; - float* Ng_y; - float* Ng_z; - - float* u; - float* v; - - unsigned int* primID; - unsigned int* geomID; - unsigned int* instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; -}; - -/* Combined ray/hit structure for a packet/stream of N rays in pointer SOA layout */ -struct RTCRayHitNp -{ - struct RTCRayNp ray; - struct RTCHitNp hit; -}; - -struct RTCRayN; -struct RTCHitN; -struct RTCRayHitN; - -#if defined(__cplusplus) - -/* Helper functions to access ray packets of runtime size N */ -RTC_FORCEINLINE float& RTCRayN_org_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[0*N+i]; } -RTC_FORCEINLINE float& RTCRayN_org_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[1*N+i]; } -RTC_FORCEINLINE float& RTCRayN_org_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[2*N+i]; } -RTC_FORCEINLINE float& RTCRayN_tnear(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[3*N+i]; } - -RTC_FORCEINLINE float& RTCRayN_dir_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[4*N+i]; } -RTC_FORCEINLINE float& RTCRayN_dir_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[5*N+i]; } -RTC_FORCEINLINE float& RTCRayN_dir_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[6*N+i]; } -RTC_FORCEINLINE float& RTCRayN_time (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[7*N+i]; } - -RTC_FORCEINLINE float& RTCRayN_tfar (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[8*N+i]; } -RTC_FORCEINLINE unsigned int& RTCRayN_mask (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[9*N+i]; } -RTC_FORCEINLINE unsigned int& RTCRayN_id (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[10*N+i]; } -RTC_FORCEINLINE unsigned int& RTCRayN_flags(RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[11*N+i]; } - -/* Helper functions to access hit packets of runtime size N */ -RTC_FORCEINLINE float& RTCHitN_Ng_x(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[0*N+i]; } -RTC_FORCEINLINE float& RTCHitN_Ng_y(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[1*N+i]; } -RTC_FORCEINLINE float& RTCHitN_Ng_z(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[2*N+i]; } - -RTC_FORCEINLINE float& RTCHitN_u(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[3*N+i]; } -RTC_FORCEINLINE float& RTCHitN_v(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[4*N+i]; } - -RTC_FORCEINLINE unsigned int& RTCHitN_primID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[5*N+i]; } -RTC_FORCEINLINE unsigned int& RTCHitN_geomID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[6*N+i]; } -RTC_FORCEINLINE unsigned int& RTCHitN_instID(RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return ((unsigned*)hit)[7*N+i+N*l]; } - -/* Helper functions to extract RTCRayN and RTCHitN from RTCRayHitN */ -RTC_FORCEINLINE RTCRayN* RTCRayHitN_RayN(RTCRayHitN* rayhit, unsigned int N) { return (RTCRayN*)&((float*)rayhit)[0*N]; } -RTC_FORCEINLINE RTCHitN* RTCRayHitN_HitN(RTCRayHitN* rayhit, unsigned int N) { return (RTCHitN*)&((float*)rayhit)[12*N]; } - -/* Helper structure for a ray packet of compile-time size N */ -template -struct RTCRayNt -{ - float org_x[N]; - float org_y[N]; - float org_z[N]; - float tnear[N]; - - float dir_x[N]; - float dir_y[N]; - float dir_z[N]; - float time[N]; - - float tfar[N]; - unsigned int mask[N]; - unsigned int id[N]; - unsigned int flags[N]; -}; - -/* Helper structure for a hit packet of compile-time size N */ -template -struct RTCHitNt -{ - float Ng_x[N]; - float Ng_y[N]; - float Ng_z[N]; - - float u[N]; - float v[N]; - - unsigned int primID[N]; - unsigned int geomID[N]; - unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][N]; -}; - -/* Helper structure for a combined ray/hit packet of compile-time size N */ -template -struct RTCRayHitNt -{ - RTCRayNt ray; - RTCHitNt hit; -}; - -RTC_FORCEINLINE RTCRay rtcGetRayFromRayN(RTCRayN* rayN, unsigned int N, unsigned int i) -{ - RTCRay ray; - ray.org_x = RTCRayN_org_x(rayN,N,i); - ray.org_y = RTCRayN_org_y(rayN,N,i); - ray.org_z = RTCRayN_org_z(rayN,N,i); - ray.tnear = RTCRayN_tnear(rayN,N,i); - ray.dir_x = RTCRayN_dir_x(rayN,N,i); - ray.dir_y = RTCRayN_dir_y(rayN,N,i); - ray.dir_z = RTCRayN_dir_z(rayN,N,i); - ray.time = RTCRayN_time(rayN,N,i); - ray.tfar = RTCRayN_tfar(rayN,N,i); - ray.mask = RTCRayN_mask(rayN,N,i); - ray.id = RTCRayN_id(rayN,N,i); - ray.flags = RTCRayN_flags(rayN,N,i); - return ray; -} - -RTC_FORCEINLINE RTCHit rtcGetHitFromHitN(RTCHitN* hitN, unsigned int N, unsigned int i) -{ - RTCHit hit; - hit.Ng_x = RTCHitN_Ng_x(hitN,N,i); - hit.Ng_y = RTCHitN_Ng_y(hitN,N,i); - hit.Ng_z = RTCHitN_Ng_z(hitN,N,i); - hit.u = RTCHitN_u(hitN,N,i); - hit.v = RTCHitN_v(hitN,N,i); - hit.primID = RTCHitN_primID(hitN,N,i); - hit.geomID = RTCHitN_geomID(hitN,N,i); - for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) - hit.instID[l] = RTCHitN_instID(hitN,N,i,l); - return hit; -} - -RTC_FORCEINLINE void rtcCopyHitToHitN(RTCHitN* hitN, const RTCHit* hit, unsigned int N, unsigned int i) -{ - RTCHitN_Ng_x(hitN,N,i) = hit->Ng_x; - RTCHitN_Ng_y(hitN,N,i) = hit->Ng_y; - RTCHitN_Ng_z(hitN,N,i) = hit->Ng_z; - RTCHitN_u(hitN,N,i) = hit->u; - RTCHitN_v(hitN,N,i) = hit->v; - RTCHitN_primID(hitN,N,i) = hit->primID; - RTCHitN_geomID(hitN,N,i) = hit->geomID; - for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) - RTCHitN_instID(hitN,N,i,l) = hit->instID[l]; -} - -RTC_FORCEINLINE RTCRayHit rtcGetRayHitFromRayHitN(RTCRayHitN* rayhitN, unsigned int N, unsigned int i) -{ - RTCRayHit rh; - - RTCRayN* ray = RTCRayHitN_RayN(rayhitN,N); - rh.ray.org_x = RTCRayN_org_x(ray,N,i); - rh.ray.org_y = RTCRayN_org_y(ray,N,i); - rh.ray.org_z = RTCRayN_org_z(ray,N,i); - rh.ray.tnear = RTCRayN_tnear(ray,N,i); - rh.ray.dir_x = RTCRayN_dir_x(ray,N,i); - rh.ray.dir_y = RTCRayN_dir_y(ray,N,i); - rh.ray.dir_z = RTCRayN_dir_z(ray,N,i); - rh.ray.time = RTCRayN_time(ray,N,i); - rh.ray.tfar = RTCRayN_tfar(ray,N,i); - rh.ray.mask = RTCRayN_mask(ray,N,i); - rh.ray.id = RTCRayN_id(ray,N,i); - rh.ray.flags = RTCRayN_flags(ray,N,i); - - RTCHitN* hit = RTCRayHitN_HitN(rayhitN,N); - rh.hit.Ng_x = RTCHitN_Ng_x(hit,N,i); - rh.hit.Ng_y = RTCHitN_Ng_y(hit,N,i); - rh.hit.Ng_z = RTCHitN_Ng_z(hit,N,i); - rh.hit.u = RTCHitN_u(hit,N,i); - rh.hit.v = RTCHitN_v(hit,N,i); - rh.hit.primID = RTCHitN_primID(hit,N,i); - rh.hit.geomID = RTCHitN_geomID(hit,N,i); - for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) - rh.hit.instID[l] = RTCHitN_instID(hit,N,i,l); - - return rh; -} - -#endif - -RTC_NAMESPACE_END - diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h b/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h deleted file mode 100644 index 0cd6401593..0000000000 --- a/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "rtcore_device.h" - -RTC_NAMESPACE_BEGIN - -/* Forward declarations for ray structures */ -struct RTCRayHit; -struct RTCRayHit4; -struct RTCRayHit8; -struct RTCRayHit16; -struct RTCRayHitNp; - -/* Scene flags */ -enum RTCSceneFlags -{ - RTC_SCENE_FLAG_NONE = 0, - RTC_SCENE_FLAG_DYNAMIC = (1 << 0), - RTC_SCENE_FLAG_COMPACT = (1 << 1), - RTC_SCENE_FLAG_ROBUST = (1 << 2), - RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION = (1 << 3) -}; - -/* Creates a new scene. */ -RTC_API RTCScene rtcNewScene(RTCDevice device); - -/* Returns the device the scene got created in. The reference count of - * the device is incremented by this function. */ -RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene); - -/* Retains the scene (increments the reference count). */ -RTC_API void rtcRetainScene(RTCScene scene); - -/* Releases the scene (decrements the reference count). */ -RTC_API void rtcReleaseScene(RTCScene scene); - - -/* Attaches the geometry to a scene. */ -RTC_API unsigned int rtcAttachGeometry(RTCScene scene, RTCGeometry geometry); - -/* Attaches the geometry to a scene using the specified geometry ID. */ -RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigned int geomID); - -/* Detaches the geometry from the scene. */ -RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID); - -/* Gets a geometry handle from the scene. */ -RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID); - - -/* Commits the scene. */ -RTC_API void rtcCommitScene(RTCScene scene); - -/* Commits the scene from multiple threads. */ -RTC_API void rtcJoinCommitScene(RTCScene scene); - - -/* Progress monitor callback function */ -typedef bool (*RTCProgressMonitorFunction)(void* ptr, double n); - -/* Sets the progress monitor callback function of the scene. */ -RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunction progress, void* ptr); - -/* Sets the build quality of the scene. */ -RTC_API void rtcSetSceneBuildQuality(RTCScene scene, enum RTCBuildQuality quality); - -/* Sets the scene flags. */ -RTC_API void rtcSetSceneFlags(RTCScene scene, enum RTCSceneFlags flags); - -/* Returns the scene flags. */ -RTC_API enum RTCSceneFlags rtcGetSceneFlags(RTCScene scene); - -/* Returns the axis-aligned bounds of the scene. */ -RTC_API void rtcGetSceneBounds(RTCScene scene, struct RTCBounds* bounds_o); - -/* Returns the linear axis-aligned bounds of the scene. */ -RTC_API void rtcGetSceneLinearBounds(RTCScene scene, struct RTCLinearBounds* bounds_o); - - -/* Perform a closest point query of the scene. */ -RTC_API bool rtcPointQuery(RTCScene scene, struct RTCPointQuery* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void* userPtr); - -/* Perform a closest point query with a packet of 4 points with the scene. */ -RTC_API bool rtcPointQuery4(const int* valid, RTCScene scene, struct RTCPointQuery4* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr); - -/* Perform a closest point query with a packet of 4 points with the scene. */ -RTC_API bool rtcPointQuery8(const int* valid, RTCScene scene, struct RTCPointQuery8* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr); - -/* Perform a closest point query with a packet of 4 points with the scene. */ -RTC_API bool rtcPointQuery16(const int* valid, RTCScene scene, struct RTCPointQuery16* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr); - -/* Intersects a single ray with the scene. */ -RTC_API void rtcIntersect1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit); - -/* Intersects a packet of 4 rays with the scene. */ -RTC_API void rtcIntersect4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit4* rayhit); - -/* Intersects a packet of 8 rays with the scene. */ -RTC_API void rtcIntersect8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit8* rayhit); - -/* Intersects a packet of 16 rays with the scene. */ -RTC_API void rtcIntersect16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit16* rayhit); - -/* Intersects a stream of M rays with the scene. */ -RTC_API void rtcIntersect1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit, unsigned int M, size_t byteStride); - -/* Intersects a stream of pointers to M rays with the scene. */ -RTC_API void rtcIntersect1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit** rayhit, unsigned int M); - -/* Intersects a stream of M ray packets of size N in SOA format with the scene. */ -RTC_API void rtcIntersectNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride); - -/* Intersects a stream of M ray packets of size N in SOA format with the scene. */ -RTC_API void rtcIntersectNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayHitNp* rayhit, unsigned int N); - -/* Tests a single ray for occlusion with the scene. */ -RTC_API void rtcOccluded1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray); - -/* Tests a packet of 4 rays for occlusion occluded with the scene. */ -RTC_API void rtcOccluded4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay4* ray); - -/* Tests a packet of 8 rays for occlusion with the scene. */ -RTC_API void rtcOccluded8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay8* ray); - -/* Tests a packet of 16 rays for occlusion with the scene. */ -RTC_API void rtcOccluded16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay16* ray); - -/* Tests a stream of M rays for occlusion with the scene. */ -RTC_API void rtcOccluded1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray, unsigned int M, size_t byteStride); - -/* Tests a stream of pointers to M rays for occlusion with the scene. */ -RTC_API void rtcOccluded1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay** ray, unsigned int M); - -/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */ -RTC_API void rtcOccludedNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride); - -/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */ -RTC_API void rtcOccludedNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayNp* ray, unsigned int N); - -/*! collision callback */ -struct RTCCollision { unsigned int geomID0; unsigned int primID0; unsigned int geomID1; unsigned int primID1; }; -typedef void (*RTCCollideFunc) (void* userPtr, struct RTCCollision* collisions, unsigned int num_collisions); - -/*! Performs collision detection of two scenes */ -RTC_API void rtcCollide (RTCScene scene0, RTCScene scene1, RTCCollideFunc callback, void* userPtr); - -#if defined(__cplusplus) - -/* Helper for easily combining scene flags */ -inline RTCSceneFlags operator|(RTCSceneFlags a, RTCSceneFlags b) { - return (RTCSceneFlags)((size_t)a | (size_t)b); -} - -#endif - -RTC_NAMESPACE_END - diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h deleted file mode 100644 index 755ce255fb..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h +++ /dev/null @@ -1,411 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../bvh/bvh.h" -#include "../geometry/primitive.h" -#include "../builders/bvh_builder_sah.h" -#include "../builders/heuristic_binning_array_aligned.h" -#include "../builders/heuristic_binning_array_unaligned.h" -#include "../builders/heuristic_strand_array.h" - -#define NUM_HAIR_OBJECT_BINS 32 - -namespace embree -{ - namespace isa - { - struct BVHBuilderHair - { - /*! settings for builder */ - struct Settings - { - /*! default settings */ - Settings () - : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {} - - public: - size_t branchingFactor; //!< branching factor of BVH to build - size_t maxDepth; //!< maximum depth of BVH to build - size_t logBlockSize; //!< log2 of blocksize for SAH heuristic - size_t minLeafSize; //!< minimum size of a leaf - size_t maxLeafSize; //!< maximum size of a leaf - size_t finished_range_threshold; //!< finished range threshold - }; - - template - - class BuilderT - { - ALIGNED_CLASS_(16); - friend struct BVHBuilderHair; - - typedef FastAllocator::CachedAllocator Allocator; - typedef HeuristicArrayBinningSAH HeuristicBinningSAH; - typedef UnalignedHeuristicArrayBinningSAH UnalignedHeuristicBinningSAH; - typedef HeuristicStrandSplit HeuristicStrandSplitSAH; - - static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor - static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth - static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build - - static const size_t travCostAligned = 1; - static const size_t travCostUnaligned = 5; - static const size_t intCost = 6; - - BuilderT (Scene* scene, - PrimRef* prims, - const CreateAllocFunc& createAlloc, - const CreateAABBNodeFunc& createAABBNode, - const SetAABBNodeFunc& setAABBNode, - const CreateOBBNodeFunc& createOBBNode, - const SetOBBNodeFunc& setOBBNode, - const CreateLeafFunc& createLeaf, - const ProgressMonitor& progressMonitor, - const ReportFinishedRangeFunc& reportFinishedRange, - const Settings settings) - - : cfg(settings), - prims(prims), - createAlloc(createAlloc), - createAABBNode(createAABBNode), - setAABBNode(setAABBNode), - createOBBNode(createOBBNode), - setOBBNode(setOBBNode), - createLeaf(createLeaf), - progressMonitor(progressMonitor), - reportFinishedRange(reportFinishedRange), - alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {} - - /*! checks if all primitives are from the same geometry */ - __forceinline bool sameGeometry(const PrimInfoRange& range) - { - if (range.size() == 0) return true; - unsigned int firstGeomID = prims[range.begin()].geomID(); - for (size_t i=range.begin()+1; i cfg.maxDepth) - throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); - - /* create leaf for few primitives */ - if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo)) - return createLeaf(prims,pinfo,alloc); - - /* fill all children by always splitting the largest one */ - PrimInfoRange children[MAX_BRANCHING_FACTOR]; - unsigned numChildren = 1; - children[0] = pinfo; - - do { - - /* find best child with largest bounding box area */ - int bestChild = -1; - size_t bestSize = 0; - for (unsigned i=0; i bestSize) { - bestSize = children[i].size(); - bestChild = i; - } - } - if (bestChild == -1) break; - - /*! split best child into left and right child */ - __aligned(64) PrimInfoRange left, right; - if (!sameGeometry(children[bestChild])) { - alignedHeuristic.splitByGeometry(children[bestChild],left,right); - } else { - alignedHeuristic.splitFallback(children[bestChild],left,right); - } - - /* add new children left and right */ - children[bestChild] = children[numChildren-1]; - children[numChildren-1] = left; - children[numChildren+0] = right; - numChildren++; - - } while (numChildren < cfg.branchingFactor); - - /* create node */ - auto node = createAABBNode(alloc); - - for (size_t i=0; i> cfg.logBlockSize; - const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds); - - /* try standard binning in aligned space */ - float alignedObjectSAH = inf; - HeuristicBinningSAH::Split alignedObjectSplit; - if (aligned) { - alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize); - alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH(); - bestSAH = min(alignedObjectSAH,bestSAH); - } - - /* try standard binning in unaligned space */ - UnalignedHeuristicBinningSAH::Split unalignedObjectSplit; - LinearSpace3fa uspace; - float unalignedObjectSAH = inf; - if (bestSAH > 0.7f*leafSAH) { - uspace = unalignedHeuristic.computeAlignedSpace(pinfo); - const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace); - unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace); - unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH(); - bestSAH = min(unalignedObjectSAH,bestSAH); - } - - /* try splitting into two strands */ - HeuristicStrandSplitSAH::Split strandSplit; - float strandSAH = inf; - if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) { - strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize); - strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH(); - bestSAH = min(strandSAH,bestSAH); - } - - /* fallback if SAH heuristics failed */ - if (unlikely(!std::isfinite(bestSAH))) - { - alignedHeuristic.deterministic_order(pinfo); - alignedHeuristic.splitFallback(pinfo,linfo,rinfo); - } - - /* perform aligned split if this is best */ - else if (bestSAH == alignedObjectSAH) { - alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo); - } - - /* perform unaligned split if this is best */ - else if (bestSAH == unalignedObjectSAH) { - unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo); - aligned = false; - } - - /* perform strand split if this is best */ - else if (bestSAH == strandSAH) { - strandHeuristic.split(strandSplit,pinfo,linfo,rinfo); - aligned = false; - } - - /* can never happen */ - else - assert(false); - } - - /*! recursive build */ - NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier) - { - /* get thread local allocator */ - if (!alloc) - alloc = createAlloc(); - - /* call memory monitor function to signal progress */ - if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD) - progressMonitor(pinfo.size()); - - PrimInfoRange children[MAX_BRANCHING_FACTOR]; - - /* create leaf node */ - if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) { - alignedHeuristic.deterministic_order(pinfo); - return createLargeLeaf(depth,pinfo,alloc); - } - - /* fill all children by always splitting the one with the largest surface area */ - size_t numChildren = 1; - children[0] = pinfo; - bool aligned = true; - - do { - - /* find best child with largest bounding box area */ - ssize_t bestChild = -1; - float bestArea = neg_inf; - for (size_t i=0; i bestArea) { - bestArea = area(children[i].geomBounds); - bestChild = i; - } - } - if (bestChild == -1) break; - - /*! split best child into left and right child */ - PrimInfoRange left, right; - split(children[bestChild],left,right,aligned); - - /* add new children left and right */ - children[bestChild] = children[numChildren-1]; - children[numChildren-1] = left; - children[numChildren+0] = right; - numChildren++; - - } while (numChildren < cfg.branchingFactor); - - NodeRef node; - - /* create aligned node */ - if (aligned) - { - node = createAABBNode(alloc); - - /* spawn tasks or ... */ - if (pinfo.size() > SINGLE_THREADED_THRESHOLD) - { - parallel_for(size_t(0), numChildren, [&] (const range& r) { - for (size_t i=r.begin(); i cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; - setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds); - _mm_mfence(); // to allow non-temporal stores during build - } - }); - } - /* ... continue sequentially */ - else { - for (size_t i=0; i cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; - setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds); - } - } - } - - /* create unaligned node */ - else - { - node = createOBBNode(alloc); - - /* spawn tasks or ... */ - if (pinfo.size() > SINGLE_THREADED_THRESHOLD) - { - parallel_for(size_t(0), numChildren, [&] (const range& r) { - for (size_t i=r.begin(); i cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; - setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds); - _mm_mfence(); // to allow non-temporal stores during build - } - }); - } - /* ... continue sequentially */ - else - { - for (size_t i=0; i cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; - setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds); - } - } - } - - /* reports a finished range of primrefs */ - if (unlikely(alloc_barrier)) - reportFinishedRange(pinfo); - - return node; - } - - private: - Settings cfg; - PrimRef* prims; - const CreateAllocFunc& createAlloc; - const CreateAABBNodeFunc& createAABBNode; - const SetAABBNodeFunc& setAABBNode; - const CreateOBBNodeFunc& createOBBNode; - const SetOBBNodeFunc& setOBBNode; - const CreateLeafFunc& createLeaf; - const ProgressMonitor& progressMonitor; - const ReportFinishedRangeFunc& reportFinishedRange; - - private: - HeuristicBinningSAH alignedHeuristic; - UnalignedHeuristicBinningSAH unalignedHeuristic; - HeuristicStrandSplitSAH strandHeuristic; - }; - - template - - static NodeRef build (const CreateAllocFunc& createAlloc, - const CreateAABBNodeFunc& createAABBNode, - const SetAABBNodeFunc& setAABBNode, - const CreateOBBNodeFunc& createOBBNode, - const SetOBBNodeFunc& setOBBNode, - const CreateLeafFunc& createLeaf, - const ProgressMonitor& progressMonitor, - const ReportFinishedRangeFunc& reportFinishedRange, - Scene* scene, - PrimRef* prims, - const PrimInfo& pinfo, - const Settings settings) - { - typedef BuilderT Builder; - - Builder builder(scene,prims,createAlloc, - createAABBNode,setAABBNode, - createOBBNode,setOBBNode, - createLeaf,progressMonitor,reportFinishedRange,settings); - - NodeRef root = builder.recurse(1,pinfo,nullptr,true,false); - _mm_mfence(); // to allow non-temporal stores during build - return root; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h deleted file mode 100644 index 92be2f7e65..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h +++ /dev/null @@ -1,501 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/builder.h" -#include "../../common/algorithms/parallel_reduce.h" - -namespace embree -{ - namespace isa - { - struct BVHBuilderMorton - { - static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor - static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth - - /*! settings for morton builder */ - struct Settings - { - /*! default settings */ - Settings () - : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {} - - /*! initialize settings from API settings */ - Settings (const RTCBuildArguments& settings) - : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) - { - if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; - if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; - if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; - if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; - - minLeafSize = min(minLeafSize,maxLeafSize); - } - - Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold) - : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold) - { - minLeafSize = min(minLeafSize,maxLeafSize); - } - - public: - size_t branchingFactor; //!< branching factor of BVH to build - size_t maxDepth; //!< maximum depth of BVH to build - size_t minLeafSize; //!< minimum size of a leaf - size_t maxLeafSize; //!< maximum size of a leaf - size_t singleThreadThreshold; //!< threshold when we switch to single threaded build - }; - - /*! Build primitive consisting of morton code and primitive ID. */ - struct __aligned(8) BuildPrim - { - union { - struct { - unsigned int code; //!< morton code - unsigned int index; //!< i'th primitive - }; - uint64_t t; - }; - - /*! interface for radix sort */ - __forceinline operator unsigned() const { return code; } - - /*! interface for standard sort */ - __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; } - }; - - /*! maps bounding box to morton code */ - struct MortonCodeMapping - { - static const size_t LATTICE_BITS_PER_DIM = 10; - static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM; - - vfloat4 base; - vfloat4 scale; - - __forceinline MortonCodeMapping(const BBox3fa& bounds) - { - base = (vfloat4)bounds.lower; - const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower; - scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f)); - } - - __forceinline const vint4 bin (const BBox3fa& box) const - { - const vfloat4 lower = (vfloat4)box.lower; - const vfloat4 upper = (vfloat4)box.upper; - const vfloat4 centroid = lower+upper; - return vint4((centroid-base)*scale); - } - - __forceinline unsigned int code (const BBox3fa& box) const - { - const vint4 binID = bin(box); - const unsigned int x = extract<0>(binID); - const unsigned int y = extract<1>(binID); - const unsigned int z = extract<2>(binID); - const unsigned int xyz = bitInterleave(x,y,z); - return xyz; - } - }; - -#if defined (__AVX2__) - - /*! for AVX2 there is a fast scalar bitInterleave */ - struct MortonCodeGenerator - { - __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) - : mapping(mapping), dest(dest) {} - - __forceinline void operator() (const BBox3fa& b, const unsigned index) - { - dest->index = index; - dest->code = mapping.code(b); - dest++; - } - - public: - const MortonCodeMapping mapping; - BuildPrim* dest; - size_t currentID; - }; - -#else - - /*! before AVX2 is it better to use the SSE version of bitInterleave */ - struct MortonCodeGenerator - { - __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) - : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {} - - __forceinline ~MortonCodeGenerator() - { - if (slots != 0) - { - const vint4 code = bitInterleave(ax,ay,az); - for (size_t i=0; i(binID); - ay[slots] = extract<1>(binID); - az[slots] = extract<2>(binID); - ai[slots] = index; - slots++; - currentID++; - - if (slots == 4) - { - const vint4 code = bitInterleave(ax,ay,az); - vint4::storeu(&dest[currentID-4],unpacklo(code,ai)); - vint4::storeu(&dest[currentID-2],unpackhi(code,ai)); - slots = 0; - } - } - - public: - const MortonCodeMapping mapping; - BuildPrim* dest; - size_t currentID; - size_t slots; - vint4 ax, ay, az, ai; - }; - -#endif - - template< - typename ReductionTy, - typename Allocator, - typename CreateAllocator, - typename CreateNodeFunc, - typename SetNodeBoundsFunc, - typename CreateLeafFunc, - typename CalculateBounds, - typename ProgressMonitor> - - class BuilderT : private Settings - { - ALIGNED_CLASS_(16); - - public: - - BuilderT (CreateAllocator& createAllocator, - CreateNodeFunc& createNode, - SetNodeBoundsFunc& setBounds, - CreateLeafFunc& createLeaf, - CalculateBounds& calculateBounds, - ProgressMonitor& progressMonitor, - const Settings& settings) - - : Settings(settings), - createAllocator(createAllocator), - createNode(createNode), - setBounds(setBounds), - createLeaf(createLeaf), - calculateBounds(calculateBounds), - progressMonitor(progressMonitor), - morton(nullptr) {} - - ReductionTy createLargeLeaf(size_t depth, const range& current, Allocator alloc) - { - /* this should never occur but is a fatal error */ - if (depth > maxDepth) - throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); - - /* create leaf for few primitives */ - if (current.size() <= maxLeafSize) - return createLeaf(current,alloc); - - /* fill all children by always splitting the largest one */ - range children[MAX_BRANCHING_FACTOR]; - size_t numChildren = 1; - children[0] = current; - - do { - - /* find best child with largest number of primitives */ - size_t bestChild = -1; - size_t bestSize = 0; - for (size_t i=0; i bestSize) { - bestSize = children[i].size(); - bestChild = i; - } - } - if (bestChild == size_t(-1)) break; - - /*! split best child into left and right child */ - auto split = children[bestChild].split(); - - /* add new children left and right */ - children[bestChild] = children[numChildren-1]; - children[numChildren-1] = split.first; - children[numChildren+0] = split.second; - numChildren++; - - } while (numChildren < branchingFactor); - - /* create node */ - auto node = createNode(alloc,numChildren); - - /* recurse into each child */ - ReductionTy bounds[MAX_BRANCHING_FACTOR]; - for (size_t i=0; i& current) const - { - /* fast path for small ranges */ - if (likely(current.size() < 1024)) - { - /*! recalculate centroid bounds */ - BBox3fa centBounds(empty); - for (size_t i=current.begin(); i& r ) { - BBox3fa centBounds = empty; - for (size_t i=r.begin(); i& r ) { - for (size_t i=r.begin(); i& current, range& left, range& right) const - { - const unsigned int code_start = morton[current.begin()].code; - const unsigned int code_end = morton[current.end()-1].code; - unsigned int bitpos = lzcnt(code_start^code_end); - - /* if all items mapped to same morton code, then re-create new morton codes for the items */ - if (unlikely(bitpos == 32)) - { - recreateMortonCodes(current); - const unsigned int code_start = morton[current.begin()].code; - const unsigned int code_end = morton[current.end()-1].code; - bitpos = lzcnt(code_start^code_end); - - /* if the morton code is still the same, goto fall back split */ - if (unlikely(bitpos == 32)) { - current.split(left,right); - return; - } - } - - /* split the items at the topmost different morton code bit */ - const unsigned int bitpos_diff = 31-bitpos; - const unsigned int bitmask = 1 << bitpos_diff; - - /* find location where bit differs using binary search */ - unsigned begin = current.begin(); - unsigned end = current.end(); - while (begin + 1 != end) { - const unsigned mid = (begin+end)/2; - const unsigned bit = morton[mid].code & bitmask; - if (bit == 0) begin = mid; else end = mid; - } - unsigned center = end; -#if defined(DEBUG) - for (unsigned int i=begin; i& current, Allocator alloc, bool toplevel) - { - /* get thread local allocator */ - if (!alloc) - alloc = createAllocator(); - - /* call memory monitor function to signal progress */ - if (toplevel && current.size() <= singleThreadThreshold) - progressMonitor(current.size()); - - /* create leaf node */ - if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize)) - return createLargeLeaf(depth,current,alloc); - - /* fill all children by always splitting the one with the largest surface area */ - range children[MAX_BRANCHING_FACTOR]; - split(current,children[0],children[1]); - size_t numChildren = 2; - - while (numChildren < branchingFactor) - { - /* find best child with largest number of primitives */ - int bestChild = -1; - unsigned bestItems = 0; - for (unsigned int i=0; i bestItems) { - bestItems = children[i].size(); - bestChild = i; - } - } - if (bestChild == -1) break; - - /*! split best child into left and right child */ - range left, right; - split(children[bestChild],left,right); - - /* add new children left and right */ - children[bestChild] = children[numChildren-1]; - children[numChildren-1] = left; - children[numChildren+0] = right; - numChildren++; - } - - /* create leaf node if no split is possible */ - if (unlikely(numChildren == 1)) - return createLeaf(current,alloc); - - /* allocate node */ - auto node = createNode(alloc,numChildren); - - /* process top parts of tree parallel */ - ReductionTy bounds[MAX_BRANCHING_FACTOR]; - if (current.size() > singleThreadThreshold) - { - /*! parallel_for is faster than spawing sub-tasks */ - parallel_for(size_t(0), numChildren, [&] (const range& r) { - for (size_t i=r.begin(); i(0,(unsigned)numPrimitives), nullptr, true); - _mm_mfence(); // to allow non-temporal stores during build - return root; - } - - public: - CreateAllocator& createAllocator; - CreateNodeFunc& createNode; - SetNodeBoundsFunc& setBounds; - CreateLeafFunc& createLeaf; - CalculateBounds& calculateBounds; - ProgressMonitor& progressMonitor; - - public: - BuildPrim* morton; - }; - - - template< - typename ReductionTy, - typename CreateAllocFunc, - typename CreateNodeFunc, - typename SetBoundsFunc, - typename CreateLeafFunc, - typename CalculateBoundsFunc, - typename ProgressMonitor> - - static ReductionTy build(CreateAllocFunc createAllocator, - CreateNodeFunc createNode, - SetBoundsFunc setBounds, - CreateLeafFunc createLeaf, - CalculateBoundsFunc calculateBounds, - ProgressMonitor progressMonitor, - BuildPrim* src, - BuildPrim* tmp, - size_t numPrimitives, - const Settings& settings) - { - typedef BuilderT< - ReductionTy, - decltype(createAllocator()), - CreateAllocFunc, - CreateNodeFunc, - SetBoundsFunc, - CreateLeafFunc, - CalculateBoundsFunc, - ProgressMonitor> Builder; - - Builder builder(createAllocator, - createNode, - setBounds, - createLeaf, - calculateBounds, - progressMonitor, - settings); - - return builder.build(src,tmp,numPrimitives); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h deleted file mode 100644 index 4c138dacdb..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h +++ /dev/null @@ -1,692 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#define MBLUR_NUM_TEMPORAL_BINS 2 -#define MBLUR_NUM_OBJECT_BINS 32 - -#include "../bvh/bvh.h" -#include "../common/primref_mb.h" -#include "heuristic_binning_array_aligned.h" -#include "heuristic_timesplit_array.h" - -namespace embree -{ - namespace isa - { - template - struct SharedVector - { - __forceinline SharedVector() {} - - __forceinline SharedVector(T* ptr, size_t refCount = 1) - : prims(ptr), refCount(refCount) {} - - __forceinline void incRef() { - refCount++; - } - - __forceinline void decRef() - { - if (--refCount == 0) - delete prims; - } - - T* prims; - size_t refCount; - }; - - template - struct LocalChildListT - { - typedef SharedVector> SharedPrimRefVector; - - __forceinline LocalChildListT (const BuildRecord& record) - : numChildren(1), numSharedPrimVecs(1) - { - /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */ - children[0] = record; - primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2); - } - - __forceinline ~LocalChildListT() - { - for (size_t i = 0; i < numChildren; i++) - primvecs[i]->decRef(); - } - - __forceinline BuildRecord& operator[] ( const size_t i ) { - return children[i]; - } - - __forceinline size_t size() const { - return numChildren; - } - - __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr> new_vector) - { - SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild]; - if (lrecord.prims.prims == bsharedPrimVec->prims) { - primvecs[bestChild] = bsharedPrimVec; - bsharedPrimVec->incRef(); - } - else { - primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims); - } - - if (rrecord.prims.prims == bsharedPrimVec->prims) { - primvecs[numChildren] = bsharedPrimVec; - bsharedPrimVec->incRef(); - } - else { - primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims); - } - bsharedPrimVec->decRef(); - new_vector.release(); - - children[bestChild] = lrecord; - children[numChildren] = rrecord; - numChildren++; - } - - public: - array_t children; - array_t primvecs; - size_t numChildren; - - array_t sharedPrimVecs; - size_t numSharedPrimVecs; - }; - - template - struct RecalculatePrimRef - { - Scene* scene; - - __forceinline RecalculatePrimRef (Scene* scene) - : scene(scene) {} - - __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const - { - const unsigned geomID = prim.geomID(); - const unsigned primID = prim.primID(); - const Mesh* mesh = scene->get(geomID); - const LBBox3fa lbounds = mesh->linearBounds(primID, time_range); - const range tbounds = mesh->timeSegmentRange(time_range); - return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); - } - - // __noinline is workaround for ICC16 bug under MacOSX - __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const - { - const unsigned geomID = prim.geomID(); - const unsigned primID = prim.primID(); - const Mesh* mesh = scene->get(geomID); - const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range); - const range tbounds = mesh->timeSegmentRange(time_range); - return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); - } - - __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { - return scene->get(prim.geomID())->linearBounds(prim.primID(), time_range); - } - - // __noinline is workaround for ICC16 bug under MacOSX - __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { - return scene->get(prim.geomID())->linearBounds(space, prim.primID(), time_range); - } - }; - - struct VirtualRecalculatePrimRef - { - Scene* scene; - - __forceinline VirtualRecalculatePrimRef (Scene* scene) - : scene(scene) {} - - __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const - { - const unsigned geomID = prim.geomID(); - const unsigned primID = prim.primID(); - const Geometry* mesh = scene->get(geomID); - const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range); - const range tbounds = mesh->timeSegmentRange(time_range); - return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); - } - - __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const - { - const unsigned geomID = prim.geomID(); - const unsigned primID = prim.primID(); - const Geometry* mesh = scene->get(geomID); - const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range); - const range tbounds = mesh->timeSegmentRange(time_range); - return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); - } - - __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { - return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range); - } - - __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { - return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range); - } - }; - - struct BVHBuilderMSMBlur - { - /*! settings for msmblur builder */ - struct Settings - { - /*! default settings */ - Settings () - : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8), - travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false), - singleThreadThreshold(1024) {} - - - Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold) - : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), - travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold) - { - minLeafSize = min(minLeafSize,maxLeafSize); - } - - public: - size_t branchingFactor; //!< branching factor of BVH to build - size_t maxDepth; //!< maximum depth of BVH to build - size_t logBlockSize; //!< log2 of blocksize for SAH heuristic - size_t minLeafSize; //!< minimum size of a leaf - size_t maxLeafSize; //!< maximum size of a leaf - float travCost; //!< estimated cost of one traversal step - float intCost; //!< estimated cost of one primitive intersection - bool singleLeafTimeSegment; //!< split time to single time range - size_t singleThreadThreshold; //!< threshold when we switch to single threaded build - }; - - struct BuildRecord - { - public: - __forceinline BuildRecord () {} - - __forceinline BuildRecord (size_t depth) - : depth(depth) {} - - __forceinline BuildRecord (const SetMB& prims, size_t depth) - : depth(depth), prims(prims) {} - - __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) { - return a.prims.size() < b.prims.size(); - } - - __forceinline size_t size() const { - return prims.size(); - } - - public: - size_t depth; //!< Depth of the root of this subtree. - SetMB prims; //!< The list of primitives. - }; - - struct BuildRecordSplit : public BuildRecord - { - __forceinline BuildRecordSplit () {} - - __forceinline BuildRecordSplit (size_t depth) - : BuildRecord(depth) {} - - __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit& split) - : BuildRecord(record), split(split) {} - - BinSplit split; - }; - - template< - typename NodeRef, - typename RecalculatePrimRef, - typename Allocator, - typename CreateAllocFunc, - typename CreateNodeFunc, - typename SetNodeFunc, - typename CreateLeafFunc, - typename ProgressMonitor> - - class BuilderT - { - ALIGNED_CLASS_(16); - static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor - static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth - - typedef BVHNodeRecordMB4D NodeRecordMB4D; - typedef BinSplit Split; - typedef mvector* PrimRefVector; - typedef SharedVector> SharedPrimRefVector; - typedef LocalChildListT LocalChildList; - typedef LocalChildListT LocalChildListSplit; - - public: - - BuilderT (MemoryMonitorInterface* device, - const RecalculatePrimRef recalculatePrimRef, - const CreateAllocFunc createAlloc, - const CreateNodeFunc createNode, - const SetNodeFunc setNode, - const CreateLeafFunc createLeaf, - const ProgressMonitor progressMonitor, - const Settings& settings) - : cfg(settings), - heuristicObjectSplit(), - heuristicTemporalSplit(device, recalculatePrimRef), - recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf), - progressMonitor(progressMonitor) - { - if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) - throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); - } - - /*! finds the best split */ - const Split find(const SetMB& set) - { - /* first try standard object split */ - const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize); - const float object_split_sah = object_split.splitSAH(); - - /* test temporal splits only when object split was bad */ - const float leaf_sah = set.leafSAH(cfg.logBlockSize); - if (object_split_sah < 0.50f*leaf_sah) - return object_split; - - /* do temporal splits only if the the time range is big enough */ - if (set.time_range.size() > 1.01f/float(set.max_num_time_segments)) - { - const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize); - const float temporal_split_sah = temporal_split.splitSAH(); - - /* take temporal split if it improved SAH */ - if (temporal_split_sah < object_split_sah) - return temporal_split; - } - - return object_split; - } - - /*! array partitioning */ - __forceinline std::unique_ptr> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) - { - /* perform object split */ - if (likely(split.data == Split::SPLIT_OBJECT)) { - heuristicObjectSplit.split(split,set,lset,rset); - } - /* perform temporal split */ - else if (likely(split.data == Split::SPLIT_TEMPORAL)) { - return heuristicTemporalSplit.split(split,set,lset,rset); - } - /* perform fallback split */ - else if (unlikely(split.data == Split::SPLIT_FALLBACK)) { - set.deterministic_order(); - splitFallback(set,lset,rset); - } - /* split by geometry */ - else if (unlikely(split.data == Split::SPLIT_GEOMID)) { - set.deterministic_order(); - splitByGeometry(set,lset,rset); - } - else - assert(false); - - return std::unique_ptr>(); - } - - /*! finds the best fallback split */ - __noinline Split findFallback(const SetMB& set) - { - /* split if primitives are not from same geometry */ - if (!sameGeometry(set)) - return Split(0.0f,Split::SPLIT_GEOMID); - - /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */ - if (cfg.singleLeafTimeSegment) - { - /* test if one primitive has more than one time segment in time range, if so split time */ - for (size_t i=set.begin(); i itime_range = prim.timeSegmentRange(set.time_range); - const int localTimeSegments = itime_range.size(); - assert(localTimeSegments > 0); - if (localTimeSegments > 1) { - const int icenter = (itime_range.begin() + itime_range.end())/2; - const float splitTime = prim.timeStep(icenter); - return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime); - } - } - } - - /* otherwise return fallback split */ - return Split(0.0f,Split::SPLIT_FALLBACK); - } - - /*! performs fallback split */ - void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset) - { - mvector& prims = *set.prims; - - const size_t begin = set.begin(); - const size_t end = set.end(); - const size_t center = (begin + end)/2; - - PrimInfoMB linfo = empty; - for (size_t i=begin; i(begin,center),set.time_range); - new (&rset) SetMB(rinfo,set.prims,range(center,end ),set.time_range); - } - - /*! checks if all primitives are from the same geometry */ - __forceinline bool sameGeometry(const SetMB& set) - { - if (set.size() == 0) return true; - mvector& prims = *set.prims; - const size_t begin = set.begin(); - const size_t end = set.end(); - unsigned int firstGeomID = prims[begin].geomID(); - for (size_t i=begin+1; i 1); - - mvector& prims = *set.prims; - const size_t begin = set.begin(); - const size_t end = set.end(); - - PrimInfoMB left(empty); - PrimInfoMB right(empty); - unsigned int geomID = prims[begin].geomID(); - size_t center = serial_partitioning(prims.data(),begin,end,left,right, - [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, - [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); }); - - new (&lset) SetMB(left, set.prims,range(begin,center),set.time_range); - new (&rset) SetMB(right,set.prims,range(center,end ),set.time_range); - } - - const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc) - { - /* this should never occur but is a fatal error */ - if (in.depth > cfg.maxDepth) - throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); - - /* replace already found split by fallback split */ - const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims)); - - /* special case when directly creating leaf without any splits that could shrink time_range */ - bool force_split = false; - if (current.depth == 1 && current.size() > 0) - { - BBox1f c = empty; - BBox1f p = current.prims.time_range; - for (size_t i=current.prims.begin(); i& prims = *current.prims.prims; - c.extend(prims[i].time_range); - } - - force_split = c.lower > p.lower || c.upper < p.upper; - } - - /* create leaf for few primitives */ - if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split) - return createLeaf(current,alloc); - - /* fill all children by always splitting the largest one */ - bool hasTimeSplits = false; - NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; - LocalChildListSplit children(current); - - do { - /* find best child with largest bounding box area */ - size_t bestChild = -1; - size_t bestSize = 0; - for (size_t i=0; i bestSize) { - bestSize = children[i].size(); - bestChild = i; - } - } - if (bestChild == -1) break; - - /* perform best found split */ - BuildRecordSplit& brecord = children[bestChild]; - BuildRecordSplit lrecord(current.depth+1); - BuildRecordSplit rrecord(current.depth+1); - std::unique_ptr> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims); - hasTimeSplits |= new_vector != nullptr; - - /* find new splits */ - lrecord.split = findFallback(lrecord.prims); - rrecord.split = findFallback(rrecord.prims); - children.split(bestChild,lrecord,rrecord,std::move(new_vector)); - - } while (children.size() < cfg.branchingFactor); - - /* detect time_ranges that have shrunken */ - for (size_t i=0; i p.lower || c.upper < p.upper; - } - - /* create node */ - auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits); - - /* recurse into each child and perform reduction */ - LBBox3fa gbounds = empty; - for (size_t i=0; i= 0) && (splitSAH >= 0))); - - /*! create a leaf node when threshold reached or SAH tells us to stop */ - if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { - current.prims.deterministic_order(); - return createLargeLeaf(current,alloc); - } - - /*! perform initial split */ - SetMB lprims,rprims; - std::unique_ptr> new_vector = split(csplit,current.prims,lprims,rprims); - bool hasTimeSplits = new_vector != nullptr; - NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; - LocalChildList children(current); - { - BuildRecord lrecord(lprims,current.depth+1); - BuildRecord rrecord(rprims,current.depth+1); - children.split(0,lrecord,rrecord,std::move(new_vector)); - } - - /*! split until node is full or SAH tells us to stop */ - while (children.size() < cfg.branchingFactor) - { - /*! find best child to split */ - float bestArea = neg_inf; - ssize_t bestChild = -1; - for (size_t i=0; i bestArea) { - bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds); - } - } - if (bestChild == -1) break; - - /* perform split */ - BuildRecord& brecord = children[bestChild]; - BuildRecord lrecord(current.depth+1); - BuildRecord rrecord(current.depth+1); - Split csplit = find(brecord.prims); - std::unique_ptr> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims); - hasTimeSplits |= new_vector != nullptr; - children.split(bestChild,lrecord,rrecord,std::move(new_vector)); - } - - /* detect time_ranges that have shrunken */ - for (size_t i=0; i p.lower || c.upper < p.upper; - } - - /* sort buildrecords for simpler shadow ray traversal */ - //std::sort(&children[0],&children[children.size()],std::greater()); // FIXME: reduces traversal performance of bvh8.triangle4 (need to verified) !! - - /*! create an inner node */ - auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits); - LBBox3fa gbounds = empty; - - /* spawn tasks */ - if (unlikely(current.size() > cfg.singleThreadThreshold)) - { - /*! parallel_for is faster than spawing sub-tasks */ - parallel_for(size_t(0), children.size(), [&] (const range& r) { - for (size_t i=r.begin(); i=0; i--) { - values[i] = recurse(children[i],alloc,false); - gbounds.extend(values[i].lbounds); - } - } - - setNode(current,children.children.data(),node,values,children.numChildren); - - /* calculate geometry bounds of this node */ - if (unlikely(hasTimeSplits)) - return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); - else - return NodeRecordMB4D(node,gbounds,current.prims.time_range); - } - - /*! builder entry function */ - __forceinline const NodeRecordMB4D operator() (mvector& prims, const PrimInfoMB& pinfo) - { - const SetMB set(pinfo,&prims); - auto ret = recurse(BuildRecord(set,1),nullptr,true); - _mm_mfence(); // to allow non-temporal stores during build - return ret; - } - - private: - Settings cfg; - HeuristicArrayBinningMB heuristicObjectSplit; - HeuristicMBlurTemporalSplit heuristicTemporalSplit; - const RecalculatePrimRef recalculatePrimRef; - const CreateAllocFunc createAlloc; - const CreateNodeFunc createNode; - const SetNodeFunc setNode; - const CreateLeafFunc createLeaf; - const ProgressMonitor progressMonitor; - }; - - template - - static const BVHNodeRecordMB4D build(mvector& prims, - const PrimInfoMB& pinfo, - MemoryMonitorInterface* device, - const RecalculatePrimRef recalculatePrimRef, - const CreateAllocFunc createAlloc, - const CreateNodeFunc createNode, - const SetNodeFunc setNode, - const CreateLeafFunc createLeaf, - const ProgressMonitorFunc progressMonitor, - const Settings& settings) - { - typedef BuilderT< - NodeRef, - RecalculatePrimRef, - decltype(createAlloc()), - CreateAllocFunc, - CreateNodeFunc, - SetNodeFunc, - CreateLeafFunc, - ProgressMonitorFunc> Builder; - - Builder builder(device, - recalculatePrimRef, - createAlloc, - createNode, - setNode, - createLeaf, - progressMonitor, - settings); - - - return builder(prims,pinfo); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h deleted file mode 100644 index e477c313a3..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../bvh/bvh.h" -#include "../geometry/primitive.h" -#include "../builders/bvh_builder_msmblur.h" -#include "../builders/heuristic_binning_array_aligned.h" -#include "../builders/heuristic_binning_array_unaligned.h" -#include "../builders/heuristic_timesplit_array.h" - -namespace embree -{ - namespace isa - { - struct BVHBuilderHairMSMBlur - { - /*! settings for msmblur builder */ - struct Settings - { - /*! default settings */ - Settings () - : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8) {} - - public: - size_t branchingFactor; //!< branching factor of BVH to build - size_t maxDepth; //!< maximum depth of BVH to build - size_t logBlockSize; //!< log2 of blocksize for SAH heuristic - size_t minLeafSize; //!< minimum size of a leaf - size_t maxLeafSize; //!< maximum size of a leaf - }; - - struct BuildRecord - { - public: - __forceinline BuildRecord () {} - - __forceinline BuildRecord (size_t depth) - : depth(depth) {} - - __forceinline BuildRecord (const SetMB& prims, size_t depth) - : depth(depth), prims(prims) {} - - __forceinline size_t size() const { - return prims.size(); - } - - public: - size_t depth; //!< depth of the root of this subtree - SetMB prims; //!< the list of primitives - }; - - template - - class BuilderT - { - ALIGNED_CLASS_(16); - - static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor - static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth - static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build - - typedef BVHNodeRecordMB NodeRecordMB; - typedef BVHNodeRecordMB4D NodeRecordMB4D; - - typedef FastAllocator::CachedAllocator Allocator; - typedef LocalChildListT LocalChildList; - - typedef HeuristicMBlurTemporalSplit HeuristicTemporal; - typedef HeuristicArrayBinningMB HeuristicBinning; - typedef UnalignedHeuristicArrayBinningMB UnalignedHeuristicBinning; - - public: - - BuilderT (Scene* scene, - const RecalculatePrimRef& recalculatePrimRef, - const CreateAllocFunc& createAlloc, - const CreateAABBNodeMBFunc& createAABBNodeMB, - const SetAABBNodeMBFunc& setAABBNodeMB, - const CreateOBBNodeMBFunc& createOBBNodeMB, - const SetOBBNodeMBFunc& setOBBNodeMB, - const CreateLeafFunc& createLeaf, - const ProgressMonitor& progressMonitor, - const Settings settings) - - : cfg(settings), - scene(scene), - recalculatePrimRef(recalculatePrimRef), - createAlloc(createAlloc), - createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB), - createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB), - createLeaf(createLeaf), - progressMonitor(progressMonitor), - unalignedHeuristic(scene), - temporalSplitHeuristic(scene->device,recalculatePrimRef) {} - - private: - - /*! checks if all primitives are from the same geometry */ - __forceinline bool sameGeometry(const SetMB& set) - { - mvector& prims = *set.prims; - unsigned int firstGeomID = prims[set.begin()].geomID(); - for (size_t i=set.begin()+1; i& prims = *set.prims; - - const size_t begin = set.begin(); - const size_t end = set.end(); - const size_t center = (begin + end)/2; - - PrimInfoMB linfo = empty; - for (size_t i=begin; i(begin,center),set.time_range); - new (&rset) SetMB(rinfo,set.prims,range(center,end ),set.time_range); - } - - void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset) - { - assert(set.size() > 1); - const size_t begin = set.begin(); - const size_t end = set.end(); - PrimInfoMB linfo(empty); - PrimInfoMB rinfo(empty); - unsigned int geomID = (*set.prims)[begin].geomID(); - size_t center = serial_partitioning(set.prims->data(),begin,end,linfo,rinfo, - [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, - [ ] ( PrimInfoMB& a, const PrimRefMB& ref ) { a.add_primref(ref); }); - - new (&lset) SetMB(linfo,set.prims,range(begin,center),set.time_range); - new (&rset) SetMB(rinfo,set.prims,range(center,end ),set.time_range); - } - - /*! creates a large leaf that could be larger than supported by the BVH */ - NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc) - { - /* this should never occur but is a fatal error */ - if (current.depth > cfg.maxDepth) - throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); - - /* special case when directly creating leaf without any splits that could shrink time_range */ - bool force_split = false; - if (current.depth == 1 && current.size() > 0) - { - BBox1f c = empty; - BBox1f p = current.prims.time_range; - for (size_t i=current.prims.begin(); i& prims = *current.prims.prims; - c.extend(prims[i].time_range); - } - - force_split = c.lower > p.lower || c.upper < p.upper; - } - - /* create leaf for few primitives */ - if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split) - return createLeaf(current.prims,alloc); - - /* fill all children by always splitting the largest one */ - LocalChildList children(current); - NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; - - do { - - /* find best child with largest bounding box area */ - int bestChild = -1; - size_t bestSize = 0; - for (unsigned i=0; i bestSize) { - bestSize = children[i].size(); - bestChild = i; - } - } - if (bestChild == -1) break; - - /*! split best child into left and right child */ - BuildRecord left(current.depth+1); - BuildRecord right(current.depth+1); - if (!sameGeometry(children[bestChild].prims)) { - splitByGeometry(children[bestChild].prims,left.prims,right.prims); - } else { - splitFallback(children[bestChild].prims,left.prims,right.prims); - } - children.split(bestChild,left,right,std::unique_ptr>()); - - } while (children.size() < cfg.branchingFactor); - - - /* detect time_ranges that have shrunken */ - bool timesplit = false; - for (size_t i=0; i p.lower || c.upper < p.upper; - } - - /* create node */ - NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit); - - LBBox3fa bounds = empty; - for (size_t i=0; i> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit) - { - /* variable to track the SAH of the best splitting approach */ - float bestSAH = inf; - const float leafSAH = current.prims.leafSAH(cfg.logBlockSize); - - /* perform standard binning in aligned space */ - HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize); - float alignedObjectSAH = alignedObjectSplit.splitSAH(); - bestSAH = min(alignedObjectSAH,bestSAH); - - /* perform standard binning in unaligned space */ - UnalignedHeuristicBinning::Split unalignedObjectSplit; - LinearSpace3fa uspace; - float unalignedObjectSAH = inf; - if (alignedObjectSAH > 0.7f*leafSAH) { - uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims); - const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace); - unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace); - unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive - bestSAH = min(unalignedObjectSAH,bestSAH); - } - - /* do temporal splits only if previous approaches failed to produce good SAH and the the time range is large enough */ - float temporal_split_sah = inf; - typename HeuristicTemporal::Split temporal_split; - if (bestSAH > 0.5f*leafSAH) { - if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) { - temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize); - temporal_split_sah = temporal_split.splitSAH(); - bestSAH = min(temporal_split_sah,bestSAH); - } - } - - /* perform fallback split if SAH heuristics failed */ - if (unlikely(!std::isfinite(bestSAH))) { - current.prims.deterministic_order(); - splitFallback(current.prims,lrecord.prims,rrecord.prims); - } - /* perform aligned split if this is best */ - else if (likely(bestSAH == alignedObjectSAH)) { - alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims); - } - /* perform unaligned split if this is best */ - else if (likely(bestSAH == unalignedObjectSAH)) { - unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims); - aligned = false; - } - /* perform temporal split if this is best */ - else if (likely(bestSAH == temporal_split_sah)) { - timesplit = true; - return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims); - } - else - assert(false); - - return std::unique_ptr>(); - } - - /*! recursive build */ - NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel) - { - /* get thread local allocator */ - if (!alloc) - alloc = createAlloc(); - - /* call memory monitor function to signal progress */ - if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD) - progressMonitor(current.size()); - - /* create leaf node */ - if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) { - current.prims.deterministic_order(); - return createLargeLeaf(current,alloc); - } - - /* fill all children by always splitting the one with the largest surface area */ - NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; - LocalChildList children(current); - bool aligned = true; - bool timesplit = false; - - do { - - /* find best child with largest bounding box area */ - ssize_t bestChild = -1; - float bestArea = neg_inf; - for (size_t i=0; i bestArea) { - bestArea = children[i].prims.halfArea(); - bestChild = i; - } - } - if (bestChild == -1) break; - - /*! split best child into left and right child */ - BuildRecord left(current.depth+1); - BuildRecord right(current.depth+1); - std::unique_ptr> new_vector = split(children[bestChild],left,right,aligned,timesplit); - children.split(bestChild,left,right,std::move(new_vector)); - - } while (children.size() < cfg.branchingFactor); - - /* detect time_ranges that have shrunken */ - for (size_t i=0; i p.lower || c.upper < p.upper; - } - - /* create time split node */ - if (timesplit) - { - const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true); - - /* spawn tasks or ... */ - if (current.size() > SINGLE_THREADED_THRESHOLD) - { - parallel_for(size_t(0), children.size(), [&] (const range& r) { - for (size_t i=r.begin(); i SINGLE_THREADED_THRESHOLD) - { - LBBox3fa cbounds[MAX_BRANCHING_FACTOR]; - parallel_for(size_t(0), children.size(), [&] (const range& r) { - for (size_t i=r.begin(); i SINGLE_THREADED_THRESHOLD) - { - parallel_for(size_t(0), children.size(), [&] (const range& r) { - for (size_t i=r.begin(); i& prims, const PrimInfoMB& pinfo) - { - BuildRecord record(SetMB(pinfo,&prims),1); - auto root = recurse(record,nullptr,true); - _mm_mfence(); // to allow non-temporal stores during build - return root; - } - - private: - Settings cfg; - Scene* scene; - const RecalculatePrimRef& recalculatePrimRef; - const CreateAllocFunc& createAlloc; - const CreateAABBNodeMBFunc& createAABBNodeMB; - const SetAABBNodeMBFunc& setAABBNodeMB; - const CreateOBBNodeMBFunc& createOBBNodeMB; - const SetOBBNodeMBFunc& setOBBNodeMB; - const CreateLeafFunc& createLeaf; - const ProgressMonitor& progressMonitor; - - private: - HeuristicBinning alignedHeuristic; - UnalignedHeuristicBinning unalignedHeuristic; - HeuristicTemporal temporalSplitHeuristic; - }; - - template - - static BVHNodeRecordMB4D build (Scene* scene, mvector& prims, const PrimInfoMB& pinfo, - const RecalculatePrimRef& recalculatePrimRef, - const CreateAllocFunc& createAlloc, - const CreateAABBNodeMBFunc& createAABBNodeMB, - const SetAABBNodeMBFunc& setAABBNodeMB, - const CreateOBBNodeMBFunc& createOBBNodeMB, - const SetOBBNodeMBFunc& setOBBNodeMB, - const CreateLeafFunc& createLeaf, - const ProgressMonitor& progressMonitor, - const Settings settings) - { - typedef BuilderT Builder; - - Builder builder(scene,recalculatePrimRef,createAlloc, - createAABBNodeMB,setAABBNodeMB, - createOBBNodeMB,setOBBNodeMB, - createLeaf,progressMonitor,settings); - - return builder(prims,pinfo); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h deleted file mode 100644 index 3f7e678a10..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h +++ /dev/null @@ -1,669 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "heuristic_binning_array_aligned.h" -#include "heuristic_spatial_array.h" -#include "heuristic_openmerge_array.h" - -#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL -# define NUM_OBJECT_BINS 16 -# define NUM_SPATIAL_BINS 16 -#else -# define NUM_OBJECT_BINS 32 -# define NUM_SPATIAL_BINS 16 -#endif - -namespace embree -{ - namespace isa - { - MAYBE_UNUSED static const float travCost = 1.0f; - MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024; - - struct GeneralBVHBuilder - { - static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor - static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth - - - /*! settings for SAH builder */ - struct Settings - { - /*! default settings */ - Settings () - : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), - travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {} - - /*! initialize settings from API settings */ - Settings (const RTCBuildArguments& settings) - : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), - travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) - { - if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; - if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; - if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(static_cast(settings.sahBlockSize)); - if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; - if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; - if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost; - if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost )) intCost = settings.intersectionCost; - - minLeafSize = min(minLeafSize,maxLeafSize); - } - - Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf) - : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), - travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc) - { - minLeafSize = min(minLeafSize,maxLeafSize); - } - - public: - size_t branchingFactor; //!< branching factor of BVH to build - size_t maxDepth; //!< maximum depth of BVH to build - size_t logBlockSize; //!< log2 of blocksize for SAH heuristic - size_t minLeafSize; //!< minimum size of a leaf - size_t maxLeafSize; //!< maximum size of a leaf - float travCost; //!< estimated cost of one traversal step - float intCost; //!< estimated cost of one primitive intersection - size_t singleThreadThreshold; //!< threshold when we switch to single threaded build - size_t primrefarrayalloc; //!< builder uses prim ref array to allocate nodes and leaves when a subtree of that size is finished - }; - - /*! recursive state of builder */ - template - struct BuildRecordT - { - public: - __forceinline BuildRecordT () {} - - __forceinline BuildRecordT (size_t depth) - : depth(depth), alloc_barrier(false), prims(empty) {} - - __forceinline BuildRecordT (size_t depth, const Set& prims) - : depth(depth), alloc_barrier(false), prims(prims) {} - - __forceinline BBox3fa bounds() const { return prims.geomBounds; } - - __forceinline friend bool operator< (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() < b.prims.size(); } - __forceinline friend bool operator> (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() > b.prims.size(); } - - __forceinline size_t size() const { return prims.size(); } - - public: - size_t depth; //!< Depth of the root of this subtree. - bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes - Set prims; //!< The list of primitives. - }; - - template - struct DefaultCanCreateLeafFunc - { - __forceinline bool operator()(const PrimRef*, const Set&) const { return true; } - }; - - template - struct DefaultCanCreateLeafSplitFunc - { - __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { } - }; - - template - - class BuilderT - { - friend struct GeneralBVHBuilder; - - BuilderT (PrimRef* prims, - Heuristic& heuristic, - const CreateAllocFunc& createAlloc, - const CreateNodeFunc& createNode, - const UpdateNodeFunc& updateNode, - const CreateLeafFunc& createLeaf, - const CanCreateLeafFunc& canCreateLeaf, - const CanCreateLeafSplitFunc& canCreateLeafSplit, - const ProgressMonitor& progressMonitor, - const Settings& settings) : - cfg(settings), - prims(prims), - heuristic(heuristic), - createAlloc(createAlloc), - createNode(createNode), - updateNode(updateNode), - createLeaf(createLeaf), - canCreateLeaf(canCreateLeaf), - canCreateLeafSplit(canCreateLeafSplit), - progressMonitor(progressMonitor) - { - if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) - throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); - } - - const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc) - { - /* this should never occur but is a fatal error */ - if (current.depth > cfg.maxDepth) - throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); - - /* create leaf for few primitives */ - if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims)) - return createLeaf(prims,current.prims,alloc); - - /* fill all children by always splitting the largest one */ - ReductionTy values[MAX_BRANCHING_FACTOR]; - BuildRecord children[MAX_BRANCHING_FACTOR]; - size_t numChildren = 1; - children[0] = current; - do { - - /* find best child with largest bounding box area */ - size_t bestChild = -1; - size_t bestSize = 0; - for (size_t i=0; i bestSize) { - bestSize = children[i].prims.size(); - bestChild = i; - } - } - if (bestChild == (size_t)-1) break; - - /*! split best child into left and right child */ - BuildRecord left(current.depth+1); - BuildRecord right(current.depth+1); - if (!canCreateLeaf(prims,children[bestChild].prims)) { - canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims); - } else { - heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims); - } - - /* add new children left and right */ - children[bestChild] = children[numChildren-1]; - children[numChildren-1] = left; - children[numChildren+0] = right; - numChildren++; - - } while (numChildren < cfg.branchingFactor); - - /* set barrier for primrefarrayalloc */ - if (unlikely(current.size() > cfg.primrefarrayalloc)) - for (size_t i=0; i= 0) && (splitSAH >= 0))); - - /*! create a leaf node when threshold reached or SAH tells us to stop */ - if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { - heuristic.deterministic_order(current.prims); - return createLargeLeaf(current,alloc); - } - - /*! perform initial split */ - Set lprims,rprims; - heuristic.split(split,current.prims,lprims,rprims); - - /*! initialize child list with initial split */ - ReductionTy values[MAX_BRANCHING_FACTOR]; - BuildRecord children[MAX_BRANCHING_FACTOR]; - children[0] = BuildRecord(current.depth+1,lprims); - children[1] = BuildRecord(current.depth+1,rprims); - size_t numChildren = 2; - - /*! split until node is full or SAH tells us to stop */ - while (numChildren < cfg.branchingFactor) - { - /*! find best child to split */ - float bestArea = neg_inf; - ssize_t bestChild = -1; - for (size_t i=0; i bestArea) { - bestChild = i; - bestArea = halfArea(children[i].prims.geomBounds); - } - } - if (bestChild == -1) break; - - /* perform best found split */ - BuildRecord& brecord = children[bestChild]; - BuildRecord lrecord(current.depth+1); - BuildRecord rrecord(current.depth+1); - auto split = heuristic.find(brecord.prims,cfg.logBlockSize); - heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims); - children[bestChild ] = lrecord; - children[numChildren] = rrecord; - numChildren++; - } - - /* set barrier for primrefarrayalloc */ - if (unlikely(current.size() > cfg.primrefarrayalloc)) - for (size_t i=0; i()); - - /*! create an inner node */ - auto node = createNode(children,numChildren,alloc); - - /* spawn tasks */ - if (current.size() > cfg.singleThreadThreshold) - { - /*! parallel_for is faster than spawing sub-tasks */ - parallel_for(size_t(0), numChildren, [&] (const range& r) { // FIXME: no range here - for (size_t i=r.begin(); i - - __noinline static ReductionTy build(Heuristic& heuristic, - PrimRef* prims, - const Set& set, - CreateAllocFunc createAlloc, - CreateNodeFunc createNode, UpdateNodeFunc updateNode, - const CreateLeafFunc& createLeaf, - const ProgressMonitor& progressMonitor, - const Settings& settings) - { - typedef BuildRecordT BuildRecord; - - typedef BuilderT< - BuildRecord, - Heuristic, - Set, - PrimRef, - ReductionTy, - decltype(createAlloc()), - CreateAllocFunc, - CreateNodeFunc, - UpdateNodeFunc, - CreateLeafFunc, - DefaultCanCreateLeafFunc, - DefaultCanCreateLeafSplitFunc, - ProgressMonitor> Builder; - - /* instantiate builder */ - Builder builder(prims, - heuristic, - createAlloc, - createNode, - updateNode, - createLeaf, - DefaultCanCreateLeafFunc(), - DefaultCanCreateLeafSplitFunc(), - progressMonitor, - settings); - - /* build hierarchy */ - BuildRecord record(1,set); - const ReductionTy root = builder.recurse(record,nullptr,true); - _mm_mfence(); // to allow non-temporal stores during build - return root; - } - - template< - typename ReductionTy, - typename Heuristic, - typename Set, - typename PrimRef, - typename CreateAllocFunc, - typename CreateNodeFunc, - typename UpdateNodeFunc, - typename CreateLeafFunc, - typename CanCreateLeafFunc, - typename CanCreateLeafSplitFunc, - typename ProgressMonitor> - - __noinline static ReductionTy build(Heuristic& heuristic, - PrimRef* prims, - const Set& set, - CreateAllocFunc createAlloc, - CreateNodeFunc createNode, UpdateNodeFunc updateNode, - const CreateLeafFunc& createLeaf, - const CanCreateLeafFunc& canCreateLeaf, - const CanCreateLeafSplitFunc& canCreateLeafSplit, - const ProgressMonitor& progressMonitor, - const Settings& settings) - { - typedef BuildRecordT BuildRecord; - - typedef BuilderT< - BuildRecord, - Heuristic, - Set, - PrimRef, - ReductionTy, - decltype(createAlloc()), - CreateAllocFunc, - CreateNodeFunc, - UpdateNodeFunc, - CreateLeafFunc, - CanCreateLeafFunc, - CanCreateLeafSplitFunc, - ProgressMonitor> Builder; - - /* instantiate builder */ - Builder builder(prims, - heuristic, - createAlloc, - createNode, - updateNode, - createLeaf, - canCreateLeaf, - canCreateLeafSplit, - progressMonitor, - settings); - - /* build hierarchy */ - BuildRecord record(1,set); - const ReductionTy root = builder.recurse(record,nullptr,true); - _mm_mfence(); // to allow non-temporal stores during build - return root; - } - }; - - /* SAH builder that operates on an array of BuildRecords */ - struct BVHBuilderBinnedSAH - { - typedef PrimInfoRange Set; - typedef HeuristicArrayBinningSAH Heuristic; - typedef GeneralBVHBuilder::BuildRecordT BuildRecord; - typedef GeneralBVHBuilder::Settings Settings; - - /*! special builder that propagates reduction over the tree */ - template< - typename ReductionTy, - typename CreateAllocFunc, - typename CreateNodeFunc, - typename UpdateNodeFunc, - typename CreateLeafFunc, - typename ProgressMonitor> - - static ReductionTy build(CreateAllocFunc createAlloc, - CreateNodeFunc createNode, UpdateNodeFunc updateNode, - const CreateLeafFunc& createLeaf, - const ProgressMonitor& progressMonitor, - PrimRef* prims, const PrimInfo& pinfo, - const Settings& settings) - { - Heuristic heuristic(prims); - return GeneralBVHBuilder::build( - heuristic, - prims, - PrimInfoRange(0,pinfo.size(),pinfo), - createAlloc, - createNode, - updateNode, - createLeaf, - progressMonitor, - settings); - } - - /*! special builder that propagates reduction over the tree */ - template< - typename ReductionTy, - typename CreateAllocFunc, - typename CreateNodeFunc, - typename UpdateNodeFunc, - typename CreateLeafFunc, - typename CanCreateLeafFunc, - typename CanCreateLeafSplitFunc, - typename ProgressMonitor> - - static ReductionTy build(CreateAllocFunc createAlloc, - CreateNodeFunc createNode, UpdateNodeFunc updateNode, - const CreateLeafFunc& createLeaf, - const CanCreateLeafFunc& canCreateLeaf, - const CanCreateLeafSplitFunc& canCreateLeafSplit, - const ProgressMonitor& progressMonitor, - PrimRef* prims, const PrimInfo& pinfo, - const Settings& settings) - { - Heuristic heuristic(prims); - return GeneralBVHBuilder::build( - heuristic, - prims, - PrimInfoRange(0,pinfo.size(),pinfo), - createAlloc, - createNode, - updateNode, - createLeaf, - canCreateLeaf, - canCreateLeafSplit, - progressMonitor, - settings); - } - }; - - /* Spatial SAH builder that operates on an double-buffered array of BuildRecords */ - struct BVHBuilderBinnedFastSpatialSAH - { - typedef PrimInfoExtRange Set; - typedef Split2,SpatialBinSplit > Split; - typedef GeneralBVHBuilder::BuildRecordT BuildRecord; - typedef GeneralBVHBuilder::Settings Settings; - - static const unsigned int GEOMID_MASK = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; - static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); - - template - struct CreateLeafExt - { - __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf) - : userCreateLeaf(userCreateLeaf) {} - - // __noinline is workaround for ICC2016 compiler bug - template - __noinline ReductionTy operator() (PrimRef* prims, const range& range, Allocator alloc) const - { - for (size_t i=range.begin(); i - - static ReductionTy build(CreateAllocFunc createAlloc, - CreateNodeFunc createNode, - UpdateNodeFunc updateNode, - const CreateLeafFunc& createLeaf, - SplitPrimitiveFunc splitPrimitive, - ProgressMonitor progressMonitor, - PrimRef* prims, - const size_t extSize, - const PrimInfo& pinfo, - const Settings& settings) - { - typedef HeuristicArraySpatialSAH Heuristic; - Heuristic heuristic(splitPrimitive,prims,pinfo); - - /* calculate total surface area */ // FIXME: this sum is not deterministic - const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range& r) -> double { - - double A = 0.0f; - for (size_t i=r.begin(); i()); - - - /* calculate maximum number of spatial splits per primitive */ - const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1; - const float f = 10.0f; - - const float invA = 1.0f / A; - parallel_for( size_t(0), pinfo.size(), [&](const range& r) { - - for (size_t i=r.begin(); i( - heuristic, - prims, - PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), - createAlloc, - createNode, - updateNode, - CreateLeafExt(createLeaf), - progressMonitor, - settings); - } - }; - - /* Open/Merge SAH builder that operates on an array of BuildRecords */ - struct BVHBuilderBinnedOpenMergeSAH - { - static const size_t NUM_OBJECT_BINS_HQ = 32; - typedef PrimInfoExtRange Set; - typedef BinSplit Split; - typedef GeneralBVHBuilder::BuildRecordT BuildRecord; - typedef GeneralBVHBuilder::Settings Settings; - - /*! special builder that propagates reduction over the tree */ - template< - typename ReductionTy, - typename BuildRef, - typename CreateAllocFunc, - typename CreateNodeFunc, - typename UpdateNodeFunc, - typename CreateLeafFunc, - typename NodeOpenerFunc, - typename ProgressMonitor> - - static ReductionTy build(CreateAllocFunc createAlloc, - CreateNodeFunc createNode, - UpdateNodeFunc updateNode, - const CreateLeafFunc& createLeaf, - NodeOpenerFunc nodeOpenerFunc, - ProgressMonitor progressMonitor, - BuildRef* prims, - const size_t extSize, - const PrimInfo& pinfo, - const Settings& settings) - { - typedef HeuristicArrayOpenMergeSAH Heuristic; - Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor); - - return GeneralBVHBuilder::build( - heuristic, - prims, - PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), - createAlloc, - createNode, - updateNode, - createLeaf, - progressMonitor, - settings); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h deleted file mode 100644 index a4d3b68e46..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h +++ /dev/null @@ -1,972 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "priminfo.h" -#include "../../common/algorithms/parallel_reduce.h" -#include "../../common/algorithms/parallel_partition.h" - -namespace embree -{ - namespace isa - { - /*! mapping into bins */ - template - struct BinMapping - { - public: - __forceinline BinMapping() {} - - /*! calculates the mapping */ - __forceinline BinMapping(size_t N, const BBox3fa& centBounds) - { - num = min(BINS,size_t(4.0f + 0.05f*N)); - assert(num >= 1); - const vfloat4 eps = 1E-34f; - const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); - scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); - ofs = (vfloat4) centBounds.lower; - } - - /*! calculates the mapping */ - __forceinline BinMapping(const BBox3fa& centBounds) - { - num = BINS; - const vfloat4 eps = 1E-34f; - const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); - scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); - ofs = (vfloat4) centBounds.lower; - } - - /*! calculates the mapping */ - template - __forceinline BinMapping(const PrimInfo& pinfo) - { - const vfloat4 eps = 1E-34f; - num = min(BINS,size_t(4.0f + 0.05f*pinfo.size())); - const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size()); - scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); - ofs = (vfloat4) pinfo.centBounds.lower; - } - - /*! returns number of bins */ - __forceinline size_t size() const { return num; } - - /*! slower but safe binning */ - __forceinline Vec3ia bin(const Vec3fa& p) const - { - const vint4 i = floori((vfloat4(p)-ofs)*scale); -#if 1 - assert(i[0] >= 0 && (size_t)i[0] < num); - assert(i[1] >= 0 && (size_t)i[1] < num); - assert(i[2] >= 0 && (size_t)i[2] < num); - return Vec3ia(i); -#else - return Vec3ia(clamp(i,vint4(0),vint4(num-1))); -#endif - } - - /*! faster but unsafe binning */ - __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const { - return Vec3ia(floori((vfloat4(p)-ofs)*scale)); - } - - /*! faster but unsafe binning */ - template - __forceinline Vec3ia bin_unsafe(const PrimRef& p) const { - return bin_unsafe(p.binCenter()); - } - - /*! faster but unsafe binning */ - template - __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const { - return bin_unsafe(binBoundsAndCenter.binCenter(p)); - } - - template - __forceinline bool bin_unsafe(const PrimRef& ref, - const vint4& vSplitPos, - const vbool4& splitDimMask) const // FIXME: rename to isLeft - { - return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask); - } - /*! calculates left spatial position of bin */ - __forceinline float pos(const size_t bin, const size_t dim) const { - return madd(float(bin),1.0f / scale[dim],ofs[dim]); - } - - /*! returns true if the mapping is invalid in some dimension */ - __forceinline bool invalid(const size_t dim) const { - return scale[dim] == 0.0f; - } - - /*! stream output */ - friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) { - return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}"; - } - - public: - size_t num; - vfloat4 ofs,scale; //!< linear function that maps to bin ID - }; - - /*! stores all information to perform some split */ - template - struct BinSplit - { - enum - { - SPLIT_OBJECT = 0, - SPLIT_FALLBACK = 1, - SPLIT_ENFORCE = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already - SPLIT_TEMPORAL = 2, - SPLIT_GEOMID = 3, - }; - - /*! construct an invalid split by default */ - __forceinline BinSplit() - : sah(inf), dim(-1), pos(0), data(0) {} - - __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0) - : sah(sah), dim(dim), fpos(fpos), data(data) {} - - /*! constructs specified split */ - __forceinline BinSplit(float sah, int dim, int pos, const BinMapping& mapping) - : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {} - - /*! tests if this split is valid */ - __forceinline bool valid() const { return dim != -1; } - - /*! calculates surface area heuristic for performing the split */ - __forceinline float splitSAH() const { return sah; } - - /*! stream output */ - friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) { - return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}"; - } - - public: - float sah; //!< SAH cost of the split - int dim; //!< split dimension - union { int pos; float fpos; }; //!< bin index for splitting - unsigned int data; //!< extra optional split data - BinMapping mapping; //!< mapping into bins - }; - - /*! stores extended information about the split */ - template - struct SplitInfoT - { - - __forceinline SplitInfoT () {} - - __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds) - : leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {} - - public: - size_t leftCount,rightCount; - BBox leftBounds,rightBounds; - }; - - typedef SplitInfoT SplitInfo; - typedef SplitInfoT SplitInfo2; - - /*! stores all binning information */ - template - struct __aligned(64) BinInfoT - { - typedef BinSplit Split; - typedef vbool4 vbool; - typedef vint4 vint; - typedef vfloat4 vfloat; - - __forceinline BinInfoT() { - } - - __forceinline BinInfoT(EmptyTy) { - clear(); - } - - /*! bin access function */ - __forceinline BBox &bounds(const size_t binID, const size_t dimID) { return _bounds[binID][dimID]; } - __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; } - - __forceinline unsigned int &counts(const size_t binID, const size_t dimID) { return _counts[binID][dimID]; } - __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; } - - __forceinline vuint4 &counts(const size_t binID) { return _counts[binID]; } - __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; } - - /*! clears the bin info */ - __forceinline void clear() - { - for (size_t i=0; i& mapping) - { - if (unlikely(N == 0)) return; - size_t i; - for (i=0; i(bin0); bounds(b00,0).extend(prim0); - const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0); - const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0); - const unsigned int s0 = (unsigned int)prims[i+0].size(); - counts(b00,0)+=s0; - counts(b01,1)+=s0; - counts(b02,2)+=s0; - - /*! increase bounds of bins for odd primitive */ - const unsigned int b10 = extract<0>(bin1); bounds(b10,0).extend(prim1); - const unsigned int b11 = extract<1>(bin1); bounds(b11,1).extend(prim1); - const unsigned int b12 = extract<2>(bin1); bounds(b12,2).extend(prim1); - const unsigned int s1 = (unsigned int)prims[i+1].size(); - counts(b10,0)+=s1; - counts(b11,1)+=s1; - counts(b12,2)+=s1; - } - /*! for uneven number of primitives */ - if (i < N) - { - /*! map primitive to bin */ - BBox prim0; Vec3fa center0; - prims[i].binBoundsAndCenter(prim0,center0); - const vint4 bin0 = (vint4)mapping.bin(center0); - - /*! increase bounds of bins */ - const unsigned int s0 = (unsigned int)prims[i].size(); - const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); - const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); - const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); - } - } - - /*! bins an array of primitives */ - template - __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) - { - if (N == 0) return; - - size_t i; - for (i=0; i(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); - const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); - const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); - - /*! increase bounds of bins for odd primitive */ - const unsigned int s1 = prims[i+1].size(); - const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1); - const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1); - const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1); - } - - /*! for uneven number of primitives */ - if (i < N) - { - /*! map primitive to bin */ - BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); - const vint4 bin0 = (vint4)mapping.bin(center0); - - /*! increase bounds of bins */ - const unsigned int s0 = prims[i+0].size(); - const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); - const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); - const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); - } - } - - __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping& mapping) { - bin(prims+begin,end-begin,mapping); - } - - template - __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) { - bin(prims+begin,end-begin,mapping,binBoundsAndCenter); - } - - /*! merges in other binning information */ - __forceinline void merge (const BinInfoT& other, size_t numBins) - { - - for (size_t i=0; i& mapping, const size_t blocks_shift) const - { - /* sweep from right to left and compute parallel prefix of merged bounds */ - vfloat4 rAreas[BINS]; - vuint4 rCounts[BINS]; - vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty; - for (size_t i=mapping.size()-1; i>0; i--) - { - count += counts(i); - rCounts[i] = count; - bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx); - by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by); - bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz); - rAreas[i][3] = 0.0f; - } - /* sweep from left to right and compute SAH */ - vuint4 blocks_add = (1 << blocks_shift)-1; - vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; - count = 0; bx = empty; by = empty; bz = empty; - for (size_t i=1; i> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions. - const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); - const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); - //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); - - vbestPos = select(sah < vbestSAH,ii ,vbestPos); - vbestSAH = select(sah < vbestSAH,sah,vbestSAH); - } - - /* find best dimension */ - float bestSAH = inf; - int bestDim = -1; - int bestPos = 0; - for (int dim=0; dim<3; dim++) - { - /* ignore zero sized dimensions */ - if (unlikely(mapping.invalid(dim))) - continue; - - /* test if this is a better dimension */ - if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { - bestDim = dim; - bestPos = vbestPos[dim]; - bestSAH = vbestSAH[dim]; - } - } - return Split(bestSAH,bestDim,bestPos,mapping); - } - - /*! calculates extended split information */ - __forceinline void getSplitInfo(const BinMapping& mapping, const Split& split, SplitInfoT& info) const - { - if (split.dim == -1) { - new (&info) SplitInfoT(0,empty,0,empty); - return; - } - - size_t leftCount = 0; - BBox leftBounds = empty; - for (size_t i=0; i<(size_t)split.pos; i++) { - leftCount += counts(i,split.dim); - leftBounds.extend(bounds(i,split.dim)); - } - size_t rightCount = 0; - BBox rightBounds = empty; - for (size_t i=split.pos; i(leftCount,leftBounds,rightCount,rightBounds); - } - - /*! gets the number of primitives left of the split */ - __forceinline size_t getLeftCount(const BinMapping& mapping, const Split& split) const - { - if (unlikely(split.dim == -1)) return -1; - - size_t leftCount = 0; - for (size_t i = 0; i < (size_t)split.pos; i++) { - leftCount += counts(i, split.dim); - } - return leftCount; - } - - /*! gets the number of primitives right of the split */ - __forceinline size_t getRightCount(const BinMapping& mapping, const Split& split) const - { - if (unlikely(split.dim == -1)) return -1; - - size_t rightCount = 0; - for (size_t i = (size_t)split.pos; i - struct BinMapping<16> - { - public: - __forceinline BinMapping() {} - - /*! calculates the mapping */ - template - __forceinline BinMapping(const PrimInfo& pinfo) - { - num = 16; - const vfloat4 eps = 1E-34f; - const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size()); - scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); - ofs = (vfloat4) pinfo.centBounds.lower; - scale16 = scale; - ofs16 = ofs; - } - - /*! returns number of bins */ - __forceinline size_t size() const { return num; } - - __forceinline vint16 bin16(const Vec3fa& p) const { - return vint16(vint4(floori((vfloat4(p)-ofs)*scale))); - } - - __forceinline vint16 bin16(const vfloat16& p) const { - return floori((p-ofs16)*scale16); - } - - __forceinline int bin_unsafe(const PrimRef& ref, - const vint16& vSplitPos, - const vbool16& splitDimMask) const // FIXME: rename to isLeft - { - const vfloat16 lower(*(vfloat4*)&ref.lower); - const vfloat16 upper(*(vfloat4*)&ref.upper); - const vfloat16 p = lower + upper; - const vint16 i = floori((p-ofs16)*scale16); - return lt(splitDimMask,i,vSplitPos); - } - - /*! returns true if the mapping is invalid in some dimension */ - __forceinline bool invalid(const size_t dim) const { - return scale[dim] == 0.0f; - } - - public: - size_t num; - vfloat4 ofs,scale; //!< linear function that maps to bin ID - vfloat16 ofs16,scale16; //!< linear function that maps to bin ID - }; - - /* 16 bins in-register binner */ - template - struct __aligned(64) BinInfoT<16,PrimRef,BBox3fa> - { - typedef BinSplit<16> Split; - typedef vbool16 vbool; - typedef vint16 vint; - typedef vfloat16 vfloat; - - __forceinline BinInfoT() { - } - - __forceinline BinInfoT(EmptyTy) { - clear(); - } - - /*! clears the bin info */ - __forceinline void clear() - { - lower[0] = lower[1] = lower[2] = pos_inf; - upper[0] = upper[1] = upper[2] = neg_inf; - count[0] = count[1] = count[2] = 0; - } - - - static __forceinline vfloat16 prefix_area_rl(const vfloat16 min_x, - const vfloat16 min_y, - const vfloat16 min_z, - const vfloat16 max_x, - const vfloat16 max_y, - const vfloat16 max_z) - { - const vfloat16 r_min_x = reverse_prefix_min(min_x); - const vfloat16 r_min_y = reverse_prefix_min(min_y); - const vfloat16 r_min_z = reverse_prefix_min(min_z); - const vfloat16 r_max_x = reverse_prefix_max(max_x); - const vfloat16 r_max_y = reverse_prefix_max(max_y); - const vfloat16 r_max_z = reverse_prefix_max(max_z); - const vfloat16 dx = r_max_x - r_min_x; - const vfloat16 dy = r_max_y - r_min_y; - const vfloat16 dz = r_max_z - r_min_z; - const vfloat16 area_rl = madd(dx,dy,madd(dx,dz,dy*dz)); - return area_rl; - } - - static __forceinline vfloat16 prefix_area_lr(const vfloat16 min_x, - const vfloat16 min_y, - const vfloat16 min_z, - const vfloat16 max_x, - const vfloat16 max_y, - const vfloat16 max_z) - { - const vfloat16 r_min_x = prefix_min(min_x); - const vfloat16 r_min_y = prefix_min(min_y); - const vfloat16 r_min_z = prefix_min(min_z); - const vfloat16 r_max_x = prefix_max(max_x); - const vfloat16 r_max_y = prefix_max(max_y); - const vfloat16 r_max_z = prefix_max(max_z); - const vfloat16 dx = r_max_x - r_min_x; - const vfloat16 dy = r_max_y - r_min_y; - const vfloat16 dz = r_max_z - r_min_z; - const vfloat16 area_lr = madd(dx,dy,madd(dx,dz,dy*dz)); - return area_lr; - } - - - /*! bins an array of primitives */ - __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<16>& mapping) - { - if (unlikely(N == 0)) return; - - const vfloat16 init_min(pos_inf); - const vfloat16 init_max(neg_inf); - - vfloat16 min_x0,min_x1,min_x2; - vfloat16 min_y0,min_y1,min_y2; - vfloat16 min_z0,min_z1,min_z2; - vfloat16 max_x0,max_x1,max_x2; - vfloat16 max_y0,max_y1,max_y2; - vfloat16 max_z0,max_z1,max_z2; - vuint16 count0,count1,count2; - - min_x0 = init_min; - min_x1 = init_min; - min_x2 = init_min; - min_y0 = init_min; - min_y1 = init_min; - min_y2 = init_min; - min_z0 = init_min; - min_z1 = init_min; - min_z2 = init_min; - - max_x0 = init_max; - max_x1 = init_max; - max_x2 = init_max; - max_y0 = init_max; - max_y1 = init_max; - max_y2 = init_max; - max_z0 = init_max; - max_z1 = init_max; - max_z2 = init_max; - - count0 = zero; - count1 = zero; - count2 = zero; - - const vint16 step16(step); - size_t i; - for (i=0; i(binA); - const vint16 bin1 = shuffle<1>(binA); - const vint16 bin2 = shuffle<2>(binA); - - const vbool16 m_update_x = step16 == bin0; - const vbool16 m_update_y = step16 == bin1; - const vbool16 m_update_z = step16 == bin2; - - assert(popcnt((size_t)m_update_x) == 1); - assert(popcnt((size_t)m_update_y) == 1); - assert(popcnt((size_t)m_update_z) == 1); - - min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x); - min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y); - min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z); - // ------------------------------------------------------------------------ - max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x); - max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y); - max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z); - // ------------------------------------------------------------------------ - min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x); - min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y); - min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z); - // ------------------------------------------------------------------------ - max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x); - max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y); - max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z); - // ------------------------------------------------------------------------ - min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x); - min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y); - min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z); - // ------------------------------------------------------------------------ - max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x); - max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y); - max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z); - // ------------------------------------------------------------------------ - count0 = mask_add(m_update_x,count0,count0,vuint16(1)); - count1 = mask_add(m_update_y,count1,count1,vuint16(1)); - count2 = mask_add(m_update_z,count2,count2,vuint16(1)); - } - - - /* B */ - { - const vfloat16 b_min_x = prims[i+1].lower.x; - const vfloat16 b_min_y = prims[i+1].lower.y; - const vfloat16 b_min_z = prims[i+1].lower.z; - const vfloat16 b_max_x = prims[i+1].upper.x; - const vfloat16 b_max_y = prims[i+1].upper.y; - const vfloat16 b_max_z = prims[i+1].upper.z; - - const vint16 bin0 = shuffle<0>(binB); - const vint16 bin1 = shuffle<1>(binB); - const vint16 bin2 = shuffle<2>(binB); - - const vbool16 m_update_x = step16 == bin0; - const vbool16 m_update_y = step16 == bin1; - const vbool16 m_update_z = step16 == bin2; - - assert(popcnt((size_t)m_update_x) == 1); - assert(popcnt((size_t)m_update_y) == 1); - assert(popcnt((size_t)m_update_z) == 1); - - min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x); - min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y); - min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z); - // ------------------------------------------------------------------------ - max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x); - max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y); - max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z); - // ------------------------------------------------------------------------ - min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x); - min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y); - min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z); - // ------------------------------------------------------------------------ - max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x); - max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y); - max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z); - // ------------------------------------------------------------------------ - min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x); - min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y); - min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z); - // ------------------------------------------------------------------------ - max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x); - max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y); - max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z); - // ------------------------------------------------------------------------ - count0 = mask_add(m_update_x,count0,count0,vuint16(1)); - count1 = mask_add(m_update_y,count1,count1,vuint16(1)); - count2 = mask_add(m_update_z,count2,count2,vuint16(1)); - } - - } - - if (i < N) - { - const BBox3fa prim0 = prims[i].bounds(); - const vfloat16 center0 = vfloat16((vfloat4)prim0.lower) + vfloat16((vfloat4)prim0.upper); - const vint16 bin = mapping.bin16(center0); - - const vfloat16 b_min_x = prims[i].lower.x; - const vfloat16 b_min_y = prims[i].lower.y; - const vfloat16 b_min_z = prims[i].lower.z; - const vfloat16 b_max_x = prims[i].upper.x; - const vfloat16 b_max_y = prims[i].upper.y; - const vfloat16 b_max_z = prims[i].upper.z; - - const vint16 bin0 = shuffle<0>(bin); - const vint16 bin1 = shuffle<1>(bin); - const vint16 bin2 = shuffle<2>(bin); - - const vbool16 m_update_x = step16 == bin0; - const vbool16 m_update_y = step16 == bin1; - const vbool16 m_update_z = step16 == bin2; - - assert(popcnt((size_t)m_update_x) == 1); - assert(popcnt((size_t)m_update_y) == 1); - assert(popcnt((size_t)m_update_z) == 1); - - min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x); - min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y); - min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z); - // ------------------------------------------------------------------------ - max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x); - max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y); - max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z); - // ------------------------------------------------------------------------ - min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x); - min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y); - min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z); - // ------------------------------------------------------------------------ - max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x); - max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y); - max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z); - // ------------------------------------------------------------------------ - min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x); - min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y); - min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z); - // ------------------------------------------------------------------------ - max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x); - max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y); - max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z); - // ------------------------------------------------------------------------ - count0 = mask_add(m_update_x,count0,count0,vuint16(1)); - count1 = mask_add(m_update_y,count1,count1,vuint16(1)); - count2 = mask_add(m_update_z,count2,count2,vuint16(1)); - } - - lower[0] = Vec3vf16( min_x0, min_y0, min_z0 ); - lower[1] = Vec3vf16( min_x1, min_y1, min_z1 ); - lower[2] = Vec3vf16( min_x2, min_y2, min_z2 ); - - upper[0] = Vec3vf16( max_x0, max_y0, max_z0 ); - upper[1] = Vec3vf16( max_x1, max_y1, max_z1 ); - upper[2] = Vec3vf16( max_x2, max_y2, max_z2 ); - - count[0] = count0; - count[1] = count1; - count[2] = count2; - } - - __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<16>& mapping) { - bin(prims+begin,end-begin,mapping); - } - - /*! merges in other binning information */ - __forceinline void merge (const BinInfoT& other, size_t numBins) - { - for (size_t i=0; i<3; i++) - { - lower[i] = min(lower[i],other.lower[i]); - upper[i] = max(upper[i],other.upper[i]); - count[i] += other.count[i]; - } - } - - /*! reducesr binning information */ - static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b) - { - BinInfoT c; - for (size_t i=0; i<3; i++) - { - c.counts[i] = a.counts[i] + b.counts[i]; - c.lower[i] = min(a.lower[i],b.lower[i]); - c.upper[i] = max(a.upper[i],b.upper[i]); - } - return c; - } - - /*! finds the best split by scanning binning information */ - __forceinline Split best(const BinMapping<16>& mapping, const size_t blocks_shift) const - { - /* find best dimension */ - float bestSAH = inf; - int bestDim = -1; - int bestPos = 0; - const vuint16 blocks_add = (1 << blocks_shift)-1; - const vfloat16 inf(pos_inf); - for (size_t dim=0; dim<3; dim++) - { - /* ignore zero sized dimensions */ - if (unlikely(mapping.invalid(dim))) - continue; - - const vfloat16 rArea16 = prefix_area_rl(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z); - const vfloat16 lArea16 = prefix_area_lr(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z); - const vuint16 lCount16 = prefix_sum(count[dim]); - const vuint16 rCount16 = reverse_prefix_sum(count[dim]); - - /* compute best split in this dimension */ - const vfloat16 leftArea = lArea16; - const vfloat16 rightArea = align_shift_right<1>(zero,rArea16); - const vuint16 lC = lCount16; - const vuint16 rC = align_shift_right<1>(zero,rCount16); - const vuint16 leftCount = ( lC + blocks_add) >> blocks_shift; - const vuint16 rightCount = ( rC + blocks_add) >> blocks_shift; - const vbool16 valid = (leftArea < inf) & (rightArea < inf) & vbool16(0x7fff); // handles inf entries - const vfloat16 sah = select(valid,madd(leftArea,vfloat16(leftCount),rightArea*vfloat16(rightCount)),vfloat16(pos_inf)); - /* test if this is a better dimension */ - if (any(sah < vfloat16(bestSAH))) - { - const size_t index = select_min(sah); - assert(index < 15); - assert(sah[index] < bestSAH); - bestDim = dim; - bestPos = index+1; - bestSAH = sah[index]; - } - } - - return Split(bestSAH,bestDim,bestPos,mapping); - - } - - /*! calculates extended split information */ - __forceinline void getSplitInfo(const BinMapping<16>& mapping, const Split& split, SplitInfo& info) const - { - if (split.dim == -1) { - new (&info) SplitInfo(0,empty,0,empty); - return; - } - // FIXME: horizontal reduction! - - size_t leftCount = 0; - BBox3fa leftBounds = empty; - for (size_t i=0; i<(size_t)split.pos; i++) { - leftCount += count[split.dim][i]; - Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]); - Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]); - leftBounds.extend(BBox3fa(bounds_lower,bounds_upper)); - } - size_t rightCount = 0; - BBox3fa rightBounds = empty; - for (size_t i=split.pos; i& mapping, const Split& split) const - { - if (unlikely(split.dim == -1)) return -1; - - size_t leftCount = 0; - for (size_t i = 0; i < (size_t)split.pos; i++) { - leftCount += count[split.dim][i]; - } - return leftCount; - } - - /*! gets the number of primitives right of the split */ - __forceinline size_t getRightCount(const BinMapping<16>& mapping, const Split& split) const - { - if (unlikely(split.dim == -1)) return -1; - - size_t rightCount = 0; - for (size_t i = (size_t)split.pos; i - __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping) - { - if (likely(end-begin < parallelThreshold)) { - binner.bin(prims,begin,end,mapping); - } else { - binner = parallel_reduce(begin,end,blockSize,binner, - [&](const range& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, - [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); - } - } - - template - __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) - { - if (likely(end-begin < parallelThreshold)) { - binner.bin(prims,begin,end,mapping,binBoundsAndCenter); - } else { - binner = parallel_reduce(begin,end,blockSize,binner, - [&](const range& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, - [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); - } - } - - template - __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping) - { - if (!parallel) { - binner.bin(prims,begin,end,mapping); - } else { - binner = parallel_reduce(begin,end,blockSize,binner, - [&](const range& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, - [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); - } - } - - template - __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) - { - if (!parallel) { - binner.bin(prims,begin,end,mapping,binBoundsAndCenter); - } else { - binner = parallel_reduce(begin,end,blockSize,binner, - [&](const range& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, - [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h deleted file mode 100644 index a4c272f015..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h +++ /dev/null @@ -1,205 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "heuristic_binning.h" - -namespace embree -{ - namespace isa - { - struct PrimInfoRange : public CentGeomBBox3fa, public range - { - __forceinline PrimInfoRange () { - } - - __forceinline PrimInfoRange(const PrimInfo& pinfo) - : CentGeomBBox3fa(pinfo), range(pinfo.begin,pinfo.end) {} - - __forceinline PrimInfoRange(EmptyTy) - : CentGeomBBox3fa(EmptyTy()), range(0,0) {} - - __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) - : CentGeomBBox3fa(centGeomBounds), range(begin,end) {} - - __forceinline float leafSAH() const { - return expectedApproxHalfArea(geomBounds)*float(size()); - } - - __forceinline float leafSAH(size_t block_shift) const { - return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<> block_shift); - } - }; - - /*! Performs standard object binning */ - template - struct HeuristicArrayBinningSAH - { - typedef BinSplit Split; - typedef BinInfoT Binner; - typedef range Set; - -#if defined(__AVX512ER__) // KNL - static const size_t PARALLEL_THRESHOLD = 4*768; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 768; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 768; -#else - static const size_t PARALLEL_THRESHOLD = 3 * 1024; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; -#endif - __forceinline HeuristicArrayBinningSAH () - : prims(nullptr) {} - - /*! remember prim array */ - __forceinline HeuristicArrayBinningSAH (PrimRef* prims) - : prims(prims) {} - - /*! finds the best split */ - __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize) - { - if (likely(pinfo.size() < PARALLEL_THRESHOLD)) - return find_template(pinfo,logBlockSize); - else - return find_template(pinfo,logBlockSize); - } - - template - __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize) - { - Binner binner(empty); - const BinMapping mapping(pinfo); - bin_serial_or_parallel(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping); - return binner.best(mapping,logBlockSize); - } - - /*! array partitioning */ - __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) - { - if (likely(pinfo.size() < PARALLEL_THRESHOLD)) - split_template(split,pinfo,linfo,rinfo); - else - split_template(split,pinfo,linfo,rinfo); - } - - template - __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) - { - if (!split.valid()) { - deterministic_order(set); - return splitFallback(set,lset,rset); - } - - const size_t begin = set.begin(); - const size_t end = set.end(); - CentGeomBBox3fa local_left(empty); - CentGeomBBox3fa local_right(empty); - const unsigned int splitPos = split.pos; - const unsigned int splitDim = split.dim; - const unsigned int splitDimMask = (unsigned int)1 << splitDim; - - const typename Binner::vint vSplitPos(splitPos); - const typename Binner::vbool vSplitMask(splitDimMask); - auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; - - size_t center = 0; - if (!parallel) - center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft, - [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); - else - center = parallel_partitioning( - prims,begin,end,EmptyTy(),local_left,local_right,isLeft, - [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, - [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, - PARALLEL_PARTITION_BLOCK_SIZE); - - new (&lset) PrimInfoRange(begin,center,local_left); - new (&rset) PrimInfoRange(center,end,local_right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); - } - - void deterministic_order(const PrimInfoRange& pinfo) - { - /* required as parallel partition destroys original primitive order */ - std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]); - } - - void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) - { - const size_t begin = pinfo.begin(); - const size_t end = pinfo.end(); - const size_t center = (begin + end)/2; - - CentGeomBBox3fa left(empty); - for (size_t i=begin; i& range, PrimInfoRange& linfo, PrimInfoRange& rinfo) - { - assert(range.size() > 1); - CentGeomBBox3fa left(empty); - CentGeomBBox3fa right(empty); - unsigned int geomID = prims[range.begin()].geomID(); - size_t center = serial_partitioning(prims,range.begin(),range.end(),left,right, - [&] ( const PrimRef& prim ) { return prim.geomID() == geomID; }, - [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); }); - - new (&linfo) PrimInfoRange(range.begin(),center,left); - new (&rinfo) PrimInfoRange(center,range.end(),right); - } - - private: - PrimRef* const prims; - }; - - /*! Performs standard object binning */ - template - struct HeuristicArrayBinningMB - { - typedef BinSplit Split; - typedef typename PrimRefMB::BBox BBox; - typedef BinInfoT ObjectBinner; - static const size_t PARALLEL_THRESHOLD = 3 * 1024; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; - - /*! finds the best split */ - const Split find(const SetMB& set, const size_t logBlockSize) - { - ObjectBinner binner(empty); - const BinMapping mapping(set.size(),set.centBounds); - bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping); - Split osplit = binner.best(mapping,logBlockSize); - osplit.sah *= set.time_range.size(); - if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split - return osplit; - } - - /*! array partitioning */ - __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - PrimInfoMB left = empty; - PrimInfoMB right = empty; - const vint4 vSplitPos(split.pos); - const vbool4 vSplitMask(1 << split.dim); - auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); }; - auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; - auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; - size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); - new (&lset) SetMB(left, set.prims,range(begin,center),set.time_range); - new (&rset) SetMB(right,set.prims,range(center,end ),set.time_range); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h deleted file mode 100644 index 1370244586..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h +++ /dev/null @@ -1,302 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "heuristic_binning.h" - -namespace embree -{ - namespace isa - { - /*! Performs standard object binning */ - template - struct UnalignedHeuristicArrayBinningSAH - { - typedef BinSplit Split; - typedef BinInfoT Binner; - typedef range Set; - - __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required? - : scene(nullptr), prims(nullptr) {} - - /*! remember prim array */ - __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims) - : scene(scene), prims(prims) {} - - const LinearSpace3fa computeAlignedSpace(const range& set) - { - Vec3fa axis(0,0,1); - uint64_t bestGeomPrimID = -1; - - /*! find curve with minimum ID that defines valid direction */ - for (size_t i=set.begin(); i= bestGeomPrimID) continue; - const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID); - if (sqr_length(axis1) > 1E-18f) { - axis = normalize(axis1); - bestGeomPrimID = geomprimID; - } - } - return frame(axis).transposed(); - } - - const PrimInfo computePrimInfo(const range& set, const LinearSpace3fa& space) - { - auto computeBounds = [&](const range& r) -> CentGeomBBox3fa - { - CentGeomBBox3fa bounds(empty); - for (size_t i=r.begin(); iget(prims[i].geomID()); - bounds.extend(mesh->vbounds(space,prims[i].primID())); - } - return bounds; - }; - - const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096), - CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2); - - return PrimInfo(set.begin(),set.end(),bounds); - } - - struct BinBoundsAndCenter - { - __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space) - : scene(scene), space(space) {} - - /*! returns center for binning */ - __forceinline Vec3fa binCenter(const PrimRef& ref) const - { - Geometry* mesh = (Geometry*) scene->get(ref.geomID()); - BBox3fa bounds = mesh->vbounds(space,ref.primID()); - return embree::center2(bounds); - } - - /*! returns bounds and centroid used for binning */ - __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const - { - Geometry* mesh = (Geometry*) scene->get(ref.geomID()); - BBox3fa bounds = mesh->vbounds(space,ref.primID()); - bounds_o = bounds; - center_o = embree::center2(bounds); - } - - private: - Scene* scene; - const LinearSpace3fa space; - }; - - /*! finds the best split */ - __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space) - { - if (likely(pinfo.size() < 10000)) - return find_template(pinfo,logBlockSize,space); - else - return find_template(pinfo,logBlockSize,space); - } - - /*! finds the best split */ - template - const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space) - { - Binner binner(empty); - const BinMapping mapping(set); - BinBoundsAndCenter binBoundsAndCenter(scene,space); - bin_serial_or_parallel(binner,prims,set.begin(),set.end(),size_t(4096),mapping,binBoundsAndCenter); - return binner.best(mapping,logBlockSize); - } - - /*! array partitioning */ - __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) - { - if (likely(set.size() < 10000)) - split_template(split,space,set,lset,rset); - else - split_template(split,space,set,lset,rset); - } - - /*! array partitioning */ - template - __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) - { - if (!split.valid()) { - deterministic_order(set); - return splitFallback(set,lset,rset); - } - - const size_t begin = set.begin(); - const size_t end = set.end(); - CentGeomBBox3fa local_left(empty); - CentGeomBBox3fa local_right(empty); - const int splitPos = split.pos; - const int splitDim = split.dim; - BinBoundsAndCenter binBoundsAndCenter(scene,space); - - size_t center = 0; - if (likely(set.size() < 10000)) - center = serial_partitioning(prims,begin,end,local_left,local_right, - [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, - [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); - else - center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right, - [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, - [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, - [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, - 128); - - new (&lset) PrimInfoRange(begin,center,local_left); - new (&rset) PrimInfoRange(center,end,local_right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); - } - - void deterministic_order(const range& set) - { - /* required as parallel partition destroys original primitive order */ - std::sort(&prims[set.begin()],&prims[set.end()]); - } - - void splitFallback(const range& set, PrimInfoRange& lset, PrimInfoRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - const size_t center = (begin + end)/2; - - CentGeomBBox3fa left(empty); - for (size_t i=begin; i - struct UnalignedHeuristicArrayBinningMB - { - typedef BinSplit Split; - typedef typename PrimRefMB::BBox BBox; - typedef BinInfoT ObjectBinner; - - static const size_t PARALLEL_THRESHOLD = 3 * 1024; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; - - UnalignedHeuristicArrayBinningMB(Scene* scene) - : scene(scene) {} - - const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set) - { - Vec3fa axis0(0,0,1); - uint64_t bestGeomPrimID = -1; - - /*! find curve with minimum ID that defines valid direction */ - for (size_t i=set.begin(); i= bestGeomPrimID) continue; - - const Geometry* mesh = scene->get(geomID); - const range tbounds = mesh->timeSegmentRange(set.time_range); - if (tbounds.size() == 0) continue; - - const size_t t = (tbounds.begin()+tbounds.end())/2; - const Vec3fa axis1 = mesh->computeDirection(primID,t); - if (sqr_length(axis1) > 1E-18f) { - axis0 = normalize(axis1); - bestGeomPrimID = geomprimID; - } - } - - return frame(axis0).transposed(); - } - - struct BinBoundsAndCenter - { - __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space) - : scene(scene), time_range(time_range), space(space) {} - - /*! returns center for binning */ - template - __forceinline Vec3fa binCenter(const PrimRef& ref) const - { - Geometry* mesh = scene->get(ref.geomID()); - LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); - return center2(lbounds.interpolate(0.5f)); - } - - /*! returns bounds and centroid used for binning */ - __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX - { - Geometry* mesh = scene->get(ref.geomID()); - LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); - bounds_o = lbounds.interpolate(0.5f); - center_o = center2(bounds_o); - } - - /*! returns bounds and centroid used for binning */ - __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX - { - Geometry* mesh = scene->get(ref.geomID()); - LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); - bounds_o = lbounds; - center_o = center2(lbounds.interpolate(0.5f)); - } - - private: - Scene* scene; - BBox1f time_range; - const LinearSpace3fa space; - }; - - /*! finds the best split */ - const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space) - { - BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); - ObjectBinner binner(empty); - const BinMapping mapping(set.size(),set.centBounds); - bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter); - Split osplit = binner.best(mapping,logBlockSize); - osplit.sah *= set.time_range.size(); - if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split - return osplit; - } - - /*! array partitioning */ - __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset) - { - BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); - const size_t begin = set.begin(); - const size_t end = set.end(); - PrimInfoMB left = empty; - PrimInfoMB right = empty; - const vint4 vSplitPos(split.pos); - const vbool4 vSplitMask(1 << split.dim); - auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); }; - auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; - auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; - size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); - new (&lset) SetMB(left,set.prims,range(begin,center),set.time_range); - new (&rset) SetMB(right,set.prims,range(center,end ),set.time_range); - } - - private: - Scene* scene; - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h deleted file mode 100644 index 21f18c0208..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h +++ /dev/null @@ -1,443 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -// TODO: -// - adjust parallel build thresholds -// - openNodesBasedOnExtend should consider max extended size - -#pragma once - -#include "heuristic_binning.h" -#include "heuristic_spatial.h" - -/* stop opening of all bref.geomIDs are the same */ -#define EQUAL_GEOMID_STOP_CRITERIA 1 - -/* 10% spatial extend threshold */ -#define MAX_EXTEND_THRESHOLD 0.1f - -/* maximum is 8 children */ -#define MAX_OPENED_CHILD_NODES 8 - -/* open until all build refs are below threshold size in one step */ -#define USE_LOOP_OPENING 0 - -namespace embree -{ - namespace isa - { - /*! Performs standard object binning */ - template - struct HeuristicArrayOpenMergeSAH - { - typedef BinSplit Split; - typedef BinInfoT Binner; - - static const size_t PARALLEL_THRESHOLD = 1024; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 512; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; - - static const size_t MOVE_STEP_SIZE = 64; - static const size_t CREATE_SPLITS_STEP_SIZE = 128; - - __forceinline HeuristicArrayOpenMergeSAH () - : prims0(nullptr) {} - - /*! remember prim array */ - __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size) - : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size) - { - assert(max_open_size <= MAX_OPENED_CHILD_NODES); - } - - struct OpenHeuristic - { - __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo ) - { - const Vec3fa diag = pinfo.geomBounds.size(); - dim = maxDim(diag); - assert(diag[dim] > 0.0f); - inv_max_extend = 1.0f / diag[dim]; - } - - __forceinline bool operator () ( PrimRef& prim ) const { - return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD; - } - - private: - size_t dim; - float inv_max_extend; - }; - - /*! compute extended ranges */ - __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) - { - assert(set.ext_range_size() > 0); - const float left_factor = (float)lweight / (lweight + rweight); - const size_t ext_range_size = set.ext_range_size(); - const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); - const size_t right_ext_range_size = ext_range_size - left_ext_range_size; - lset.set_ext_range(lset.end() + left_ext_range_size); - rset.set_ext_range(rset.end() + right_ext_range_size); - } - - /*! move ranges */ - __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - const size_t left_ext_range_size = lset.ext_range_size(); - const size_t right_size = rset.size(); - - /* has the left child an extended range? */ - if (left_ext_range_size > 0) - { - /* left extended range smaller than right range ? */ - if (left_ext_range_size < right_size) - { - /* only move a small part of the beginning of the right range to the end */ - parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range& r) { - for (size_t i=r.begin(); i& r) { - for (size_t i=r.begin(); i getProperties(const PrimInfoExtRange& set) - { - const OpenHeuristic heuristic(set); - const unsigned int geomID = prims0[set.begin()].geomID(); - - auto body = [&] (const range& r) -> std::pair { - bool commonGeomID = true; - size_t opens = 0; - for (size_t i=r.begin(); i(opens,commonGeomID); - }; - auto reduction = [&] (const std::pair& b0, const std::pair& b1) -> std::pair { - return std::pair(b0.first+b1.first,b0.second && b1.second); - }; - return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,std::pair(0,true),body,reduction); - } - - // FIXME: should consider maximum available extended size - __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set) - { - const OpenHeuristic heuristic(set); - const size_t ext_range_start = set.end(); - - if (false && set.size() < PARALLEL_THRESHOLD) - { - size_t extra_elements = 0; - for (size_t i=set.begin(); i ext_elements; - ext_elements.store(0); - PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range& r) -> PrimInfo { - PrimInfo info(empty); - for (size_t i=r.begin(); i 0); - - if (unlikely(next_iteration_extra_elements == 0)) break; - } - } - - __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize) - { - /* single element */ - if (set.size() <= 1) - return Split(); - - /* disable opening if there is no overlap */ - const size_t D = 4; - if (unlikely(set.has_ext_range() && set.size() <= D)) - { - bool disjoint = true; - for (size_t j=set.begin(); j p(0,false); - - /* disable opening when all primitives are from same geometry */ - if (unlikely(set.has_ext_range())) - { - p = getProperties(set); -#if EQUAL_GEOMID_STOP_CRITERIA == 1 - if (p.second) set.set_ext_range(set.end()); /* disable opening */ -#endif - } - - /* open nodes when we have sufficient space available */ - if (unlikely(set.has_ext_range())) - { -#if USE_LOOP_OPENING == 1 - openNodesBasedOnExtendLoop(set,p.first); -#else - if (p.first <= set.ext_range_size()) - openNodesBasedOnExtend(set); -#endif - - /* disable opening when unsufficient space for opening a node available */ - if (set.ext_range_size() < max_open_size-1) - set.set_ext_range(set.end()); /* disable opening */ - } - - /* find best split */ - return object_find(set,logBlockSize); - } - - - /*! finds the best object split */ - __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize) - { - if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize); - else return parallel_object_find (set,logBlockSize); - } - - /*! finds the best object split */ - __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) - { - Binner binner(empty); - const BinMapping mapping(set.centBounds); - binner.bin(prims0,set.begin(),set.end(),mapping); - return binner.best(mapping,logBlockSize); - } - - /*! finds the best split */ - __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) - { - Binner binner(empty); - const BinMapping mapping(set.centBounds); - const BinMapping& _mapping = mapping; // CLANG 3.4 parser bug workaround - auto body = [&] (const range& r) -> Binner { - Binner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; - }; - auto reduction = [&] (const Binner& b0, const Binner& b1) -> Binner { - Binner r = b0; r.merge(b1,_mapping.size()); return r; - }; - binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,body,reduction); - return binner.best(mapping,logBlockSize); - } - - /*! array partitioning */ - __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - PrimInfoExtRange set = set_i; - - /* valid split */ - if (unlikely(!split.valid())) { - deterministic_order(set); - splitFallback(set,lset,rset); - return; - } - - std::pair ext_weights(0,0); - - /* object split */ - if (likely(set.size() < PARALLEL_THRESHOLD)) - ext_weights = sequential_object_split(split,set,lset,rset); - else - ext_weights = parallel_object_split(split,set,lset,rset); - - /* if we have an extended range, set extended child ranges and move right split range */ - if (unlikely(set.has_ext_range())) - { - setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); - moveExtentedRange(set,lset,rset); - } - } - - /*! array partitioning */ - std::pair sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - PrimInfo local_left(empty); - PrimInfo local_right(empty); - const unsigned int splitPos = split.pos; - const unsigned int splitDim = split.dim; - const unsigned int splitDimMask = (unsigned int)1 << splitDim; - - const vint4 vSplitPos(splitPos); - const vbool4 vSplitMask( (int)splitDimMask ); - - size_t center = serial_partitioning(prims0, - begin,end,local_left,local_right, - [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }, - [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }); - - new (&lset) PrimInfoExtRange(begin,center,center,local_left); - new (&rset) PrimInfoExtRange(center,end,end,local_right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); - return std::pair(local_left.size(),local_right.size()); - } - - /*! array partitioning */ - __noinline std::pair parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - PrimInfo left(empty); - PrimInfo right(empty); - const unsigned int splitPos = split.pos; - const unsigned int splitDim = split.dim; - const unsigned int splitDimMask = (unsigned int)1 << splitDim; - - const vint4 vSplitPos(splitPos); - const vbool4 vSplitMask( (int)splitDimMask ); - auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; - - const size_t center = parallel_partitioning( - prims0,begin,end,EmptyTy(),left,right,isLeft, - [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }, - [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); }, - PARALLEL_PARTITION_BLOCK_SIZE); - - new (&lset) PrimInfoExtRange(begin,center,center,left); - new (&rset) PrimInfoExtRange(center,end,end,right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); - - return std::pair(left.size(),right.size()); - } - - void deterministic_order(const extended_range& set) - { - /* required as parallel partition destroys original primitive order */ - std::sort(&prims0[set.begin()],&prims0[set.end()]); - } - - __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - const size_t center = (begin + end)/2; - - PrimInfo left(empty); - for (size_t i=begin; i - struct SpatialBinMapping - { - public: - __forceinline SpatialBinMapping() {} - - /*! calculates the mapping */ - __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo) - { - const vfloat4 lower = (vfloat4) pinfo.geomBounds.lower; - const vfloat4 upper = (vfloat4) pinfo.geomBounds.upper; - const vfloat4 eps = 128.0f*vfloat4(ulp)*max(abs(lower),abs(upper)); - const vfloat4 diag = max(eps,(vfloat4) pinfo.geomBounds.size()); - scale = select(upper-lower <= eps,vfloat4(0.0f),vfloat4(BINS)/diag); - ofs = (vfloat4) pinfo.geomBounds.lower; - inv_scale = 1.0f / scale; - } - - /*! slower but safe binning */ - __forceinline vint4 bin(const Vec3fa& p) const - { - const vint4 i = floori((vfloat4(p)-ofs)*scale); - return clamp(i,vint4(0),vint4(BINS-1)); - } - - __forceinline std::pair bin(const BBox3fa& b) const - { -#if defined(__AVX__) - const vfloat8 ofs8(ofs); - const vfloat8 scale8(scale); - const vint8 lu = floori((vfloat8::loadu(&b)-ofs8)*scale8); - const vint8 c_lu = clamp(lu,vint8(zero),vint8(BINS-1)); - return std::pair(extract4<0>(c_lu),extract4<1>(c_lu)); -#else - const vint4 lower = floori((vfloat4(b.lower)-ofs)*scale); - const vint4 upper = floori((vfloat4(b.upper)-ofs)*scale); - const vint4 c_lower = clamp(lower,vint4(0),vint4(BINS-1)); - const vint4 c_upper = clamp(upper,vint4(0),vint4(BINS-1)); - return std::pair(c_lower,c_upper); -#endif - } - - - /*! calculates left spatial position of bin */ - __forceinline float pos(const size_t bin, const size_t dim) const { - return madd(float(bin),inv_scale[dim],ofs[dim]); - } - - /*! calculates left spatial position of bin */ - template - __forceinline vfloat posN(const vfloat bin, const size_t dim) const { - return madd(bin,vfloat(inv_scale[dim]),vfloat(ofs[dim])); - } - - /*! returns true if the mapping is invalid in some dimension */ - __forceinline bool invalid(const size_t dim) const { - return scale[dim] == 0.0f; - } - - public: - vfloat4 ofs,scale,inv_scale; //!< linear function that maps to bin ID - }; - - /*! stores all information required to perform some split */ - template - struct SpatialBinSplit - { - /*! construct an invalid split by default */ - __forceinline SpatialBinSplit() - : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {} - - /*! constructs specified split */ - __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping& mapping) - : sah(sah), dim(dim), pos(pos), left(-1), right(-1), factor(1.0f), mapping(mapping) {} - - /*! constructs specified split */ - __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping& mapping) - : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {} - - /*! tests if this split is valid */ - __forceinline bool valid() const { return dim != -1; } - - /*! calculates surface area heuristic for performing the split */ - __forceinline float splitSAH() const { return sah; } - - /*! stream output */ - friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) { - return cout << "SpatialBinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << ", left = " << split.left << ", right = " << split.right << ", factor = " << split.factor << "}"; - } - - public: - float sah; //!< SAH cost of the split - int dim; //!< split dimension - int pos; //!< split position - int left; //!< number of elements on the left side - int right; //!< number of elements on the right side - float factor; //!< factor splitting the extended range - SpatialBinMapping mapping; //!< mapping into bins - }; - - /*! stores all binning information */ - template - struct __aligned(64) SpatialBinInfo - { - SpatialBinInfo() { - } - - __forceinline SpatialBinInfo(EmptyTy) { - clear(); - } - - /*! clears the bin info */ - __forceinline void clear() - { - for (size_t i=0; i - __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping& mapping) - { - for (size_t i=0; i> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); - - if (unlikely(splits == 1)) - { - const vint4 bin = mapping.bin(center(prim.bounds())); - for (size_t dim=0; dim<3; dim++) - { - assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS); - numBegin[bin[dim]][dim]++; - numEnd [bin[dim]][dim]++; - bounds [bin[dim]][dim].extend(prim.bounds()); - } - } - else - { - const vint4 bin0 = mapping.bin(prim.bounds().lower); - const vint4 bin1 = mapping.bin(prim.bounds().upper); - - for (size_t dim=0; dim<3; dim++) - { - size_t bin; - PrimRef rest = prim; - size_t l = bin0[dim]; - size_t r = bin1[dim]; - - // same bin optimization - if (likely(l == r)) - { - numBegin[l][dim]++; - numEnd [l][dim]++; - bounds [l][dim].extend(prim.bounds()); - continue; - } - - for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) - { - const float pos = mapping.pos(bin+1,dim); - - PrimRef left,right; - splitPrimitive(rest,(int)dim,pos,left,right); - if (unlikely(left.bounds().empty())) l++; - bounds[bin][dim].extend(left.bounds()); - rest = right; - } - if (unlikely(rest.bounds().empty())) r--; - numBegin[l][dim]++; - numEnd [r][dim]++; - bounds [bin][dim].extend(rest.bounds()); - } - } - } - } - - /*! bins a range of primitives inside an array */ - template - void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping& mapping) { - bin(splitPrimitive,prims+begin,end-begin,mapping); - } - - /*! bins an array of primitives */ - template - __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping& mapping) - { - for (size_t i=begin; i& mapping) - { - for (size_t i=begin; i best(const SpatialBinMapping& mapping, const size_t blocks_shift) const - { - /* sweep from right to left and compute parallel prefix of merged bounds */ - vfloat4 rAreas[BINS]; - vuint4 rCounts[BINS]; - vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty; - for (size_t i=BINS-1; i>0; i--) - { - count += numEnd[i]; - rCounts[i] = count; - bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx); - by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by); - bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz); - rAreas[i][3] = 0.0f; - } - - /* sweep from left to right and compute SAH */ - vuint4 blocks_add = (1 << blocks_shift)-1; - vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0; - count = 0; bx = empty; by = empty; bz = empty; - for (size_t i=1; i> (unsigned int)(blocks_shift); - const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); - const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); - // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); - const vbool4 mask = sah < vbestSAH; - vbestPos = select(mask,ii ,vbestPos); - vbestSAH = select(mask,sah,vbestSAH); - vbestlCount = select(mask,count,vbestlCount); - vbestrCount = select(mask,rCounts[i],vbestrCount); - } - - /* find best dimension */ - float bestSAH = inf; - int bestDim = -1; - int bestPos = 0; - unsigned int bestlCount = 0; - unsigned int bestrCount = 0; - for (int dim=0; dim<3; dim++) - { - /* ignore zero sized dimensions */ - if (unlikely(mapping.invalid(dim))) - continue; - - /* test if this is a better dimension */ - if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { - bestDim = dim; - bestPos = vbestPos[dim]; - bestSAH = vbestSAH[dim]; - bestlCount = vbestlCount[dim]; - bestrCount = vbestrCount[dim]; - } - } - assert(bestSAH >= 0.0f); - - /* return invalid split if no split found */ - if (bestDim == -1) - return SpatialBinSplit(inf,-1,0,mapping); - - /* return best found split */ - return SpatialBinSplit(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping); - } - - private: - BBox3fa bounds[BINS][3]; //!< geometry bounds for each bin in each dimension - vuint4 numBegin[BINS]; //!< number of primitives starting in bin - vuint4 numEnd[BINS]; //!< number of primitives ending in bin - }; - } -} - diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h deleted file mode 100644 index 911dcf950c..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h +++ /dev/null @@ -1,552 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "heuristic_binning.h" -#include "heuristic_spatial.h" - -namespace embree -{ - namespace isa - { -#if 0 -#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f -#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f -#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f -#else -#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f -#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f -#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f -#endif - - struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range - { - __forceinline PrimInfoExtRange() { - } - - __forceinline PrimInfoExtRange(EmptyTy) - : CentGeomBBox3fa(EmptyTy()), extended_range(0,0,0) {} - - __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds) - : CentGeomBBox3fa(centGeomBounds), extended_range(begin,end,ext_end) {} - - __forceinline float leafSAH() const { - return expectedApproxHalfArea(geomBounds)*float(size()); - } - - __forceinline float leafSAH(size_t block_shift) const { - return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<> block_shift); - } - }; - - template - struct Split2 - { - __forceinline Split2 () {} - - __forceinline Split2 (const Split2& other) - { - spatial = other.spatial; - sah = other.sah; - if (spatial) spatialSplit() = other.spatialSplit(); - else objectSplit() = other.objectSplit(); - } - - __forceinline Split2& operator= (const Split2& other) - { - spatial = other.spatial; - sah = other.sah; - if (spatial) spatialSplit() = other.spatialSplit(); - else objectSplit() = other.objectSplit(); - return *this; - } - - __forceinline ObjectSplit& objectSplit() { return *( ObjectSplit*)data; } - __forceinline const ObjectSplit& objectSplit() const { return *(const ObjectSplit*)data; } - - __forceinline SpatialSplit& spatialSplit() { return *( SpatialSplit*)data; } - __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; } - - __forceinline Split2 (const ObjectSplit& objectSplit, float sah) - : spatial(false), sah(sah) - { - new (data) ObjectSplit(objectSplit); - } - - __forceinline Split2 (const SpatialSplit& spatialSplit, float sah) - : spatial(true), sah(sah) - { - new (data) SpatialSplit(spatialSplit); - } - - __forceinline float splitSAH() const { - return sah; - } - - __forceinline bool valid() const { - return sah < float(inf); - } - - public: - __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)]; - bool spatial; - float sah; - }; - - /*! Performs standard object binning */ - template - struct HeuristicArraySpatialSAH - { - typedef BinSplit ObjectSplit; - typedef BinInfoT ObjectBinner; - - typedef SpatialBinSplit SpatialSplit; - typedef SpatialBinInfo SpatialBinner; - - //typedef extended_range Set; - typedef Split2 Split; - -#if defined(__AVX512ER__) // KNL - static const size_t PARALLEL_THRESHOLD = 3*1024; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 768; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; -#else - static const size_t PARALLEL_THRESHOLD = 3*1024; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; -#endif - - static const size_t MOVE_STEP_SIZE = 64; - static const size_t CREATE_SPLITS_STEP_SIZE = 64; - - __forceinline HeuristicArraySpatialSAH () - : prims0(nullptr) {} - - /*! remember prim array */ - __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info) - : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {} - - - /*! compute extended ranges */ - __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) - { - assert(set.ext_range_size() > 0); - const float left_factor = (float)lweight / (lweight + rweight); - const size_t ext_range_size = set.ext_range_size(); - const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); - const size_t right_ext_range_size = ext_range_size - left_ext_range_size; - lset.set_ext_range(lset.end() + left_ext_range_size); - rset.set_ext_range(rset.end() + right_ext_range_size); - } - - /*! move ranges */ - __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - const size_t left_ext_range_size = lset.ext_range_size(); - const size_t right_size = rset.size(); - - /* has the left child an extended range? */ - if (left_ext_range_size > 0) - { - /* left extended range smaller than right range ? */ - if (left_ext_range_size < right_size) - { - /* only move a small part of the beginning of the right range to the end */ - parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range& r) { - for (size_t i=r.begin(); i& r) { - for (size_t i=r.begin(); i= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) && - safeArea(overlap) >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds)) - { - const SpatialSplit spatial_split = spatial_find(set, logBlockSize); - const float spatial_split_sah = spatial_split.splitSAH(); - - /* valid spatial split, better SAH and number of splits do not exceed extended range */ - if (spatial_split_sah < SPATIAL_ASPLIT_SAH_THRESHOLD*object_split_sah && - spatial_split.left + spatial_split.right - set.size() <= set.ext_range_size()) - { - return Split(spatial_split,spatial_split_sah); - } - } - } - - return Split(object_split,object_split_sah); - } - - /*! finds the best object split */ - __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) - { - if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize,info); - else return parallel_object_find (set,logBlockSize,info); - } - - /*! finds the best object split */ - __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) - { - ObjectBinner binner(empty); - const BinMapping mapping(set); - binner.bin(prims0,set.begin(),set.end(),mapping); - ObjectSplit s = binner.best(mapping,logBlockSize); - binner.getSplitInfo(mapping, s, info); - return s; - } - - /*! finds the best split */ - __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) - { - ObjectBinner binner(empty); - const BinMapping mapping(set); - const BinMapping& _mapping = mapping; // CLANG 3.4 parser bug workaround - binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, - [&] (const range& r) -> ObjectBinner { ObjectBinner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; }, - [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner r = b0; r.merge(b1,_mapping.size()); return r; }); - ObjectSplit s = binner.best(mapping,logBlockSize); - binner.getSplitInfo(mapping, s, info); - return s; - } - - /*! finds the best spatial split */ - __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) - { - if (set.size() < PARALLEL_THRESHOLD) return sequential_spatial_find(set, logBlockSize); - else return parallel_spatial_find (set, logBlockSize); - } - - /*! finds the best spatial split */ - __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) - { - SpatialBinner binner(empty); - const SpatialBinMapping mapping(set); - binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping); - /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ - return binner.best(mapping,logBlockSize); //,set.ext_size()); - } - - __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) - { - SpatialBinner binner(empty); - const SpatialBinMapping mapping(set); - const SpatialBinMapping& _mapping = mapping; // CLANG 3.4 parser bug workaround - binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, - [&] (const range& r) -> SpatialBinner { - SpatialBinner binner(empty); - binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping); - return binner; }, - [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); }); - /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ - return binner.best(mapping,logBlockSize); //,set.ext_size()); - } - - - /*! subdivides primitives based on a spatial split */ - __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping &mapping) - { - assert(set.has_ext_range()); - const size_t max_ext_range_size = set.ext_range_size(); - const size_t ext_range_start = set.end(); - - /* atomic counter for number of primref splits */ - std::atomic ext_elements; - ext_elements.store(0); - - const float fpos = split.mapping.pos(split.pos,split.dim); - - const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; - - parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range& r) { - for (size_t i=r.begin();i> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); - - if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */ - - //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim]; - //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim]; - //if (unlikely(bin0 < split.pos && bin1 >= split.pos)) - if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos)) - { - assert(splits > 1); - - PrimRef left,right; - const auto splitter = splitterFactory(prims0[i]); - splitter(prims0[i],split.dim,fpos,left,right); - - // no empty splits - if (unlikely(left.bounds().empty() || right.bounds().empty())) continue; - - left.lower.u = (left.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); - right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); - - const size_t ID = ext_elements.fetch_add(1); - - /* break if the number of subdivided elements are greater than the maximum allowed size */ - if (unlikely(ID >= max_ext_range_size)) - break; - - /* only write within the correct bounds */ - assert(ID < max_ext_range_size); - prims0[i] = left; - prims0[ext_range_start+ID] = right; - } - } - }); - - const size_t numExtElements = min(max_ext_range_size,ext_elements.load()); - assert(set.end()+numExtElements<=set.ext_end()); - set._end += numExtElements; - } - - /*! array partitioning */ - void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - PrimInfoExtRange set = set_i; - - /* valid split */ - if (unlikely(!split.valid())) { - deterministic_order(set); - return splitFallback(set,lset,rset); - } - - std::pair ext_weights(0,0); - - if (unlikely(split.spatial)) - { - create_spatial_splits(set,split.spatialSplit(), split.spatialSplit().mapping); - - /* spatial split */ - if (likely(set.size() < PARALLEL_THRESHOLD)) - ext_weights = sequential_spatial_split(split.spatialSplit(),set,lset,rset); - else - ext_weights = parallel_spatial_split(split.spatialSplit(),set,lset,rset); - } - else - { - /* object split */ - if (likely(set.size() < PARALLEL_THRESHOLD)) - ext_weights = sequential_object_split(split.objectSplit(),set,lset,rset); - else - ext_weights = parallel_object_split(split.objectSplit(),set,lset,rset); - } - - /* if we have an extended range, set extended child ranges and move right split range */ - if (unlikely(set.has_ext_range())) - { - setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); - moveExtentedRange(set,lset,rset); - } - } - - /*! array partitioning */ - std::pair sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - PrimInfo local_left(empty); - PrimInfo local_right(empty); - const unsigned int splitPos = split.pos; - const unsigned int splitDim = split.dim; - const unsigned int splitDimMask = (unsigned int)1 << splitDim; - - const typename ObjectBinner::vint vSplitPos(splitPos); - const typename ObjectBinner::vbool vSplitMask(splitDimMask); - size_t center = serial_partitioning(prims0, - begin,end,local_left,local_right, - [&] (const PrimRef& ref) { - return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); - }, - [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); - const size_t left_weight = local_left.end; - const size_t right_weight = local_right.end; - - new (&lset) PrimInfoExtRange(begin,center,center,local_left); - new (&rset) PrimInfoExtRange(center,end,end,local_right); - - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); - return std::pair(left_weight,right_weight); - } - - - /*! array partitioning */ - __noinline std::pair sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - PrimInfo local_left(empty); - PrimInfo local_right(empty); - const unsigned int splitPos = split.pos; - const unsigned int splitDim = split.dim; - const unsigned int splitDimMask = (unsigned int)1 << splitDim; - - /* init spatial mapping */ - const SpatialBinMapping &mapping = split.mapping; - const vint4 vSplitPos(splitPos); - const vbool4 vSplitMask( (int)splitDimMask ); - - size_t center = serial_partitioning(prims0, - begin,end,local_left,local_right, - [&] (const PrimRef& ref) { - const Vec3fa c = ref.bounds().center(); - return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); - }, - [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); - - const size_t left_weight = local_left.end; - const size_t right_weight = local_right.end; - - new (&lset) PrimInfoExtRange(begin,center,center,local_left); - new (&rset) PrimInfoExtRange(center,end,end,local_right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); - return std::pair(left_weight,right_weight); - } - - - - /*! array partitioning */ - __noinline std::pair parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - PrimInfo left(empty); - PrimInfo right(empty); - const unsigned int splitPos = split.pos; - const unsigned int splitDim = split.dim; - const unsigned int splitDimMask = (unsigned int)1 << splitDim; - - const typename ObjectBinner::vint vSplitPos(splitPos); - const typename ObjectBinner::vbool vSplitMask(splitDimMask); - auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; - - const size_t center = parallel_partitioning( - prims0,begin,end,EmptyTy(),left,right,isLeft, - [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, - [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, - PARALLEL_PARTITION_BLOCK_SIZE); - - const size_t left_weight = left.end; - const size_t right_weight = right.end; - - left.begin = begin; left.end = center; - right.begin = center; right.end = end; - - new (&lset) PrimInfoExtRange(begin,center,center,left); - new (&rset) PrimInfoExtRange(center,end,end,right); - - assert(area(left.geomBounds) >= 0.0f); - assert(area(right.geomBounds) >= 0.0f); - return std::pair(left_weight,right_weight); - } - - /*! array partitioning */ - __noinline std::pair parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - PrimInfo left(empty); - PrimInfo right(empty); - const unsigned int splitPos = split.pos; - const unsigned int splitDim = split.dim; - const unsigned int splitDimMask = (unsigned int)1 << splitDim; - - /* init spatial mapping */ - const SpatialBinMapping& mapping = split.mapping; - const vint4 vSplitPos(splitPos); - const vbool4 vSplitMask( (int)splitDimMask ); - - auto isLeft = [&] (const PrimRef &ref) { - const Vec3fa c = ref.bounds().center(); - return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); }; - - const size_t center = parallel_partitioning( - prims0,begin,end,EmptyTy(),left,right,isLeft, - [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, - [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, - PARALLEL_PARTITION_BLOCK_SIZE); - - const size_t left_weight = left.end; - const size_t right_weight = right.end; - - left.begin = begin; left.end = center; - right.begin = center; right.end = end; - - new (&lset) PrimInfoExtRange(begin,center,center,left); - new (&rset) PrimInfoExtRange(center,end,end,right); - - assert(area(left.geomBounds) >= 0.0f); - assert(area(right.geomBounds) >= 0.0f); - return std::pair(left_weight,right_weight); - } - - void deterministic_order(const PrimInfoExtRange& set) - { - /* required as parallel partition destroys original primitive order */ - std::sort(&prims0[set.begin()],&prims0[set.end()]); - } - - void splitFallback(const PrimInfoExtRange& set, - PrimInfoExtRange& lset, - PrimInfoExtRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - const size_t center = (begin + end)/2; - - PrimInfo left(empty); - for (size_t i=begin; i> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); - } - const size_t lweight = left.end; - - PrimInfo right(empty); - for (size_t i=center; i> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); - } - const size_t rweight = right.end; - - new (&lset) PrimInfoExtRange(begin,center,center,left); - new (&rset) PrimInfoExtRange(center,end,end,right); - - /* if we have an extended range */ - if (set.has_ext_range()) { - setExtentedRanges(set,lset,rset,lweight,rweight); - moveExtentedRange(set,lset,rset); - } - } - - private: - PrimRef* const prims0; - const PrimitiveSplitterFactory& splitterFactory; - const CentGeomBBox3fa& root_info; - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h deleted file mode 100644 index ede0d04c78..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "priminfo.h" -#include "../../common/algorithms/parallel_reduce.h" -#include "../../common/algorithms/parallel_partition.h" - -namespace embree -{ - namespace isa - { - /*! Performs standard object binning */ - struct HeuristicStrandSplit - { - typedef range Set; - - static const size_t PARALLEL_THRESHOLD = 10000; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64; - - /*! stores all information to perform some split */ - struct Split - { - /*! construct an invalid split by default */ - __forceinline Split() - : sah(inf), axis0(zero), axis1(zero) {} - - /*! constructs specified split */ - __forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1) - : sah(sah), axis0(axis0), axis1(axis1) {} - - /*! calculates standard surface area heuristic for the split */ - __forceinline float splitSAH() const { return sah; } - - /*! test if this split is valid */ - __forceinline bool valid() const { return sah != float(inf); } - - public: - float sah; //!< SAH cost of the split - Vec3fa axis0, axis1; //!< axis the two strands are aligned into - }; - - __forceinline HeuristicStrandSplit () // FIXME: required? - : scene(nullptr), prims(nullptr) {} - - /*! remember prim array */ - __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims) - : scene(scene), prims(prims) {} - - __forceinline const Vec3fa direction(const PrimRef& prim) { - return scene->get(prim.geomID())->computeDirection(prim.primID()); - } - - __forceinline const BBox3fa bounds(const PrimRef& prim) { - return scene->get(prim.geomID())->vbounds(prim.primID()); - } - - __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim) { - return scene->get(prim.geomID())->vbounds(space,prim.primID()); - } - - /*! finds the best split */ - const Split find(const range& set, size_t logBlockSize) - { - Vec3fa axis0(0,0,1); - uint64_t bestGeomPrimID = -1; - - /* curve with minimum ID determines first axis */ - for (size_t i=set.begin(); i= bestGeomPrimID) continue; - const Vec3fa axis = direction(prims[i]); - if (sqr_length(axis) > 1E-18f) { - axis0 = normalize(axis); - bestGeomPrimID = geomprimID; - } - } - - /* find 2nd axis that is most misaligned with first axis and has minimum ID */ - float bestCos = 1.0f; - Vec3fa axis1 = axis0; - bestGeomPrimID = -1; - for (size_t i=set.begin(); i cos1) { lnum++; lbounds.extend(bounds(space0,prim)); } - else { rnum++; rbounds.extend(bounds(space1,prim)); } - } - - /*! return an invalid split if we do not partition */ - if (lnum == 0 || rnum == 0) - return Split(inf,axis0,axis1); - - /*! calculate sah for the split */ - const size_t lblocks = (lnum+(1ull<> logBlockSize; - const size_t rblocks = (rnum+(1ull<> logBlockSize; - const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds)); - return Split(sah,axis0,axis1); - } - - /*! array partitioning */ - void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) - { - if (!split.valid()) { - deterministic_order(set); - return splitFallback(set,lset,rset); - } - - const size_t begin = set.begin(); - const size_t end = set.end(); - CentGeomBBox3fa local_left(empty); - CentGeomBBox3fa local_right(empty); - - auto primOnLeftSide = [&] (const PrimRef& prim) -> bool { - const Vec3fa axisi = normalize(direction(prim)); - const float cos0 = abs(dot(axisi,split.axis0)); - const float cos1 = abs(dot(axisi,split.axis1)); - return cos0 > cos1; - }; - - auto mergePrimBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { - pinfo.extend(bounds(ref)); - }; - - size_t center = serial_partitioning(prims,begin,end,local_left,local_right,primOnLeftSide,mergePrimBounds); - - new (&lset) PrimInfoRange(begin,center,local_left); - new (&rset) PrimInfoRange(center,end,local_right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); - } - - void deterministic_order(const Set& set) - { - /* required as parallel partition destroys original primitive order */ - std::sort(&prims[set.begin()],&prims[set.end()]); - } - - void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) - { - const size_t begin = set.begin(); - const size_t end = set.end(); - const size_t center = (begin + end)/2; - - CentGeomBBox3fa left(empty); - for (size_t i=begin; i - struct HeuristicMBlurTemporalSplit - { - typedef BinSplit Split; - typedef mvector* PrimRefVector; - typedef typename PrimRefMB::BBox BBox; - - static const size_t PARALLEL_THRESHOLD = 3 * 1024; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; - - HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device, const RecalculatePrimRef& recalculatePrimRef) - : device(device), recalculatePrimRef(recalculatePrimRef) {} - - struct TemporalBinInfo - { - __forceinline TemporalBinInfo () { - } - - __forceinline TemporalBinInfo (EmptyTy) - { - for (size_t i=0; i= time_range.upper) continue; - const BBox1f dt0(time_range.lower,center_time); - const BBox1f dt1(center_time,time_range.upper); - - /* find linear bounds for both time segments */ - for (size_t i=begin; i& r) -> TemporalBinInfo { - TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner; - }; - *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2); - } - } - - /*! merges in other binning information */ - __forceinline void merge (const TemporalBinInfo& other) - { - for (size_t i=0; i= time_range.upper) continue; - const BBox1f dt0(time_range.lower,center_time); - const BBox1f dt1(center_time,time_range.upper); - - /* calculate sah */ - const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); - const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); - float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size(); - float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size(); - if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time - if (unlikely(rCount == 0)) sah1 = 0.0f; - const float sah = sah0+sah1; - if (sah < bestSAH) { - bestSAH = sah; - bestPos = center_time; - } - } - return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos); - } - - public: - size_t count0[BINS-1]; - size_t count1[BINS-1]; - BBox bounds0[BINS-1]; - BBox bounds1[BINS-1]; - }; - - /*! finds the best split */ - const Split find(const SetMB& set, const size_t logBlockSize) - { - assert(set.size() > 0); - TemporalBinInfo binner(empty); - binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef); - Split tsplit = binner.best((int)logBlockSize,set.time_range,set); - if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split - return tsplit; - } - - __forceinline std::unique_ptr> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset) - { - assert(tsplit.sah != float(inf)); - assert(tsplit.fpos > set.time_range.lower); - assert(tsplit.fpos < set.time_range.upper); - - float center_time = tsplit.fpos; - const BBox1f time_range0(set.time_range.lower,center_time); - const BBox1f time_range1(center_time,set.time_range.upper); - mvector& prims = *set.prims; - - /* calculate primrefs for first time range */ - std::unique_ptr> new_vector(new mvector(device, set.size())); - PrimRefVector lprims = new_vector.get(); - - auto reduction_func0 = [&] (const range& r) { - PrimInfoMB pinfo = empty; - for (size_t i=r.begin(); idata(), size_t(0), set.size(), size_t(1024), - [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); }); - - lset = SetMB(linfo,lprims,time_range0); - - /* calculate primrefs for second time range */ - auto reduction_func1 = [&] (const range& r) { - PrimInfoMB pinfo = empty; - for (size_t i=r.begin(); i(set.begin(), set.begin() + rinfo.size()); - - /* primrefs for second time range are in prims[set.begin() .. set.end()) */ - /* some primitives may need to be filtered out */ - if (rinfo.size() != set.size()) - rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024), - [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); }); - - rset = SetMB(rinfo,&prims,time_range1); - - return new_vector; - } - - private: - MemoryMonitorInterface* device; // device to report memory usage to - const RecalculatePrimRef recalculatePrimRef; - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/priminfo.h b/thirdparty/embree-aarch64/kernels/builders/priminfo.h deleted file mode 100644 index 06c1388742..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/priminfo.h +++ /dev/null @@ -1,362 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/default.h" -#include "../common/primref.h" -#include "../common/primref_mb.h" - -namespace embree -{ - // FIXME: maybe there's a better place for this util fct - __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2) - { - const Vec3fa e0 = v1-v0; - const Vec3fa e1 = v2-v0; - const Vec3fa d = cross(e0,e1); - return fabs(d.x) + fabs(d.y) + fabs(d.z); - } - - //namespace isa - //{ - template - class CentGeom - { - public: - __forceinline CentGeom () {} - - __forceinline CentGeom (EmptyTy) - : geomBounds(empty), centBounds(empty) {} - - __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds) - : geomBounds(geomBounds), centBounds(centBounds) {} - - template - __forceinline void extend_primref(const PrimRef& prim) - { - BBox bounds; Vec3fa center; - prim.binBoundsAndCenter(bounds,center); - geomBounds.extend(bounds); - centBounds.extend(center); - } - - template - __forceinline void extend_center2(const PrimRef& prim) - { - BBox3fa bounds = prim.bounds(); - geomBounds.extend(bounds); - centBounds.extend(bounds.center2()); - } - - __forceinline void extend(const BBox& geomBounds_) { - geomBounds.extend(geomBounds_); - centBounds.extend(center2(geomBounds_)); - } - - __forceinline void merge(const CentGeom& other) - { - geomBounds.extend(other.geomBounds); - centBounds.extend(other.centBounds); - } - - static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) { - CentGeom r = a; r.merge(b); return r; - } - - public: - BBox geomBounds; //!< geometry bounds of primitives - BBox3fa centBounds; //!< centroid bounds of primitives - }; - - typedef CentGeom CentGeomBBox3fa; - - /*! stores bounding information for a set of primitives */ - template - class PrimInfoT : public CentGeom - { - public: - using CentGeom::geomBounds; - using CentGeom::centBounds; - - __forceinline PrimInfoT () {} - - __forceinline PrimInfoT (EmptyTy) - : CentGeom(empty), begin(0), end(0) {} - - __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) - : CentGeom(centGeomBounds), begin(begin), end(end) {} - - template - __forceinline void add_primref(const PrimRef& prim) - { - CentGeom::extend_primref(prim); - end++; - } - - template - __forceinline void add_center2(const PrimRef& prim) { - CentGeom::extend_center2(prim); - end++; - } - - template - __forceinline void add_center2(const PrimRef& prim, const size_t i) { - CentGeom::extend_center2(prim); - end+=i; - } - - /*__forceinline void add(const BBox& geomBounds_) { - CentGeom::extend(geomBounds_); - end++; - } - - __forceinline void add(const BBox& geomBounds_, const size_t i) { - CentGeom::extend(geomBounds_); - end+=i; - }*/ - - __forceinline void merge(const PrimInfoT& other) - { - CentGeom::merge(other); - begin += other.begin; - end += other.end; - } - - static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) { - PrimInfoT r = a; r.merge(b); return r; - } - - /*! returns the number of primitives */ - __forceinline size_t size() const { - return end-begin; - } - - __forceinline float halfArea() { - return expectedApproxHalfArea(geomBounds); - } - - __forceinline float leafSAH() const { - return expectedApproxHalfArea(geomBounds)*float(size()); - //return halfArea(geomBounds)*blocks(num); - } - - __forceinline float leafSAH(size_t block_shift) const { - return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<> block_shift); - //return halfArea(geomBounds)*float((num+3) >> 2); - //return halfArea(geomBounds)*blocks(num); - } - - /*! stream output */ - friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) { - return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}"; - } - - public: - size_t begin,end; //!< number of primitives - }; - - typedef PrimInfoT PrimInfo; - //typedef PrimInfoT PrimInfoMB; - - /*! stores bounding information for a set of primitives */ - template - class PrimInfoMBT : public CentGeom - { - public: - using CentGeom::geomBounds; - using CentGeom::centBounds; - - __forceinline PrimInfoMBT () { - } - - __forceinline PrimInfoMBT (EmptyTy) - : CentGeom(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} - - __forceinline PrimInfoMBT (size_t begin, size_t end) - : CentGeom(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} - - template - __forceinline void add_primref(const PrimRef& prim) - { - CentGeom::extend_primref(prim); - time_range.extend(prim.time_range); - object_range._end++; - num_time_segments += prim.size(); - if (max_num_time_segments < prim.totalTimeSegments()) { - max_num_time_segments = prim.totalTimeSegments(); - max_time_range = prim.time_range; - } - } - - __forceinline void merge(const PrimInfoMBT& other) - { - CentGeom::merge(other); - time_range.extend(other.time_range); - object_range._begin += other.object_range.begin(); - object_range._end += other.object_range.end(); - num_time_segments += other.num_time_segments; - if (max_num_time_segments < other.max_num_time_segments) { - max_num_time_segments = other.max_num_time_segments; - max_time_range = other.max_time_range; - } - } - - static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) { - PrimInfoMBT r = a; r.merge(b); return r; - } - - __forceinline size_t begin() const { - return object_range.begin(); - } - - __forceinline size_t end() const { - return object_range.end(); - } - - /*! returns the number of primitives */ - __forceinline size_t size() const { - return object_range.size(); - } - - __forceinline float halfArea() const { - return time_range.size()*expectedApproxHalfArea(geomBounds); - } - - __forceinline float leafSAH() const { - return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); - } - - __forceinline float leafSAH(size_t block_shift) const { - return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<> block_shift); - } - - __forceinline float align_time(float ct) const - { - //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments); - float t0 = (ct-max_time_range.lower)/max_time_range.size(); - float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments); - return t1*max_time_range.size()+max_time_range.lower; - } - - /*! stream output */ - friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) - { - return cout << "PrimInfo { " << - "object_range = " << pinfo.object_range << - ", time_range = " << pinfo.time_range << - ", time_segments = " << pinfo.num_time_segments << - ", geomBounds = " << pinfo.geomBounds << - ", centBounds = " << pinfo.centBounds << - "}"; - } - - public: - range object_range; //!< primitive range - size_t num_time_segments; //!< total number of time segments of all added primrefs - size_t max_num_time_segments; //!< maximum number of time segments of a primitive - BBox1f max_time_range; //!< time range of primitive with max_num_time_segments - BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB - }; - - typedef PrimInfoMBT PrimInfoMB; - - struct SetMB : public PrimInfoMB - { - static const size_t PARALLEL_THRESHOLD = 3 * 1024; - static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; - static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; - - typedef mvector* PrimRefVector; - - __forceinline SetMB() {} - - __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims) - : PrimInfoMB(pinfo_i), prims(prims) {} - - __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range object_range_in, BBox1f time_range_in) - : PrimInfoMB(pinfo_i), prims(prims) - { - object_range = object_range_in; - time_range = intersect(time_range,time_range_in); - } - - __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in) - : PrimInfoMB(pinfo_i), prims(prims) - { - time_range = intersect(time_range,time_range_in); - } - - void deterministic_order() const - { - /* required as parallel partition destroys original primitive order */ - PrimRefMB* prim = prims->data(); - std::sort(&prim[object_range.begin()],&prim[object_range.end()]); - } - - template - __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const - { - auto reduce = [&](const range& r) -> LBBox3fa - { - LBBox3fa cbounds(empty); - for (size_t j = r.begin(); j < r.end(); j++) - { - PrimRefMB& ref = (*prims)[j]; - const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range); - cbounds.extend(bn); - }; - return cbounds; - }; - - return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), - reduce, - [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); - } - - template - __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const - { - auto reduce = [&](const range& r) -> LBBox3fa - { - LBBox3fa cbounds(empty); - for (size_t j = r.begin(); j < r.end(); j++) - { - PrimRefMB& ref = (*prims)[j]; - const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space); - cbounds.extend(bn); - }; - return cbounds; - }; - - return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), - reduce, - [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); - } - - template - const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const - { - auto computePrimInfo = [&](const range& r) -> PrimInfoMB - { - PrimInfoMB pinfo(empty); - for (size_t j=r.begin(); j& prims, BuildProgressMonitor& progressMonitor) - { - ParallelPrefixSumState pstate; - - /* first try */ - progressMonitor(0); - PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo { - return geometry->createPrimRefArray(prims,r,r.begin(),geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - - /* if we need to filter out geometry, run again */ - if (pinfo.size() != prims.size()) - { - progressMonitor(0); - pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo { - return geometry->createPrimRefArray(prims,r,base.size(),geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - } - return pinfo; - } - - PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector& prims, BuildProgressMonitor& progressMonitor) - { - ParallelForForPrefixSumState pstate; - Scene::Iterator2 iter(scene,types,mblur); - - /* first try */ - progressMonitor(0); - pstate.init(iter,size_t(1024)); - PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { - return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - - /* if we need to filter out geometry, run again */ - if (pinfo.size() != prims.size()) - { - progressMonitor(0); - pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { - return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - } - return pinfo; - } - - PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector& prims, BuildProgressMonitor& progressMonitor, size_t itime) - { - ParallelForForPrefixSumState pstate; - Scene::Iterator2 iter(scene,types,true); - - /* first try */ - progressMonitor(0); - pstate.init(iter,size_t(1024)); - PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { - return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - - /* if we need to filter out geometry, run again */ - if (pinfo.size() != prims.size()) - { - progressMonitor(0); - pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { - return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - } - return pinfo; - } - - PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1) - { - ParallelForForPrefixSumState pstate; - Scene::Iterator2 iter(scene,types,true); - - /* first try */ - progressMonitor(0); - pstate.init(iter,size_t(1024)); - PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID) -> PrimInfoMB { - return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID); - }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); - - /* if we need to filter out geometry, run again */ - if (pinfo.size() != prims.size()) - { - progressMonitor(0); - pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { - return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID); - }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); - } - - /* the BVH starts with that time range, even though primitives might have smaller/larger time range */ - pinfo.time_range = t0t1; - return pinfo; - } - - template - size_t createMortonCodeArray(Mesh* mesh, mvector& morton, BuildProgressMonitor& progressMonitor) - { - size_t numPrimitives = morton.size(); - - /* compute scene bounds */ - std::pair cb_empty(0,empty); - auto cb = parallel_reduce - ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range& r) -> std::pair - { - size_t num = 0; - BBox3fa bounds = empty; - - for (size_t j=r.begin(); jbuildBounds(j,&prim_bounds))) continue; - bounds.extend(center2(prim_bounds)); - num++; - } - return std::make_pair(num,bounds); - }, [] (const std::pair& a, const std::pair& b) { - return std::make_pair(a.first + b.first,merge(a.second,b.second)); - }); - - - size_t numPrimitivesGen = cb.first; - const BBox3fa centBounds = cb.second; - - /* compute morton codes */ - if (likely(numPrimitivesGen == numPrimitives)) - { - /* fast path if all primitives were valid */ - BVHBuilderMorton::MortonCodeMapping mapping(centBounds); - parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range& r) -> void { - BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); - for (size_t j=r.begin(); jbounds(j),unsigned(j)); - }); - } - else - { - /* slow path, fallback in case some primitives were invalid */ - ParallelPrefixSumState pstate; - BVHBuilderMorton::MortonCodeMapping mapping(centBounds); - parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range& r, const size_t base) -> size_t { - size_t num = 0; - BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); - for (size_t j=r.begin(); jbuildBounds(j,&bounds))) continue; - generator(bounds,unsigned(j)); - num++; - } - return num; - }, std::plus()); - - parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range& r, const size_t base) -> size_t { - size_t num = 0; - BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]); - for (size_t j=r.begin(); jbuildBounds(j,&bounds)) continue; - generator(bounds,unsigned(j)); - num++; - } - return num; - }, std::plus()); - } - return numPrimitivesGen; - } - - // ==================================================================================================== - // ==================================================================================================== - // ==================================================================================================== - - // template for grid meshes - -#if 0 - template<> - PrimInfo createPrimRefArray(Scene* scene, mvector& prims, BuildProgressMonitor& progressMonitor) - { - PING; - ParallelForForPrefixSumState pstate; - Scene::Iterator iter(scene); - - /* first try */ - progressMonitor(0); - pstate.init(iter,size_t(1024)); - PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k) -> PrimInfo - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); jbuildBounds(j,&bounds)) continue; - const PrimRef prim(bounds,mesh->geomID,unsigned(j)); - pinfo.add_center2(prim); - prims[k++] = prim; - } - return pinfo; - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - - /* if we need to filter out geometry, run again */ - if (pinfo.size() != prims.size()) - { - progressMonitor(0); - pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, const PrimInfo& base) -> PrimInfo - { - k = base.size(); - PrimInfo pinfo(empty); - for (size_t j=r.begin(); jbuildBounds(j,&bounds)) continue; - const PrimRef prim(bounds,mesh->geomID,unsigned(j)); - pinfo.add_center2(prim); - prims[k++] = prim; - } - return pinfo; - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - } - return pinfo; - } -#endif - - // ==================================================================================================== - // ==================================================================================================== - // ==================================================================================================== - - IF_ENABLED_TRIS (template size_t createMortonCodeArray(TriangleMesh* mesh COMMA mvector& morton COMMA BuildProgressMonitor& progressMonitor)); - IF_ENABLED_QUADS(template size_t createMortonCodeArray(QuadMesh* mesh COMMA mvector& morton COMMA BuildProgressMonitor& progressMonitor)); - IF_ENABLED_USER (template size_t createMortonCodeArray(UserGeometry* mesh COMMA mvector& morton COMMA BuildProgressMonitor& progressMonitor)); - IF_ENABLED_INSTANCE (template size_t createMortonCodeArray(Instance* mesh COMMA mvector& morton COMMA BuildProgressMonitor& progressMonitor)); - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h deleted file mode 100644 index 9919c945c3..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/primrefgen.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/scene.h" -#include "../common/primref.h" -#include "../common/primref_mb.h" -#include "priminfo.h" -#include "bvh_builder_morton.h" - -namespace embree -{ - namespace isa - { - PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector& prims, BuildProgressMonitor& progressMonitor); - - PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector& prims, BuildProgressMonitor& progressMonitor); - - PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0); - - PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)); - - template - size_t createMortonCodeArray(Mesh* mesh, mvector& morton, BuildProgressMonitor& progressMonitor); - } -} - diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h deleted file mode 100644 index 8bdb38b955..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../builders/primrefgen.h" -#include "../builders/heuristic_spatial.h" -#include "../builders/splitter.h" - -#include "../../common/algorithms/parallel_for_for.h" -#include "../../common/algorithms/parallel_for_for_prefix_sum.h" - -#define DBG_PRESPLIT(x) -#define CHECK_PRESPLIT(x) - -#define GRID_SIZE 1024 -#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5 -#define MAX_PRESPLITS_PER_PRIMITIVE (1<(priority); - } - __forceinline bool operator < (const PresplitItem& item) const - { - return (priority < item.priority); - } - - template - __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc) - { - const unsigned int geomID = ref.geomID(); - const unsigned int primID = ref.primID(); - const float area_aabb = area(ref.bounds()); - const float area_prim = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID); - const unsigned int diff = 31 - lzcnt(mc.x^mc.y); - assert(area_prim <= area_aabb); - //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f); - const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) )); - assert(priority >= 0.0f && priority < FLT_LARGE); - return priority; - } - - - }; - - inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) { - return cout << "index " << item.index << " priority " << item.priority; - }; - - template - void splitPrimitive(SplitterFactory &Splitter, - const PrimRef &prim, - const unsigned int geomID, - const unsigned int primID, - const unsigned int split_level, - const Vec3fa &grid_base, - const float grid_scale, - const float grid_extend, - PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE], - unsigned int& numSubPrims) - { - assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); - if (split_level == 0) - { - assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); - subPrims[numSubPrims++] = prim; - } - else - { - const Vec3fa lower = prim.lower; - const Vec3fa upper = prim.upper; - const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); - const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); - Vec3ia ilower(floor(glower)); - Vec3ia iupper(floor(gupper)); - - /* this ignores dimensions that are empty */ - iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper))); - - /* compute a morton code for the lower and upper grid coordinates. */ - const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); - const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); - - /* if all bits are equal then we cannot split */ - if(unlikely(lower_code == upper_code)) - { - assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); - subPrims[numSubPrims++] = prim; - return; - } - - /* compute octree level and dimension to perform the split in */ - const unsigned int diff = 31 - lzcnt(lower_code^upper_code); - const unsigned int level = diff / 3; - const unsigned int dim = diff % 3; - - /* now we compute the grid position of the split */ - const unsigned int isplit = iupper[dim] & ~((1<= fsplit); - - /* split primitive */ - const auto splitter = Splitter(prim); - BBox3fa left,right; - splitter(prim.bounds(),dim,fsplit,left,right); - assert(!left.empty()); - assert(!right.empty()); - - - splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); - splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); - } - } - - - template - PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector& prims, BuildProgressMonitor& progressMonitor) - { - ParallelPrefixSumState pstate; - - /* first try */ - progressMonitor(0); - PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo { - return geometry->createPrimRefArray(prims,r,r.begin(),geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - - /* if we need to filter out geometry, run again */ - if (pinfo.size() != numPrimRefs) - { - progressMonitor(0); - pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo { - return geometry->createPrimRefArray(prims,r,base.size(),geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - } - return pinfo; - } - - __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref) - { - const Vec3fa lower = ref.lower; - const Vec3fa upper = ref.upper; - const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); - const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); - Vec3ia ilower(floor(glower)); - Vec3ia iupper(floor(gupper)); - - /* this ignores dimensions that are empty */ - iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)); - - /* compute a morton code for the lower and upper grid coordinates. */ - const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); - const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); - return Vec2i(lower_code,upper_code); - } - - template - PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector& prims, BuildProgressMonitor& progressMonitor) - { - static const size_t MIN_STEP_SIZE = 128; - - ParallelForForPrefixSumState pstate; - Scene::Iterator2 iter(scene,types,mblur); - - /* first try */ - progressMonitor(0); - pstate.init(iter,size_t(1024)); - PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { - return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - - /* if we need to filter out geometry, run again */ - if (pinfo.size() != numPrimRefs) - { - progressMonitor(0); - pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { - return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - } - - /* use correct number of primitives */ - size_t numPrimitives = pinfo.size(); - const size_t alloc_numPrimitives = prims.size(); - const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; - - /* set up primitive splitter */ - SplitterFactory Splitter(scene); - - - DBG_PRESPLIT( - const size_t org_numPrimitives = pinfo.size(); - PRINT(numPrimitives); - PRINT(alloc_numPrimitives); - PRINT(numSplitPrimitivesBudget); - ); - - /* allocate double buffer presplit items */ - const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives; - PresplitItem *presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); - PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); - - /* compute grid */ - const Vec3fa grid_base = pinfo.geomBounds.lower; - const Vec3fa grid_diag = pinfo.geomBounds.size(); - const float grid_extend = max(grid_diag.x,max(grid_diag.y,grid_diag.z)); - const float grid_scale = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend; - - /* init presplit items and get total sum */ - const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range& r) -> float { - float sum = 0.0f; - for (size_t i=r.begin(); i(prims[i],scene,mc) : 0.0f; - /* FIXME: sum undeterministic */ - sum += presplitItem[i].priority; - } - return sum; - },[](const float& a, const float& b) -> float { return a+b; }); - - /* compute number of splits per primitive */ - const float inv_psum = 1.0f / psum; - parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range& r) -> void { - for (size_t i=r.begin(); i 0.0f) - { - const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum; - if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims - { - presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f); - //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); - assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); - } - else - presplitItem[i].priority = 0.0f; - } - } - }); - - auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; }; - size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024); - - /* anything to split ? */ - if (center < numPrimitives) - { - const size_t numPrimitivesToSplit = numPrimitives - center; - assert(presplitItem[center].priority >= 1.0f); - - /* sort presplit items in ascending order */ - radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024); - - CHECK_PRESPLIT( - parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range& r) -> void { - for (size_t i=r.begin(); i& t) -> size_t { - size_t sum = 0; - for (size_t i=t.begin(); i= 1.0f); - const unsigned int primrefID = presplitItem[i].index; - const float prio = presplitItem[i].priority; - const unsigned int geomID = prims[primrefID].geomID(); - const unsigned int primID = prims[primrefID].primID(); - const unsigned int split_levels = (unsigned int)prio; - unsigned int numSubPrims = 0; - splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); - assert(numSubPrims); - numSubPrims--; // can reuse slot - sum+=numSubPrims; - presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels; - primOffset0[i-center] = numSubPrims; - } - return sum; - },[](const size_t& a, const size_t& b) -> size_t { return a+b; }); - - /* if we are over budget, need to shrink the range */ - if (totalNumSubPrims > numSplitPrimitivesBudget) - { - size_t new_center = numPrimitives-1; - size_t sum = 0; - for (;new_center>=center;new_center--) - { - const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG; - if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break; - sum += numSubPrims; - } - new_center++; - center = new_center; - } - - /* parallel prefix sum to compute offsets for storing sub-primitives */ - const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus()); - - /* iterate over range, and split primitives into sub primitives and append them to prims array */ - parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range& rn) -> void { - for (size_t j=rn.begin(); j& r) -> PrimInfo { - PrimInfo p(empty); - for (size_t j=r.begin(); j PrimInfo { return PrimInfo::merge(a,b); }); - - assert(pinfo.size() == numPrimitives); - - /* free double buffer presplit items */ - alignedFree(tmp_presplitItem); - alignedFree(presplitItem); - return pinfo; - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/builders/splitter.h b/thirdparty/embree-aarch64/kernels/builders/splitter.h deleted file mode 100644 index dbd6cf07c7..0000000000 --- a/thirdparty/embree-aarch64/kernels/builders/splitter.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/scene.h" -#include "../common/primref.h" - -namespace embree -{ - namespace isa - { - template - __forceinline void splitPolygon(const BBox3fa& bounds, - const size_t dim, - const float pos, - const Vec3fa (&v)[N+1], - const Vec3fa (&inv_length)[N], - BBox3fa& left_o, - BBox3fa& right_o) - { - BBox3fa left = empty, right = empty; - /* clip triangle to left and right box by processing all edges */ - for (size_t i=0; i= pos) right.extend(v0); // this point is on right side - - if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location - { - assert((v1d-v0d) != 0.0f); - const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length[i][dim]),v1-v0,v0); - left.extend(c); - right.extend(c); - } - } - - /* clip against current bounds */ - left_o = intersect(left,bounds); - right_o = intersect(right,bounds); - } - - template - __forceinline void splitPolygon(const PrimRef& prim, - const size_t dim, - const float pos, - const Vec3fa (&v)[N+1], - PrimRef& left_o, - PrimRef& right_o) - { - BBox3fa left = empty, right = empty; - for (size_t i=0; i= pos) right.extend(v0); // this point is on right side - - if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location - { - assert((v1d-v0d) != 0.0f); - const float inv_length = 1.0f/(v1d-v0d); - const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0); - left.extend(c); - right.extend(c); - } - } - - /* clip against current bounds */ - new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID()); - new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID()); - } - - struct TriangleSplitter - { - __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim) - { - const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; - const TriangleMesh* mesh = (const TriangleMesh*) scene->get(prim.geomID() & mask ); - TriangleMesh::Triangle tri = mesh->triangle(prim.primID()); - v[0] = mesh->vertex(tri.v[0]); - v[1] = mesh->vertex(tri.v[1]); - v[2] = mesh->vertex(tri.v[2]); - v[3] = mesh->vertex(tri.v[0]); - inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); - inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); - inv_length[2] = Vec3fa(1.0f) / (v[0]-v[2]); - } - - __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { - splitPolygon<3>(prim,dim,pos,v,left_o,right_o); - } - - __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { - splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o); - } - - private: - Vec3fa v[4]; - Vec3fa inv_length[3]; - }; - - struct TriangleSplitterFactory - { - __forceinline TriangleSplitterFactory(const Scene* scene) - : scene(scene) {} - - __forceinline TriangleSplitter operator() (const PrimRef& prim) const { - return TriangleSplitter(scene,prim); - } - - private: - const Scene* scene; - }; - - struct QuadSplitter - { - __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim) - { - const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; - const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask ); - QuadMesh::Quad quad = mesh->quad(prim.primID()); - v[0] = mesh->vertex(quad.v[0]); - v[1] = mesh->vertex(quad.v[1]); - v[2] = mesh->vertex(quad.v[2]); - v[3] = mesh->vertex(quad.v[3]); - v[4] = mesh->vertex(quad.v[0]); - inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); - inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); - inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]); - inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]); - } - - __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { - splitPolygon<4>(prim,dim,pos,v,left_o,right_o); - } - - __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { - splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o); - } - - private: - Vec3fa v[5]; - Vec3fa inv_length[4]; - }; - - struct QuadSplitterFactory - { - __forceinline QuadSplitterFactory(const Scene* scene) - : scene(scene) {} - - __forceinline QuadSplitter operator() (const PrimRef& prim) const { - return QuadSplitter(scene,prim); - } - - private: - const Scene* scene; - }; - } -} - diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp deleted file mode 100644 index bd102bd6ef..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh.h" -#include "bvh_statistics.h" - -namespace embree -{ - template - BVHN::BVHN (const PrimitiveType& primTy, Scene* scene) - : AccelData((N==4) ? AccelData::TY_BVH4 : (N==8) ? AccelData::TY_BVH8 : AccelData::TY_UNKNOWN), - primTy(&primTy), device(scene->device), scene(scene), - root(emptyNode), alloc(scene->device,scene->isStaticAccel()), numPrimitives(0), numVertices(0) - { - } - - template - BVHN::~BVHN () - { - for (size_t i=0; i - void BVHN::clear() - { - set(BVHN::emptyNode,empty,0); - alloc.clear(); - } - - template - void BVHN::set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives) - { - this->root = root; - this->bounds = bounds; - this->numPrimitives = numPrimitives; - } - - template - void BVHN::clearBarrier(NodeRef& node) - { - if (node.isBarrier()) - node.clearBarrier(); - else if (!node.isLeaf()) { - BaseNode* n = node.baseNode(); // FIXME: flags should be stored in BVH - for (size_t c=0; cchild(c)); - } - } - - template - void BVHN::layoutLargeNodes(size_t num) - { -#if defined(__X86_64__) || defined(__aarch64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues - struct NodeArea - { - __forceinline NodeArea() {} - - __forceinline NodeArea(NodeRef& node, const BBox3fa& bounds) - : node(&node), A(node.isLeaf() ? float(neg_inf) : area(bounds)) {} - - __forceinline bool operator< (const NodeArea& other) const { - return this->A < other.A; - } - - NodeRef* node; - float A; - }; - std::vector lst; - lst.reserve(num); - lst.push_back(NodeArea(root,empty)); - - while (lst.size() < num) - { - std::pop_heap(lst.begin(), lst.end()); - NodeArea n = lst.back(); lst.pop_back(); - if (!n.node->isAABBNode()) break; - AABBNode* node = n.node->getAABBNode(); - for (size_t i=0; ichild(i) == BVHN::emptyNode) continue; - lst.push_back(NodeArea(node->child(i),node->bounds(i))); - std::push_heap(lst.begin(), lst.end()); - } - } - - for (size_t i=0; isetBarrier(); - - root = layoutLargeNodesRecursion(root,alloc.getCachedAllocator()); -#endif - } - - template - typename BVHN::NodeRef BVHN::layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator) - { - if (node.isBarrier()) { - node.clearBarrier(); - return node; - } - else if (node.isAABBNode()) - { - AABBNode* oldnode = node.getAABBNode(); - AABBNode* newnode = (BVHN::AABBNode*) allocator.malloc0(sizeof(BVHN::AABBNode),byteNodeAlignment); - *newnode = *oldnode; - for (size_t c=0; cchild(c) = layoutLargeNodesRecursion(oldnode->child(c),allocator); - return encodeNode(newnode); - } - else return node; - } - - template - double BVHN::preBuild(const std::string& builderName) - { - if (builderName == "") - return inf; - - if (device->verbosity(2)) - { - Lock lock(g_printMutex); - std::cout << "building BVH" << N << (builderName.find("MBlur") != std::string::npos ? "MB" : "") << "<" << primTy->name() << "> using " << builderName << " ..." << std::endl << std::flush; - } - - double t0 = 0.0; - if (device->benchmark || device->verbosity(2)) t0 = getSeconds(); - return t0; - } - - template - void BVHN::postBuild(double t0) - { - if (t0 == double(inf)) - return; - - double dt = 0.0; - if (device->benchmark || device->verbosity(2)) - dt = getSeconds()-t0; - - std::unique_ptr> stat; - - /* print statistics */ - if (device->verbosity(2)) - { - if (!stat) stat.reset(new BVHNStatistics(this)); - const size_t usedBytes = alloc.getUsedBytes(); - Lock lock(g_printMutex); - std::cout << "finished BVH" << N << "<" << primTy->name() << "> : " << 1000.0f*dt << "ms, " << 1E-6*double(numPrimitives)/dt << " Mprim/s, " << 1E-9*double(usedBytes)/dt << " GB/s" << std::endl; - - if (device->verbosity(2)) - std::cout << stat->str(); - - if (device->verbosity(2)) - { - FastAllocator::AllStatistics stat(&alloc); - for (size_t i=0; ialloc); - - stat.print(numPrimitives); - } - - if (device->verbosity(3)) - { - alloc.print_blocks(); - for (size_t i=0; ialloc.print_blocks(); - } - - std::cout << std::flush; - } - - /* benchmark mode */ - if (device->benchmark) - { - if (!stat) stat.reset(new BVHNStatistics(this)); - Lock lock(g_printMutex); - std::cout << "BENCHMARK_BUILD " << dt << " " << double(numPrimitives)/dt << " " << stat->sah() << " " << stat->bytesUsed() << " BVH" << N << "<" << primTy->name() << ">" << std::endl << std::flush; - } - } - -#if defined(__AVX__) - template class BVHN<8>; -#endif - -#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__) - template class BVHN<4>; -#endif -} - diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.h b/thirdparty/embree-aarch64/kernels/bvh/bvh.h deleted file mode 100644 index 8fdf912e52..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh.h +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -/* include all node types */ -#include "bvh_node_aabb.h" -#include "bvh_node_aabb_mb.h" -#include "bvh_node_aabb_mb4d.h" -#include "bvh_node_obb.h" -#include "bvh_node_obb_mb.h" -#include "bvh_node_qaabb.h" - -namespace embree -{ - /*! flags used to enable specific node types in intersectors */ - enum BVHNodeFlags - { - BVH_FLAG_ALIGNED_NODE = 0x00001, - BVH_FLAG_ALIGNED_NODE_MB = 0x00010, - BVH_FLAG_UNALIGNED_NODE = 0x00100, - BVH_FLAG_UNALIGNED_NODE_MB = 0x01000, - BVH_FLAG_QUANTIZED_NODE = 0x100000, - BVH_FLAG_ALIGNED_NODE_MB4D = 0x1000000, - - /* short versions */ - BVH_AN1 = BVH_FLAG_ALIGNED_NODE, - BVH_AN2 = BVH_FLAG_ALIGNED_NODE_MB, - BVH_AN2_AN4D = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, - BVH_UN1 = BVH_FLAG_UNALIGNED_NODE, - BVH_UN2 = BVH_FLAG_UNALIGNED_NODE_MB, - BVH_MB = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, - BVH_AN1_UN1 = BVH_FLAG_ALIGNED_NODE | BVH_FLAG_UNALIGNED_NODE, - BVH_AN2_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB, - BVH_AN2_AN4D_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D | BVH_FLAG_UNALIGNED_NODE_MB, - BVH_QN1 = BVH_FLAG_QUANTIZED_NODE - }; - - /*! Multi BVH with N children. Each node stores the bounding box of - * it's N children as well as N child references. */ - template - class BVHN : public AccelData - { - ALIGNED_CLASS_(16); - public: - - /*! forward declaration of node ref type */ - typedef NodeRefPtr NodeRef; - typedef BaseNode_t BaseNode; - typedef AABBNode_t AABBNode; - typedef AABBNodeMB_t AABBNodeMB; - typedef AABBNodeMB4D_t AABBNodeMB4D; - typedef OBBNode_t OBBNode; - typedef OBBNodeMB_t OBBNodeMB; - typedef QuantizedBaseNode_t QuantizedBaseNode; - typedef QuantizedBaseNodeMB_t QuantizedBaseNodeMB; - typedef QuantizedNode_t QuantizedNode; - - /*! Number of bytes the nodes and primitives are minimally aligned to.*/ - static const size_t byteAlignment = 16; - static const size_t byteNodeAlignment = 4*N; - - /*! Empty node */ - static const size_t emptyNode = NodeRef::emptyNode; - - /*! Invalid node, used as marker in traversal */ - static const size_t invalidNode = NodeRef::invalidNode; - static const size_t popRay = NodeRef::popRay; - - /*! Maximum depth of the BVH. */ - static const size_t maxBuildDepth = 32; - static const size_t maxBuildDepthLeaf = maxBuildDepth+8; - static const size_t maxDepth = 2*maxBuildDepthLeaf; // 2x because of two level builder - - /*! Maximum number of primitive blocks in a leaf. */ - static const size_t maxLeafBlocks = NodeRef::maxLeafBlocks; - - public: - - /*! Builder interface to create allocator */ - struct CreateAlloc : public FastAllocator::Create { - __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {} - }; - - typedef BVHNodeRecord NodeRecord; - typedef BVHNodeRecordMB NodeRecordMB; - typedef BVHNodeRecordMB4D NodeRecordMB4D; - - public: - - /*! BVHN default constructor. */ - BVHN (const PrimitiveType& primTy, Scene* scene); - - /*! BVHN destruction */ - ~BVHN (); - - /*! clears the acceleration structure */ - void clear(); - - /*! sets BVH members after build */ - void set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives); - - /*! Clears the barrier bits of a subtree. */ - void clearBarrier(NodeRef& node); - - /*! lays out num large nodes of the BVH */ - void layoutLargeNodes(size_t num); - NodeRef layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator); - - /*! called by all builders before build starts */ - double preBuild(const std::string& builderName); - - /*! called by all builders after build ended */ - void postBuild(double t0); - - /*! allocator class */ - struct Allocator { - BVHN* bvh; - Allocator (BVHN* bvh) : bvh(bvh) {} - __forceinline void* operator() (size_t bytes) const { - return bvh->alloc._threadLocal()->malloc(&bvh->alloc,bytes); - } - }; - - /*! post build cleanup */ - void cleanup() { - alloc.cleanup(); - } - - public: - - /*! Encodes a node */ - static __forceinline NodeRef encodeNode(AABBNode* node) { return NodeRef::encodeNode(node); } - static __forceinline NodeRef encodeNode(AABBNodeMB* node) { return NodeRef::encodeNode(node); } - static __forceinline NodeRef encodeNode(AABBNodeMB4D* node) { return NodeRef::encodeNode(node); } - static __forceinline NodeRef encodeNode(OBBNode* node) { return NodeRef::encodeNode(node); } - static __forceinline NodeRef encodeNode(OBBNodeMB* node) { return NodeRef::encodeNode(node); } - static __forceinline NodeRef encodeLeaf(void* tri, size_t num) { return NodeRef::encodeLeaf(tri,num); } - static __forceinline NodeRef encodeTypedLeaf(void* ptr, size_t ty) { return NodeRef::encodeTypedLeaf(ptr,ty); } - - public: - - /*! Prefetches the node this reference points to */ - __forceinline static void prefetch(const NodeRef ref, int types=0) - { -#if defined(__AVX512PF__) // MIC - if (types != BVH_FLAG_QUANTIZED_NODE) { - prefetchL2(((char*)ref.ptr)+0*64); - prefetchL2(((char*)ref.ptr)+1*64); - if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { - prefetchL2(((char*)ref.ptr)+2*64); - prefetchL2(((char*)ref.ptr)+3*64); - } - if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { - /* KNL still needs L2 prefetches for large nodes */ - prefetchL2(((char*)ref.ptr)+4*64); - prefetchL2(((char*)ref.ptr)+5*64); - prefetchL2(((char*)ref.ptr)+6*64); - prefetchL2(((char*)ref.ptr)+7*64); - } - } - else - { - /* todo: reduce if 32bit offsets are enabled */ - prefetchL2(((char*)ref.ptr)+0*64); - prefetchL2(((char*)ref.ptr)+1*64); - prefetchL2(((char*)ref.ptr)+2*64); - } -#else - if (types != BVH_FLAG_QUANTIZED_NODE) { - prefetchL1(((char*)ref.ptr)+0*64); - prefetchL1(((char*)ref.ptr)+1*64); - if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { - prefetchL1(((char*)ref.ptr)+2*64); - prefetchL1(((char*)ref.ptr)+3*64); - } - if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { - /* deactivate for large nodes on Xeon, as it introduces regressions */ - //prefetchL1(((char*)ref.ptr)+4*64); - //prefetchL1(((char*)ref.ptr)+5*64); - //prefetchL1(((char*)ref.ptr)+6*64); - //prefetchL1(((char*)ref.ptr)+7*64); - } - } - else - { - /* todo: reduce if 32bit offsets are enabled */ - prefetchL1(((char*)ref.ptr)+0*64); - prefetchL1(((char*)ref.ptr)+1*64); - prefetchL1(((char*)ref.ptr)+2*64); - } -#endif - } - - __forceinline static void prefetchW(const NodeRef ref, int types=0) - { - embree::prefetchEX(((char*)ref.ptr)+0*64); - embree::prefetchEX(((char*)ref.ptr)+1*64); - if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { - embree::prefetchEX(((char*)ref.ptr)+2*64); - embree::prefetchEX(((char*)ref.ptr)+3*64); - } - if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { - embree::prefetchEX(((char*)ref.ptr)+4*64); - embree::prefetchEX(((char*)ref.ptr)+5*64); - embree::prefetchEX(((char*)ref.ptr)+6*64); - embree::prefetchEX(((char*)ref.ptr)+7*64); - } - } - - /*! bvh type information */ - public: - const PrimitiveType* primTy; //!< primitive type stored in the BVH - - /*! bvh data */ - public: - Device* device; //!< device pointer - Scene* scene; //!< scene pointer - NodeRef root; //!< root node - FastAllocator alloc; //!< allocator used to allocate nodes - - /*! statistics data */ - public: - size_t numPrimitives; //!< number of primitives the BVH is build over - size_t numVertices; //!< number of vertices the BVH references - - /*! data arrays for special builders */ - public: - std::vector objects; - vector_t> subdiv_patches; - }; - - typedef BVHN<4> BVH4; - typedef BVHN<8> BVH8; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp deleted file mode 100644 index 23f4f63d45..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp +++ /dev/null @@ -1,1325 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh4_factory.h" -#include "../bvh/bvh.h" - -#include "../geometry/curveNv.h" -#include "../geometry/curveNi.h" -#include "../geometry/curveNi_mb.h" -#include "../geometry/linei.h" -#include "../geometry/triangle.h" -#include "../geometry/trianglev.h" -#include "../geometry/trianglev_mb.h" -#include "../geometry/trianglei.h" -#include "../geometry/quadv.h" -#include "../geometry/quadi.h" -#include "../geometry/subdivpatch1.h" -#include "../geometry/object.h" -#include "../geometry/instance.h" -#include "../geometry/subgrid.h" -#include "../common/accelinstance.h" - -namespace embree -{ - DECLARE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); - - DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4i,void); - DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8i,void); - DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4v,void); - DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); - DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4iMB,void); - DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); - DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback); - - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker); - - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker); - - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); - - DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); - - DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); - DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); - - DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); - - BVH4Factory::BVH4Factory(int bfeatures, int ifeatures) - { - SELECT_SYMBOL_DEFAULT_AVX_AVX2(ifeatures,BVH4ColliderUserGeom); - - selectBuilders(bfeatures); - selectIntersectors(ifeatures); - } - - void BVH4Factory::selectBuilders(int features) - { - IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4MeshSAH)); - IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4iMeshSAH)); - IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4vMeshSAH)); - IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelQuadMeshSAH)); - IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelVirtualSAH)); - IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelInstanceSAH)); - - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4OBBCurve4iMBBuilder_OBB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4Curve8iBuilder_OBB_New)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4OBBCurve8iMBBuilder_OBB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4SceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4vSceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4iSceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iMBSceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vMBSceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedTriangle4iSceneBuilderSAH)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4vSceneBuilderSAH)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4iSceneBuilderSAH)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iMBSceneBuilderSAH)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedQuad4iSceneBuilderSAH)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderFastSpatialSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderFastSpatialSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderFastSpatialSAH)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderFastSpatialSAH)); - - IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4VirtualSceneBuilderSAH)); - IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualMBSceneBuilderSAH)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4InstanceSceneBuilderSAH)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH)); - - IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1BuilderSAH)); - IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1MBBuilderSAH)); - } - - void BVH4Factory::selectIntersectors(int features) - { - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4i)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8i)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4v)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4iMB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB)); - - /* select intersectors1 */ - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1MB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1MB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector1Moeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Moeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4vIntersector1Pluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Pluecker)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Moeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Moeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Pluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Pluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Moeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Moeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Pluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Pluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Pluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Moeller)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Triangle4iIntersector1Pluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Quad4iIntersector1Pluecker)); - - IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector1)); - IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector1)); - - IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector1)); - IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector1)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector1)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector1)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Moeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector1Moeller)) - IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Pluecker)); - -#if defined (EMBREE_RAY_PACKETS) - - /* select intersectors4 */ - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4HybridMB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4HybridMB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoellerNoFilter)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector4HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridPluecker)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoellerNoFilter)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridPluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridPluecker)); - - IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector4)); - IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector4)); - - IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector4Chunk)); - IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector4Chunk)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector4Chunk)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector4Chunk)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridMoeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector4HybridMoeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridPluecker)); - - /* select intersectors8 */ - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8HybridMB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8HybridMB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoellerNoFilter)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector8HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridPluecker)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoellerNoFilter)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridPluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridPluecker)); - - IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector8)); - IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector8)); - - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector8Chunk)); - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector8Chunk)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector8Chunk)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector8Chunk)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridMoeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector8HybridMoeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridPluecker)); - - /* select intersectors16 */ - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16HybridMB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16HybridMB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoellerNoFilter)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersector16HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridPluecker)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoellerNoFilter)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridPluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridPluecker)); - - IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1Intersector16)); - IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1MBIntersector16)); - - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersector16Chunk)); - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualMBIntersector16Chunk)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersector16Chunk)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceMBIntersector16Chunk)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridMoeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridMBIntersector16HybridMoeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridPluecker)); - - /* select stream intersectors */ - SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4IntersectorStreamPacketFallback); - - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoellerNoFilter)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersectorStreamPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoellerNoFilter)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamPluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamPluecker)); - - IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersectorStream)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersectorStream)); - -#endif - } - - Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.leafIntersector = leafIntersector; - intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4Hybrid(); - intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8Hybrid(); - intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16Hybrid(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.leafIntersector = leafIntersector; - intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4Hybrid(); - intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8Hybrid(); - intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16Hybrid(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - default: assert(false); - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.leafIntersector = leafIntersector; - intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1MB(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4HybridMB(); - intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8HybridMB(); - intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16HybridMB(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.leafIntersector = leafIntersector; - intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1MB(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4HybridMB(); - intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8HybridMB(); - intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16HybridMB(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - default: assert(false); - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH4Factory::BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant) - { - assert(ivariant == IntersectVariant::FAST); - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Triangle4Intersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4_filter = BVH4Triangle4Intersector4HybridMoeller(); - intersectors.intersector4_nofilter = BVH4Triangle4Intersector4HybridMoellerNoFilter(); - intersectors.intersector8_filter = BVH4Triangle4Intersector8HybridMoeller(); - intersectors.intersector8_nofilter = BVH4Triangle4Intersector8HybridMoellerNoFilter(); - intersectors.intersector16_filter = BVH4Triangle4Intersector16HybridMoeller(); - intersectors.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter(); - intersectors.intersectorN_filter = BVH4Triangle4IntersectorStreamMoeller(); - intersectors.intersectorN_nofilter = BVH4Triangle4IntersectorStreamMoellerNoFilter(); -#endif - return intersectors; - } - - Accel::Intersectors BVH4Factory::BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant) - { - assert(ivariant == IntersectVariant::ROBUST); - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Triangle4vIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Triangle4vIntersector4HybridPluecker(); - intersectors.intersector8 = BVH4Triangle4vIntersector8HybridPluecker(); - intersectors.intersector16 = BVH4Triangle4vIntersector16HybridPluecker(); - intersectors.intersectorN = BVH4Triangle4vIntersectorStreamPluecker(); -#endif - return intersectors; - } - - Accel::Intersectors BVH4Factory::BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Triangle4iIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Triangle4iIntersector4HybridMoeller(); - intersectors.intersector8 = BVH4Triangle4iIntersector8HybridMoeller(); - intersectors.intersector16 = BVH4Triangle4iIntersector16HybridMoeller(); - intersectors.intersectorN = BVH4Triangle4iIntersectorStreamMoeller(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Triangle4iIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Triangle4iIntersector4HybridPluecker(); - intersectors.intersector8 = BVH4Triangle4iIntersector8HybridPluecker(); - intersectors.intersector16 = BVH4Triangle4iIntersector16HybridPluecker(); - intersectors.intersectorN = BVH4Triangle4iIntersectorStreamPluecker(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH4Factory::BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Triangle4vMBIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridMoeller(); - intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridMoeller(); - intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Triangle4vMBIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridPluecker(); - intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridPluecker(); - intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH4Factory::BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Triangle4iMBIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridMoeller(); - intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridMoeller(); - intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Triangle4iMBIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridPluecker(); - intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridPluecker(); - intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH4Factory::BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Quad4vIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4_filter = BVH4Quad4vIntersector4HybridMoeller(); - intersectors.intersector4_nofilter = BVH4Quad4vIntersector4HybridMoellerNoFilter(); - intersectors.intersector8_filter = BVH4Quad4vIntersector8HybridMoeller(); - intersectors.intersector8_nofilter = BVH4Quad4vIntersector8HybridMoellerNoFilter(); - intersectors.intersector16_filter = BVH4Quad4vIntersector16HybridMoeller(); - intersectors.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter(); - intersectors.intersectorN_filter = BVH4Quad4vIntersectorStreamMoeller(); - intersectors.intersectorN_nofilter = BVH4Quad4vIntersectorStreamMoellerNoFilter(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Quad4vIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Quad4vIntersector4HybridPluecker(); - intersectors.intersector8 = BVH4Quad4vIntersector8HybridPluecker(); - intersectors.intersector16 = BVH4Quad4vIntersector16HybridPluecker(); - intersectors.intersectorN = BVH4Quad4vIntersectorStreamPluecker(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH4Factory::BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Quad4iIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Quad4iIntersector4HybridMoeller(); - intersectors.intersector8 = BVH4Quad4iIntersector8HybridMoeller(); - intersectors.intersector16= BVH4Quad4iIntersector16HybridMoeller(); - intersectors.intersectorN = BVH4Quad4iIntersectorStreamMoeller(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Quad4iIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Quad4iIntersector4HybridPluecker(); - intersectors.intersector8 = BVH4Quad4iIntersector8HybridPluecker(); - intersectors.intersector16= BVH4Quad4iIntersector16HybridPluecker(); - intersectors.intersectorN = BVH4Quad4iIntersectorStreamPluecker(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH4Factory::BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Quad4iMBIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridMoeller(); - intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridMoeller(); - intersectors.intersector16= BVH4Quad4iMBIntersector16HybridMoeller(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4Quad4iMBIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridPluecker(); - intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridPluecker(); - intersectors.intersector16= BVH4Quad4iMBIntersector16HybridPluecker(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH4Factory::QBVH4Triangle4iIntersectors(BVH4* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = QBVH4Triangle4iIntersector1Pluecker(); - return intersectors; - } - - Accel::Intersectors BVH4Factory::QBVH4Quad4iIntersectors(BVH4* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = QBVH4Quad4iIntersector1Pluecker(); - return intersectors; - } - - Accel::Intersectors BVH4Factory::BVH4UserGeometryIntersectors(BVH4* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4VirtualIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4VirtualIntersector4Chunk(); - intersectors.intersector8 = BVH4VirtualIntersector8Chunk(); - intersectors.intersector16 = BVH4VirtualIntersector16Chunk(); - intersectors.intersectorN = BVH4VirtualIntersectorStream(); -#endif - intersectors.collider = BVH4ColliderUserGeom(); - return intersectors; - } - - Accel::Intersectors BVH4Factory::BVH4UserGeometryMBIntersectors(BVH4* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4VirtualMBIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4VirtualMBIntersector4Chunk(); - intersectors.intersector8 = BVH4VirtualMBIntersector8Chunk(); - intersectors.intersector16 = BVH4VirtualMBIntersector16Chunk(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - - Accel::Intersectors BVH4Factory::BVH4InstanceIntersectors(BVH4* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4InstanceIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4InstanceIntersector4Chunk(); - intersectors.intersector8 = BVH4InstanceIntersector8Chunk(); - intersectors.intersector16 = BVH4InstanceIntersector16Chunk(); - intersectors.intersectorN = BVH4InstanceIntersectorStream(); -#endif - return intersectors; - } - - Accel::Intersectors BVH4Factory::BVH4InstanceMBIntersectors(BVH4* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4InstanceMBIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4InstanceMBIntersector4Chunk(); - intersectors.intersector8 = BVH4InstanceMBIntersector8Chunk(); - intersectors.intersector16 = BVH4InstanceMBIntersector16Chunk(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - - Accel::Intersectors BVH4Factory::BVH4SubdivPatch1Intersectors(BVH4* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4SubdivPatch1Intersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4SubdivPatch1Intersector4(); - intersectors.intersector8 = BVH4SubdivPatch1Intersector8(); - intersectors.intersector16 = BVH4SubdivPatch1Intersector16(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - - Accel::Intersectors BVH4Factory::BVH4SubdivPatch1MBIntersectors(BVH4* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4SubdivPatch1MBIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4SubdivPatch1MBIntersector4(); - intersectors.intersector8 = BVH4SubdivPatch1MBIntersector8(); - intersectors.intersector16 = BVH4SubdivPatch1MBIntersector16(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - - Accel* BVH4Factory::BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Curve4i::type,scene); - Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4i(),ivariant); - - Builder* builder = nullptr; - if (scene->device->hair_builder == "default" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); - else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); - - return new AccelInstance(accel,builder,intersectors); - } - -#if defined(EMBREE_TARGET_SIMD8) - Accel* BVH4Factory::BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Curve8i::type,scene); - Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8i(),ivariant); - - Builder* builder = nullptr; - if (scene->device->hair_builder == "default" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); - else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); - - return new AccelInstance(accel,builder,intersectors); - } -#endif - - Accel* BVH4Factory::BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Curve4v::type,scene); - Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4v(),ivariant); - - Builder* builder = nullptr; - if (scene->device->hair_builder == "default" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); - else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Curve4iMB::type,scene); - Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector4iMB(),ivariant); - - Builder* builder = nullptr; - if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); - else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); - - return new AccelInstance(accel,builder,intersectors); - } - -#if defined(EMBREE_TARGET_SIMD8) - Accel* BVH4Factory::BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Curve8iMB::type,scene); - Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(), ivariant); - - Builder* builder = nullptr; - if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); - else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); - - return new AccelInstance(accel,builder,intersectors); - } -#endif - - Accel* BVH4Factory::BVH4Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Triangle4::type,scene); - - Accel::Intersectors intersectors; - if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4Intersectors(accel,ivariant); - else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4Intersectors(accel,IntersectVariant::FAST); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4"); - - Builder* builder = nullptr; - if (scene->device->tri_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; - } - } - else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); - else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); - else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); - else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); - else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Triangle4v::type,scene); - - Accel::Intersectors intersectors; - if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4vIntersectors(accel,ivariant); - else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::FAST); - else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::ROBUST); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4"); - - Builder* builder = nullptr; - if (scene->device->tri_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; - } - } - else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); - else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); - else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); - else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); - else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,true); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Triangle4i::type,scene); - - Accel::Intersectors intersectors; - if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4iIntersectors(accel,ivariant); - else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::FAST); - else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::ROBUST); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4"); - - Builder* builder = nullptr; - if (scene->device->tri_builder == "default" ) { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); break; - } - } - else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); - else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); - else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); - else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); - else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,true); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Triangle4i::type,scene); - - Accel::Intersectors intersectors; - if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4iMBIntersectors(accel,ivariant); - else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::FAST); - else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::ROBUST); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4"); - - Builder* builder = nullptr; - if (scene->device->tri_builder_mb == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Triangle4vMB::type,scene); - - Accel::Intersectors intersectors; - if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4vMBIntersectors(accel,ivariant); - else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::FAST); - else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::ROBUST); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4"); - - Builder* builder = nullptr; - if (scene->device->tri_builder_mb == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Quad4v::type,scene); - Accel::Intersectors intersectors = BVH4Quad4vIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->quad_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; - } - } - else if (scene->device->quad_builder == "sah" ) builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); - else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); - else if (scene->device->quad_builder == "dynamic" ) builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Quad4i::type,scene); - Accel::Intersectors intersectors = BVH4Quad4iIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->quad_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement - case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement - } - } - else if (scene->device->quad_builder == "sah") builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(Quad4i::type,scene); - Accel::Intersectors intersectors = BVH4Quad4iMBIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->quad_builder_mb == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else if (scene->device->quad_builder_mb == "sah") builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4QuantizedQuad4i(Scene* scene) - { - BVH4* accel = new BVH4(Quad4i::type,scene); - Builder* builder = BVH4QuantizedQuad4iSceneBuilderSAH(accel,scene,0); - Accel::Intersectors intersectors = QBVH4Quad4iIntersectors(accel); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4QuantizedTriangle4i(Scene* scene) - { - BVH4* accel = new BVH4(Triangle4i::type,scene); - Builder* builder = BVH4QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); - Accel::Intersectors intersectors = QBVH4Triangle4iIntersectors(accel); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4SubdivPatch1(Scene* scene) - { - BVH4* accel = new BVH4(SubdivPatch1::type,scene); - Accel::Intersectors intersectors = BVH4SubdivPatch1Intersectors(accel); - Builder* builder = BVH4SubdivPatch1BuilderSAH(accel,scene,0); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4SubdivPatch1MB(Scene* scene) - { - BVH4* accel = new BVH4(SubdivPatch1::type,scene); - Accel::Intersectors intersectors = BVH4SubdivPatch1MBIntersectors(accel); - Builder* builder = BVH4SubdivPatch1MBBuilderSAH(accel,scene,0); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4UserGeometry(Scene* scene, BuildVariant bvariant) - { - BVH4* accel = new BVH4(Object::type,scene); - Accel::Intersectors intersectors = BVH4UserGeometryIntersectors(accel); - - Builder* builder = nullptr; - if (scene->device->object_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else if (scene->device->object_builder == "sah") builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); - else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4UserGeometryMB(Scene* scene) - { - BVH4* accel = new BVH4(Object::type,scene); - Accel::Intersectors intersectors = BVH4UserGeometryMBIntersectors(accel); - Builder* builder = BVH4VirtualMBSceneBuilderSAH(accel,scene,0); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) - { - BVH4* accel = new BVH4(InstancePrimitive::type,scene); - Accel::Intersectors intersectors = BVH4InstanceIntersectors(accel); - auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; - // Builder* builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); - - Builder* builder = nullptr; - if (scene->device->object_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); break; - case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else if (scene->device->object_builder == "sah") builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); - else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4InstanceMB(Scene* scene, bool isExpensive) - { - BVH4* accel = new BVH4(InstancePrimitive::type,scene); - Accel::Intersectors intersectors = BVH4InstanceMBIntersectors(accel); - auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; - Builder* builder = BVH4InstanceMBSceneBuilderSAH(accel,scene,gtype); - return new AccelInstance(accel,builder,intersectors); - } - - Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - if (ivariant == IntersectVariant::FAST) - { - intersectors.intersector1 = BVH4GridIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4GridIntersector4HybridMoeller(); - intersectors.intersector8 = BVH4GridIntersector8HybridMoeller(); - intersectors.intersector16 = BVH4GridIntersector16HybridMoeller(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - } - else /* if (ivariant == IntersectVariant::ROBUST) */ - { - intersectors.intersector1 = BVH4GridIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4GridIntersector4HybridPluecker(); - intersectors.intersector8 = BVH4GridIntersector8HybridPluecker(); - intersectors.intersector16 = BVH4GridIntersector16HybridPluecker(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - } - return intersectors; - } - - Accel::Intersectors BVH4Factory::BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH4GridMBIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH4GridMBIntersector4HybridMoeller(); - intersectors.intersector8 = BVH4GridMBIntersector8HybridMoeller(); - intersectors.intersector16 = BVH4GridMBIntersector16HybridMoeller(); - intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - - Accel* BVH4Factory::BVH4Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(SubGridQBVH4::type,scene); - Accel::Intersectors intersectors = BVH4GridIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->object_builder == "default") { - builder = BVH4GridSceneBuilderSAH(accel,scene,0); - } - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH4Factory::BVH4GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH4* accel = new BVH4(SubGridQBVH4::type,scene); - Accel::Intersectors intersectors = BVH4GridMBIntersectors(accel,ivariant); - Builder* builder = nullptr; - if (scene->device->object_builder == "default") { - builder = BVH4GridMBSceneBuilderSAH(accel,scene,0); - } - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4MB"); - return new AccelInstance(accel,builder,intersectors); - } - -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h deleted file mode 100644 index a68227b41f..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h +++ /dev/null @@ -1,316 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh_factory.h" - -namespace embree -{ - /*! BVH4 instantiations */ - class BVH4Factory : public BVHFactory - { - public: - BVH4Factory(int bfeatures, int ifeatures); - - public: - Accel* BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant); - Accel* BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant); - Accel* BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant); - Accel* BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant); - Accel* BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); - DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4i); - DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8i); - DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4v); - DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); - DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4iMB); - DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); - - Accel* BVH4Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH4Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::ROBUST); - Accel* BVH4Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - - Accel* BVH4Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH4Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH4Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - - Accel* BVH4QuantizedTriangle4i(Scene* scene); - Accel* BVH4QuantizedQuad4i(Scene* scene); - - Accel* BVH4SubdivPatch1(Scene* scene); - Accel* BVH4SubdivPatch1MB(Scene* scene); - - Accel* BVH4UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); - Accel* BVH4UserGeometryMB(Scene* scene); - - Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); - Accel* BVH4InstanceMB(Scene* scene, bool isExpensive); - - Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - - private: - void selectBuilders(int features); - void selectIntersectors(int features); - - private: - Accel::Intersectors BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); - Accel::Intersectors BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); - - Accel::Intersectors BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant); - - Accel::Intersectors BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); - - Accel::Intersectors QBVH4Quad4iIntersectors(BVH4* bvh); - Accel::Intersectors QBVH4Triangle4iIntersectors(BVH4* bvh); - - Accel::Intersectors BVH4UserGeometryIntersectors(BVH4* bvh); - Accel::Intersectors BVH4UserGeometryMBIntersectors(BVH4* bvh); - - Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh); - Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh); - - Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh); - Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh); - - Accel::Intersectors BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant); - - private: - - DEFINE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); - DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); - - // ============== - - DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); - - // ============== - - DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); - - // ============== - - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback); - - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller); - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter); - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller); - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker); - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker); - - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller); - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter); - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller); - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker); - DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker); - - DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); - - DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); - - // SAH scene builders - private: - DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); - - DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); - - DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); - DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); - - DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - // spatial scene builder - private: - DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - - // twolevel scene builders - private: - DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); - }; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp deleted file mode 100644 index 9fe057c392..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp +++ /dev/null @@ -1,1165 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "../common/isa.h" // to define EMBREE_TARGET_SIMD8 - -#if defined (EMBREE_TARGET_SIMD8) - -#include "bvh8_factory.h" -#include "../bvh/bvh.h" - -#include "../geometry/curveNv.h" -#include "../geometry/curveNi.h" -#include "../geometry/curveNi_mb.h" -#include "../geometry/linei.h" -#include "../geometry/triangle.h" -#include "../geometry/trianglev.h" -#include "../geometry/trianglev_mb.h" -#include "../geometry/trianglei.h" -#include "../geometry/quadv.h" -#include "../geometry/quadi.h" -#include "../geometry/subdivpatch1.h" -#include "../geometry/object.h" -#include "../geometry/instance.h" -#include "../geometry/subgrid.h" -#include "../common/accelinstance.h" - -namespace embree -{ - DECLARE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); - - DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); - DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); - DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); - - DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); - DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk); - - DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk); - - DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk); - - DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller); - DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker); - - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback); - - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker); - - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker); - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker); - - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream); - - DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream); - - DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); - DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); - - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); - DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); - - BVH8Factory::BVH8Factory(int bfeatures, int ifeatures) - { - SELECT_SYMBOL_INIT_AVX(ifeatures,BVH8ColliderUserGeom); - - selectBuilders(bfeatures); - selectIntersectors(ifeatures); - } - - void BVH8Factory::selectBuilders(int features) - { - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8Curve8vBuilder_OBB_New)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8OBBCurve8iMBBuilder_OBB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iSceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iMBSceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vMBSceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4iSceneBuilderSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4SceneBuilderSAH)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderSAH)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iSceneBuilderSAH)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iMBSceneBuilderSAH)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedQuad4iSceneBuilderSAH)); - - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualSceneBuilderSAH)); - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualMBSceneBuilderSAH)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderFastSpatialSAH)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderFastSpatialSAH)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderFastSpatialSAH)); - - IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4MeshSAH)); - IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4vMeshSAH)); - IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4iMeshSAH)); - IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelQuadMeshSAH)); - IF_ENABLED_USER (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelVirtualSAH)); - IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelInstanceSAH)); - } - - void BVH8Factory::selectIntersectors(int features) - { - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB)); - - /* select intersectors1 */ - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1MB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1MB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector1Moeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Moeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Pluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Pluecker)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Woop)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Moeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Moeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Pluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Pluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Moeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Moeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Pluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Pluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Moeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Pluecker)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4iIntersector1Pluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4Intersector1Moeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Quad4iIntersector1Pluecker)); - - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector1)); - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector1)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector1)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector1)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Moeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridMBIntersector1Moeller)) - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Pluecker)); - -#if defined (EMBREE_RAY_PACKETS) - - /* select intersectors4 */ - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4HybridMB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4HybridMB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoellerNoFilter)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector4HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridPluecker)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoellerNoFilter)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridPluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridPluecker)); - - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector4Chunk)); - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector4Chunk)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector4Chunk)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector4Chunk)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridMoeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridPluecker)); - - /* select intersectors8 */ - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8HybridMB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8HybridMB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoellerNoFilter)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector8HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridPluecker)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoellerNoFilter)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridPluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridPluecker)); - - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector8Chunk)); - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector8Chunk)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector8Chunk)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector8Chunk)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridMoeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridPluecker)); - - /* select intersectors16 */ - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16HybridMB)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16Hybrid)); - IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16HybridMB)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoellerNoFilter)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector16HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridPluecker)); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoellerNoFilter)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridPluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridPluecker)); - - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector16Chunk)); - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector16Chunk)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector16Chunk)); - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector16Chunk)); - - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridMoeller)); - IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridPluecker)); - - /* select stream intersectors */ - - SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8IntersectorStreamPacketFallback); - - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoellerNoFilter)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamMoeller)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersectorStreamPluecker)); - IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamPluecker)); - - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoellerNoFilter)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamMoeller)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamPluecker)); - IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamPluecker)); - - IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersectorStream)); - - IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersectorStream)); - -#endif - } - - Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.leafIntersector = leafIntersector; - intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4Hybrid(); - intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8Hybrid(); - intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.leafIntersector = leafIntersector; - intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4Hybrid(); - intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8Hybrid(); - intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - default: assert(false); - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.leafIntersector = leafIntersector; - intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1MB(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4HybridMB(); - intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8HybridMB(); - intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.leafIntersector = leafIntersector; - intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1MB(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4HybridMB(); - intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8HybridMB(); - intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - default: assert(false); - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH8Factory::BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant) - { - assert(ivariant == IntersectVariant::FAST); - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Triangle4Intersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4_filter = BVH8Triangle4Intersector4HybridMoeller(); - intersectors.intersector4_nofilter = BVH8Triangle4Intersector4HybridMoellerNoFilter(); - intersectors.intersector8_filter = BVH8Triangle4Intersector8HybridMoeller(); - intersectors.intersector8_nofilter = BVH8Triangle4Intersector8HybridMoellerNoFilter(); - intersectors.intersector16_filter = BVH8Triangle4Intersector16HybridMoeller(); - intersectors.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter(); - intersectors.intersectorN_filter = BVH8Triangle4IntersectorStreamMoeller(); - intersectors.intersectorN_nofilter = BVH8Triangle4IntersectorStreamMoellerNoFilter(); -#endif - return intersectors; - } - - Accel::Intersectors BVH8Factory::BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; -#define ENABLE_WOOP_TEST 0 -#if ENABLE_WOOP_TEST == 0 - //assert(ivariant == IntersectVariant::ROBUST); - intersectors.intersector1 = BVH8Triangle4vIntersector1Pluecker(); -#else - intersectors.intersector1 = BVH8Triangle4vIntersector1Woop(); -#endif - -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Triangle4vIntersector4HybridPluecker(); - intersectors.intersector8 = BVH8Triangle4vIntersector8HybridPluecker(); - intersectors.intersector16 = BVH8Triangle4vIntersector16HybridPluecker(); - intersectors.intersectorN = BVH8Triangle4vIntersectorStreamPluecker(); -#endif - return intersectors; - } - - Accel::Intersectors BVH8Factory::BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Triangle4iIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Triangle4iIntersector4HybridMoeller(); - intersectors.intersector8 = BVH8Triangle4iIntersector8HybridMoeller(); - intersectors.intersector16 = BVH8Triangle4iIntersector16HybridMoeller(); - intersectors.intersectorN = BVH8Triangle4iIntersectorStreamMoeller(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Triangle4iIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Triangle4iIntersector4HybridPluecker(); - intersectors.intersector8 = BVH8Triangle4iIntersector8HybridPluecker(); - intersectors.intersector16 = BVH8Triangle4iIntersector16HybridPluecker(); - intersectors.intersectorN = BVH8Triangle4iIntersectorStreamPluecker(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH8Factory::BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Triangle4vMBIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridMoeller(); - intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridMoeller(); - intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Triangle4vMBIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridPluecker(); - intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridPluecker(); - intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH8Factory::BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Triangle4iMBIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridMoeller(); - intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridMoeller(); - intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Triangle4iMBIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridPluecker(); - intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridPluecker(); - intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH8Factory::BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Quad4vIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4_filter = BVH8Quad4vIntersector4HybridMoeller(); - intersectors.intersector4_nofilter = BVH8Quad4vIntersector4HybridMoellerNoFilter(); - intersectors.intersector8_filter = BVH8Quad4vIntersector8HybridMoeller(); - intersectors.intersector8_nofilter = BVH8Quad4vIntersector8HybridMoellerNoFilter(); - intersectors.intersector16_filter = BVH8Quad4vIntersector16HybridMoeller(); - intersectors.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter(); - intersectors.intersectorN_filter = BVH8Quad4vIntersectorStreamMoeller(); - intersectors.intersectorN_nofilter = BVH8Quad4vIntersectorStreamMoellerNoFilter(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Quad4vIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Quad4vIntersector4HybridPluecker(); - intersectors.intersector8 = BVH8Quad4vIntersector8HybridPluecker(); - intersectors.intersector16 = BVH8Quad4vIntersector16HybridPluecker(); - intersectors.intersectorN = BVH8Quad4vIntersectorStreamPluecker(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH8Factory::BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Quad4iIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Quad4iIntersector4HybridMoeller(); - intersectors.intersector8 = BVH8Quad4iIntersector8HybridMoeller(); - intersectors.intersector16 = BVH8Quad4iIntersector16HybridMoeller(); - intersectors.intersectorN = BVH8Quad4iIntersectorStreamMoeller(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Quad4iIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Quad4iIntersector4HybridPluecker(); - intersectors.intersector8 = BVH8Quad4iIntersector8HybridPluecker(); - intersectors.intersector16 = BVH8Quad4iIntersector16HybridPluecker(); - intersectors.intersectorN = BVH8Quad4iIntersectorStreamPluecker(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH8Factory::BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant) - { - switch (ivariant) { - case IntersectVariant::FAST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Quad4iMBIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridMoeller(); - intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridMoeller(); - intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - case IntersectVariant::ROBUST: - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8Quad4iMBIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridPluecker(); - intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridPluecker(); - intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - } - return Accel::Intersectors(); - } - - Accel::Intersectors BVH8Factory::QBVH8Triangle4iIntersectors(BVH8* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = QBVH8Triangle4iIntersector1Pluecker(); - return intersectors; - } - - Accel::Intersectors BVH8Factory::QBVH8Triangle4Intersectors(BVH8* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = QBVH8Triangle4Intersector1Moeller(); - return intersectors; - } - - Accel::Intersectors BVH8Factory::QBVH8Quad4iIntersectors(BVH8* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = QBVH8Quad4iIntersector1Pluecker(); - return intersectors; - } - - Accel::Intersectors BVH8Factory::BVH8UserGeometryIntersectors(BVH8* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8VirtualIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8VirtualIntersector4Chunk(); - intersectors.intersector8 = BVH8VirtualIntersector8Chunk(); - intersectors.intersector16 = BVH8VirtualIntersector16Chunk(); - intersectors.intersectorN = BVH8VirtualIntersectorStream(); -#endif - intersectors.collider = BVH8ColliderUserGeom(); - return intersectors; - } - - Accel::Intersectors BVH8Factory::BVH8UserGeometryMBIntersectors(BVH8* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8VirtualMBIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8VirtualMBIntersector4Chunk(); - intersectors.intersector8 = BVH8VirtualMBIntersector8Chunk(); - intersectors.intersector16 = BVH8VirtualMBIntersector16Chunk(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - - Accel::Intersectors BVH8Factory::BVH8InstanceIntersectors(BVH8* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8InstanceIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8InstanceIntersector4Chunk(); - intersectors.intersector8 = BVH8InstanceIntersector8Chunk(); - intersectors.intersector16 = BVH8InstanceIntersector16Chunk(); - intersectors.intersectorN = BVH8InstanceIntersectorStream(); -#endif - return intersectors; - } - - Accel::Intersectors BVH8Factory::BVH8InstanceMBIntersectors(BVH8* bvh) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8InstanceMBIntersector1(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8InstanceMBIntersector4Chunk(); - intersectors.intersector8 = BVH8InstanceMBIntersector8Chunk(); - intersectors.intersector16 = BVH8InstanceMBIntersector16Chunk(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - return intersectors; - } - - Accel* BVH8Factory::BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Curve8v::type,scene); - Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8v(),ivariant); - Builder* builder = BVH8Curve8vBuilder_OBB_New(accel,scene,0); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Curve8iMB::type,scene); - Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(),ivariant); - Builder* builder = BVH8OBBCurve8iMBBuilder_OBB(accel,scene,0); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Triangle4::type,scene); - Accel::Intersectors intersectors= BVH8Triangle4Intersectors(accel,ivariant); - Builder* builder = nullptr; - if (scene->device->tri_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; - } - } - else if (scene->device->tri_builder == "sah" ) builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); - else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); - else if (scene->device->tri_builder == "sah_presplit") builder = BVH8Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); - else if (scene->device->tri_builder == "dynamic" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); - else if (scene->device->tri_builder == "morton" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Triangle4v::type,scene); - Accel::Intersectors intersectors= BVH8Triangle4vIntersectors(accel,ivariant); - Builder* builder = nullptr; - if (scene->device->tri_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8Triangle4vSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; - } - } - else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8"); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Triangle4i::type,scene); - Accel::Intersectors intersectors = BVH8Triangle4iIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->tri_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8Triangle4iSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement - } - } - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Triangle4i::type,scene); - Accel::Intersectors intersectors = BVH8Triangle4iMBIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->tri_builder_mb == "default") { // FIXME: implement - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Triangle4vMB::type,scene); - Accel::Intersectors intersectors= BVH8Triangle4vMBIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->tri_builder_mb == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8QuantizedTriangle4i(Scene* scene) - { - BVH8* accel = new BVH8(Triangle4i::type,scene); - Accel::Intersectors intersectors = QBVH8Triangle4iIntersectors(accel); - Builder* builder = BVH8QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8QuantizedTriangle4(Scene* scene) - { - BVH8* accel = new BVH8(Triangle4::type,scene); - Accel::Intersectors intersectors = QBVH8Triangle4Intersectors(accel); - Builder* builder = BVH8QuantizedTriangle4SceneBuilderSAH(accel,scene,0); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Quad4v::type,scene); - Accel::Intersectors intersectors = BVH8Quad4vIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->quad_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8Quad4vSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; - } - } - else if (scene->device->quad_builder == "dynamic" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); - else if (scene->device->quad_builder == "morton" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,true); - else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Quad4i::type,scene); - Accel::Intersectors intersectors = BVH8Quad4iIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->quad_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8Quad4iSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement - case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement - } - } - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(Quad4i::type,scene); - Accel::Intersectors intersectors = BVH8Quad4iMBIntersectors(accel,ivariant); - - Builder* builder = nullptr; - if (scene->device->quad_builder_mb == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8Quad4iMBSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH8"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8QuantizedQuad4i(Scene* scene) - { - BVH8* accel = new BVH8(Quad4i::type,scene); - Accel::Intersectors intersectors = QBVH8Quad4iIntersectors(accel); - Builder* builder = nullptr; - if (scene->device->quad_builder == "default" ) builder = BVH8QuantizedQuad4iSceneBuilderSAH(accel,scene,0); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for QBVH8"); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8UserGeometry(Scene* scene, BuildVariant bvariant) - { - BVH8* accel = new BVH8(Object::type,scene); - Accel::Intersectors intersectors = BVH8UserGeometryIntersectors(accel); - - Builder* builder = nullptr; - if (scene->device->object_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); break; - case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); break; - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else if (scene->device->object_builder == "sah") builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); - else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8UserGeometryMB(Scene* scene) - { - BVH8* accel = new BVH8(Object::type,scene); - Accel::Intersectors intersectors = BVH8UserGeometryMBIntersectors(accel); - Builder* builder = BVH8VirtualMBSceneBuilderSAH(accel,scene,0); - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) - { - BVH8* accel = new BVH8(InstancePrimitive::type,scene); - Accel::Intersectors intersectors = BVH8InstanceIntersectors(accel); - auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; - // Builder* builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); - - Builder* builder = nullptr; - if (scene->device->object_builder == "default") { - switch (bvariant) { - case BuildVariant::STATIC : builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);; break; - case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; - case BuildVariant::HIGH_QUALITY: assert(false); break; - } - } - else if (scene->device->object_builder == "sah") builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); - else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive) - { - BVH8* accel = new BVH8(InstancePrimitive::type,scene); - Accel::Intersectors intersectors = BVH8InstanceMBIntersectors(accel); - auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; - Builder* builder = BVH8InstanceMBSceneBuilderSAH(accel,scene,gtype); - return new AccelInstance(accel,builder,intersectors); - } - - Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - if (ivariant == IntersectVariant::FAST) - { - intersectors.intersector1 = BVH8GridIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8GridIntersector4HybridMoeller(); - intersectors.intersector8 = BVH8GridIntersector8HybridMoeller(); - intersectors.intersector16 = BVH8GridIntersector16HybridMoeller(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - } - else /* if (ivariant == IntersectVariant::ROBUST) */ - { - intersectors.intersector1 = BVH8GridIntersector1Pluecker(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = BVH8GridIntersector4HybridPluecker(); - intersectors.intersector8 = BVH8GridIntersector8HybridPluecker(); - intersectors.intersector16 = BVH8GridIntersector16HybridPluecker(); - intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); -#endif - } - return intersectors; - } - - Accel::Intersectors BVH8Factory::BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant) - { - Accel::Intersectors intersectors; - intersectors.ptr = bvh; - intersectors.intersector1 = BVH8GridMBIntersector1Moeller(); -#if defined (EMBREE_RAY_PACKETS) - intersectors.intersector4 = nullptr; - intersectors.intersector8 = nullptr; - intersectors.intersector16 = nullptr; - intersectors.intersectorN = nullptr; -#endif - return intersectors; - } - - Accel* BVH8Factory::BVH8Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(SubGridQBVH8::type,scene); - Accel::Intersectors intersectors = BVH8GridIntersectors(accel,ivariant); - Builder* builder = nullptr; - if (scene->device->grid_builder == "default") { - builder = BVH8GridSceneBuilderSAH(accel,scene,0); - } - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4"); - - return new AccelInstance(accel,builder,intersectors); - } - - Accel* BVH8Factory::BVH8GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) - { - BVH8* accel = new BVH8(SubGridQBVH8::type,scene); - Accel::Intersectors intersectors = BVH8GridMBIntersectors(accel,ivariant); - Builder* builder = nullptr; - if (scene->device->grid_builder_mb == "default") { - builder = BVH8GridMBSceneBuilderSAH(accel,scene,0); - } - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8MB"); - return new AccelInstance(accel,builder,intersectors); - } -} - -#endif diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h deleted file mode 100644 index b92188e7d3..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh_factory.h" - -namespace embree -{ - /*! BVH8 instantiations */ - class BVH8Factory : public BVHFactory - { - public: - BVH8Factory(int bfeatures, int ifeatures); - - public: - Accel* BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant); - Accel* BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); - DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); - DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); - - Accel* BVH8Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH8Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH8Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - - Accel* BVH8Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH8Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH8Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - - Accel* BVH8QuantizedTriangle4i(Scene* scene); - Accel* BVH8QuantizedTriangle4(Scene* scene); - Accel* BVH8QuantizedQuad4i(Scene* scene); - - Accel* BVH8UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); - Accel* BVH8UserGeometryMB(Scene* scene); - - Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); - Accel* BVH8InstanceMB(Scene* scene, bool isExpensive); - - Accel* BVH8Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); - - private: - void selectBuilders(int features); - void selectIntersectors(int features); - - private: - Accel::Intersectors BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); - Accel::Intersectors BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); - - Accel::Intersectors BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant); - - Accel::Intersectors BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); - - Accel::Intersectors QBVH8Triangle4iIntersectors(BVH8* bvh); - Accel::Intersectors QBVH8Triangle4Intersectors(BVH8* bvh); - Accel::Intersectors QBVH8Quad4iIntersectors(BVH8* bvh); - - Accel::Intersectors BVH8UserGeometryIntersectors(BVH8* bvh); - Accel::Intersectors BVH8UserGeometryMBIntersectors(BVH8* bvh); - - Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh); - Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh); - - Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant); - Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant); - - private: - DEFINE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); - DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); - - DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); - DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk); - - DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk); - - DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk); - - DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller); - DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker); - - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback); - - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller); - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter); - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller); - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker); - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker); - - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller); - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter); - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller); - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker); - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker); - - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream); - - DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream); - - // SAH scene builders - private: - DEFINE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); - - DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); - DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); - - DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); - - // SAH spatial scene builders - private: - DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); - - // twolevel scene builders - private: - DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); - DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); - }; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp deleted file mode 100644 index e832537ec5..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh_builder.h" - -namespace embree -{ - namespace isa - { - template - typename BVHN::NodeRef BVHNBuilderVirtual::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) - { - auto createLeafFunc = [&] (const PrimRef* prims, const range& set, const Allocator& alloc) -> NodeRef { - return createLeaf(prims,set,alloc); - }; - - settings.branchingFactor = N; - settings.maxDepth = BVH::maxBuildDepthLeaf; - return BVHBuilderBinnedSAH::build - (FastAllocator::Create(allocator),typename BVH::AABBNode::Create2(),typename BVH::AABBNode::Set3(allocator,prims),createLeafFunc,progressFunc,prims,pinfo,settings); - } - - - template - typename BVHN::NodeRef BVHNBuilderQuantizedVirtual::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) - { - auto createLeafFunc = [&] (const PrimRef* prims, const range& set, const Allocator& alloc) -> NodeRef { - return createLeaf(prims,set,alloc); - }; - - settings.branchingFactor = N; - settings.maxDepth = BVH::maxBuildDepthLeaf; - return BVHBuilderBinnedSAH::build - (FastAllocator::Create(allocator),typename BVH::QuantizedNode::Create2(),typename BVH::QuantizedNode::Set2(),createLeafFunc,progressFunc,prims,pinfo,settings); - } - - template - typename BVHN::NodeRecordMB BVHNBuilderMblurVirtual::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) - { - auto createLeafFunc = [&] (const PrimRef* prims, const range& set, const Allocator& alloc) -> NodeRecordMB { - return createLeaf(prims,set,alloc); - }; - - settings.branchingFactor = N; - settings.maxDepth = BVH::maxBuildDepthLeaf; - return BVHBuilderBinnedSAH::build - (FastAllocator::Create(allocator),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::SetTimeRange(timeRange),createLeafFunc,progressFunc,prims,pinfo,settings); - } - - template struct BVHNBuilderVirtual<4>; - template struct BVHNBuilderQuantizedVirtual<4>; - template struct BVHNBuilderMblurVirtual<4>; - -#if defined(__AVX__) - template struct BVHNBuilderVirtual<8>; - template struct BVHNBuilderQuantizedVirtual<8>; - template struct BVHNBuilderMblurVirtual<8>; -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h deleted file mode 100644 index 1b86bb45ad..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh.h" -#include "../builders/bvh_builder_sah.h" - -namespace embree -{ - namespace isa - { - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - - template - struct BVHNBuilderVirtual - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef FastAllocator::CachedAllocator Allocator; - - struct BVHNBuilderV { - NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); - virtual NodeRef createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) = 0; - }; - - template - struct BVHNBuilderT : public BVHNBuilderV - { - BVHNBuilderT (CreateLeafFunc createLeafFunc) - : createLeafFunc(createLeafFunc) {} - - NodeRef createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) { - return createLeafFunc(prims,set,alloc); - } - - private: - CreateLeafFunc createLeafFunc; - }; - - template - static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) { - return BVHNBuilderT(createLeaf).build(allocator,progress,prims,pinfo,settings); - } - }; - - template - struct BVHNBuilderQuantizedVirtual - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef FastAllocator::CachedAllocator Allocator; - - struct BVHNBuilderV { - NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); - virtual NodeRef createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) = 0; - }; - - template - struct BVHNBuilderT : public BVHNBuilderV - { - BVHNBuilderT (CreateLeafFunc createLeafFunc) - : createLeafFunc(createLeafFunc) {} - - NodeRef createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) { - return createLeafFunc(prims,set,alloc); - } - - private: - CreateLeafFunc createLeafFunc; - }; - - template - static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) { - return BVHNBuilderT(createLeaf).build(allocator,progress,prims,pinfo,settings); - } - }; - - template - struct BVHNBuilderMblurVirtual - { - typedef BVHN BVH; - typedef typename BVH::AABBNodeMB AABBNodeMB; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecordMB NodeRecordMB; - typedef FastAllocator::CachedAllocator Allocator; - - struct BVHNBuilderV { - NodeRecordMB build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange); - virtual NodeRecordMB createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) = 0; - }; - - template - struct BVHNBuilderT : public BVHNBuilderV - { - BVHNBuilderT (CreateLeafFunc createLeafFunc) - : createLeafFunc(createLeafFunc) {} - - NodeRecordMB createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) { - return createLeafFunc(prims,set,alloc); - } - - private: - CreateLeafFunc createLeafFunc; - }; - - template - static NodeRecordMB build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) { - return BVHNBuilderT(createLeaf).build(allocator,progress,prims,pinfo,settings,timeRange); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp deleted file mode 100644 index 64759c1294..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp +++ /dev/null @@ -1,531 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh.h" -#include "bvh_statistics.h" -#include "bvh_rotate.h" -#include "../common/profile.h" -#include "../../common/algorithms/parallel_prefix_sum.h" - -#include "../builders/primrefgen.h" -#include "../builders/bvh_builder_morton.h" - -#include "../geometry/triangle.h" -#include "../geometry/trianglev.h" -#include "../geometry/trianglei.h" -#include "../geometry/quadv.h" -#include "../geometry/quadi.h" -#include "../geometry/object.h" -#include "../geometry/instance.h" - -#if defined(__X86_64__) || defined(__aarch64__) -# define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform -#else -# define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues -#endif - -namespace embree -{ - namespace isa - { - template - struct SetBVHNBounds - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecord NodeRecord; - typedef typename BVH::AABBNode AABBNode; - - BVH* bvh; - __forceinline SetBVHNBounds (BVH* bvh) : bvh(bvh) {} - - __forceinline NodeRecord operator() (NodeRef ref, const NodeRecord* children, size_t num) - { - AABBNode* node = ref.getAABBNode(); - - BBox3fa res = empty; - for (size_t i=0; isetRef(i,children[i].ref); - node->setBounds(i,b); - } - - BBox3fx result = (BBox3fx&)res; -#if ROTATE_TREE - if (N == 4) - { - size_t n = 0; - for (size_t i=0; i= 4096) { - for (size_t i=0; i::rotate(node->child(i)); - node->child(i).setBarrier(); - } - } - } - result.lower.a = unsigned(n); - } -#endif - - return NodeRecord(ref,result); - } - }; - - template - struct CreateMortonLeaf; - - template - struct CreateMortonLeaf - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecord NodeRecord; - - __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) - : mesh(mesh), morton(morton), geomID_(geomID) {} - - __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) - { - vfloat4 lower(pos_inf); - vfloat4 upper(neg_inf); - size_t items = current.size(); - size_t start = current.begin(); - assert(items<=4); - - /* allocate leaf node */ - Triangle4* accel = (Triangle4*) alloc.malloc1(sizeof(Triangle4),BVH::byteAlignment); - NodeRef ref = BVH::encodeLeaf((char*)accel,1); - vuint4 vgeomID = -1, vprimID = -1; - Vec3vf4 v0 = zero, v1 = zero, v2 = zero; - const TriangleMesh* __restrict__ const mesh = this->mesh; - - for (size_t i=0; itriangle(primID); - const Vec3fa& p0 = mesh->vertex(tri.v[0]); - const Vec3fa& p1 = mesh->vertex(tri.v[1]); - const Vec3fa& p2 = mesh->vertex(tri.v[2]); - lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); - upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); - vgeomID [i] = geomID_; - vprimID [i] = primID; - v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; - v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; - v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; - } - - Triangle4::store_nt(accel,Triangle4(v0,v1,v2,vgeomID,vprimID)); - BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); -#if ROTATE_TREE - if (N == 4) - box_o.lower.a = unsigned(current.size()); -#endif - return NodeRecord(ref,box_o); - } - - private: - TriangleMesh* mesh; - BVHBuilderMorton::BuildPrim* morton; - unsigned int geomID_ = std::numeric_limits::max(); - }; - - template - struct CreateMortonLeaf - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecord NodeRecord; - - __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) - : mesh(mesh), morton(morton), geomID_(geomID) {} - - __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) - { - vfloat4 lower(pos_inf); - vfloat4 upper(neg_inf); - size_t items = current.size(); - size_t start = current.begin(); - assert(items<=4); - - /* allocate leaf node */ - Triangle4v* accel = (Triangle4v*) alloc.malloc1(sizeof(Triangle4v),BVH::byteAlignment); - NodeRef ref = BVH::encodeLeaf((char*)accel,1); - vuint4 vgeomID = -1, vprimID = -1; - Vec3vf4 v0 = zero, v1 = zero, v2 = zero; - const TriangleMesh* __restrict__ mesh = this->mesh; - - for (size_t i=0; itriangle(primID); - const Vec3fa& p0 = mesh->vertex(tri.v[0]); - const Vec3fa& p1 = mesh->vertex(tri.v[1]); - const Vec3fa& p2 = mesh->vertex(tri.v[2]); - lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); - upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); - vgeomID [i] = geomID_; - vprimID [i] = primID; - v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; - v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; - v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; - } - Triangle4v::store_nt(accel,Triangle4v(v0,v1,v2,vgeomID,vprimID)); - BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); -#if ROTATE_TREE - if (N == 4) - box_o.lower.a = current.size(); -#endif - return NodeRecord(ref,box_o); - } - private: - TriangleMesh* mesh; - BVHBuilderMorton::BuildPrim* morton; - unsigned int geomID_ = std::numeric_limits::max(); - }; - - template - struct CreateMortonLeaf - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecord NodeRecord; - - __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) - : mesh(mesh), morton(morton), geomID_(geomID) {} - - __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) - { - vfloat4 lower(pos_inf); - vfloat4 upper(neg_inf); - size_t items = current.size(); - size_t start = current.begin(); - assert(items<=4); - - /* allocate leaf node */ - Triangle4i* accel = (Triangle4i*) alloc.malloc1(sizeof(Triangle4i),BVH::byteAlignment); - NodeRef ref = BVH::encodeLeaf((char*)accel,1); - - vuint4 v0 = zero, v1 = zero, v2 = zero; - vuint4 vgeomID = -1, vprimID = -1; - const TriangleMesh* __restrict__ const mesh = this->mesh; - - for (size_t i=0; itriangle(primID); - const Vec3fa& p0 = mesh->vertex(tri.v[0]); - const Vec3fa& p1 = mesh->vertex(tri.v[1]); - const Vec3fa& p2 = mesh->vertex(tri.v[2]); - lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); - upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); - vgeomID[i] = geomID_; - vprimID[i] = primID; - unsigned int int_stride = mesh->vertices0.getStride()/4; - v0[i] = tri.v[0] * int_stride; - v1[i] = tri.v[1] * int_stride; - v2[i] = tri.v[2] * int_stride; - } - - for (size_t i=items; i<4; i++) - { - vgeomID[i] = vgeomID[0]; - vprimID[i] = -1; - v0[i] = 0; - v1[i] = 0; - v2[i] = 0; - } - Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID)); - BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); -#if ROTATE_TREE - if (N == 4) - box_o.lower.a = current.size(); -#endif - return NodeRecord(ref,box_o); - } - private: - TriangleMesh* mesh; - BVHBuilderMorton::BuildPrim* morton; - unsigned int geomID_ = std::numeric_limits::max(); - }; - - template - struct CreateMortonLeaf - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecord NodeRecord; - - __forceinline CreateMortonLeaf (QuadMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) - : mesh(mesh), morton(morton), geomID_(geomID) {} - - __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) - { - vfloat4 lower(pos_inf); - vfloat4 upper(neg_inf); - size_t items = current.size(); - size_t start = current.begin(); - assert(items<=4); - - /* allocate leaf node */ - Quad4v* accel = (Quad4v*) alloc.malloc1(sizeof(Quad4v),BVH::byteAlignment); - NodeRef ref = BVH::encodeLeaf((char*)accel,1); - - vuint4 vgeomID = -1, vprimID = -1; - Vec3vf4 v0 = zero, v1 = zero, v2 = zero, v3 = zero; - const QuadMesh* __restrict__ mesh = this->mesh; - - for (size_t i=0; iquad(primID); - const Vec3fa& p0 = mesh->vertex(tri.v[0]); - const Vec3fa& p1 = mesh->vertex(tri.v[1]); - const Vec3fa& p2 = mesh->vertex(tri.v[2]); - const Vec3fa& p3 = mesh->vertex(tri.v[3]); - lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); - upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); - vgeomID [i] = geomID_; - vprimID [i] = primID; - v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; - v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; - v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; - v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; - } - Quad4v::store_nt(accel,Quad4v(v0,v1,v2,v3,vgeomID,vprimID)); - BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); -#if ROTATE_TREE - if (N == 4) - box_o.lower.a = current.size(); -#endif - return NodeRecord(ref,box_o); - } - private: - QuadMesh* mesh; - BVHBuilderMorton::BuildPrim* morton; - unsigned int geomID_ = std::numeric_limits::max(); - }; - - template - struct CreateMortonLeaf - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecord NodeRecord; - - __forceinline CreateMortonLeaf (UserGeometry* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) - : mesh(mesh), morton(morton), geomID_(geomID) {} - - __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) - { - vfloat4 lower(pos_inf); - vfloat4 upper(neg_inf); - size_t items = current.size(); - size_t start = current.begin(); - - /* allocate leaf node */ - Object* accel = (Object*) alloc.malloc1(items*sizeof(Object),BVH::byteAlignment); - NodeRef ref = BVH::encodeLeaf((char*)accel,items); - const UserGeometry* mesh = this->mesh; - - BBox3fa bounds = empty; - for (size_t i=0; ibounds(primID)); - new (&accel[i]) Object(geomID_,primID); - } - - BBox3fx box_o = (BBox3fx&)bounds; -#if ROTATE_TREE - if (N == 4) - box_o.lower.a = current.size(); -#endif - return NodeRecord(ref,box_o); - } - private: - UserGeometry* mesh; - BVHBuilderMorton::BuildPrim* morton; - unsigned int geomID_ = std::numeric_limits::max(); - }; - - template - struct CreateMortonLeaf - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecord NodeRecord; - - __forceinline CreateMortonLeaf (Instance* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) - : mesh(mesh), morton(morton), geomID_(geomID) {} - - __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) - { - vfloat4 lower(pos_inf); - vfloat4 upper(neg_inf); - size_t items = current.size(); - size_t start = current.begin(); - assert(items <= 1); - - /* allocate leaf node */ - InstancePrimitive* accel = (InstancePrimitive*) alloc.malloc1(items*sizeof(InstancePrimitive),BVH::byteAlignment); - NodeRef ref = BVH::encodeLeaf((char*)accel,items); - const Instance* instance = this->mesh; - - BBox3fa bounds = empty; - for (size_t i=0; ibounds(primID)); - new (&accel[i]) InstancePrimitive(instance, geomID_); - } - - BBox3fx box_o = (BBox3fx&)bounds; -#if ROTATE_TREE - if (N == 4) - box_o.lower.a = current.size(); -#endif - return NodeRecord(ref,box_o); - } - private: - Instance* mesh; - BVHBuilderMorton::BuildPrim* morton; - unsigned int geomID_ = std::numeric_limits::max(); - }; - - template - struct CalculateMeshBounds - { - __forceinline CalculateMeshBounds (Mesh* mesh) - : mesh(mesh) {} - - __forceinline const BBox3fa operator() (const BVHBuilderMorton::BuildPrim& morton) { - return mesh->bounds(morton.index); - } - - private: - Mesh* mesh; - }; - - template - class BVHNMeshBuilderMorton : public Builder - { - typedef BVHN BVH; - typedef typename BVH::AABBNode AABBNode; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecord NodeRecord; - - public: - - BVHNMeshBuilderMorton (BVH* bvh, Mesh* mesh, unsigned int geomID, const size_t minLeafSize, const size_t maxLeafSize, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD) - : bvh(bvh), mesh(mesh), morton(bvh->device,0), settings(N,BVH::maxBuildDepth,minLeafSize,min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks),singleThreadThreshold), geomID_(geomID) {} - - /* build function */ - void build() - { - /* we reset the allocator when the mesh size changed */ - if (mesh->numPrimitives != numPreviousPrimitives) { - bvh->alloc.clear(); - morton.clear(); - } - size_t numPrimitives = mesh->size(); - numPreviousPrimitives = numPrimitives; - - /* skip build for empty scene */ - if (numPrimitives == 0) { - bvh->set(BVH::emptyNode,empty,0); - return; - } - - /* preallocate arrays */ - morton.resize(numPrimitives); - size_t bytesEstimated = numPrimitives*sizeof(AABBNode)/(4*N) + size_t(1.2f*Primitive::blocks(numPrimitives)*sizeof(Primitive)); - size_t bytesMortonCodes = numPrimitives*sizeof(BVHBuilderMorton::BuildPrim); - bytesEstimated = max(bytesEstimated,bytesMortonCodes); // the first allocation block is reused to sort the morton codes - bvh->alloc.init(bytesMortonCodes,bytesMortonCodes,bytesEstimated); - - /* create morton code array */ - BVHBuilderMorton::BuildPrim* dest = (BVHBuilderMorton::BuildPrim*) bvh->alloc.specialAlloc(bytesMortonCodes); - size_t numPrimitivesGen = createMortonCodeArray(mesh,morton,bvh->scene->progressInterface); - - /* create BVH */ - SetBVHNBounds setBounds(bvh); - CreateMortonLeaf createLeaf(mesh,geomID_,morton.data()); - CalculateMeshBounds calculateBounds(mesh); - auto root = BVHBuilderMorton::build( - typename BVH::CreateAlloc(bvh), - typename BVH::AABBNode::Create(), - setBounds,createLeaf,calculateBounds,bvh->scene->progressInterface, - morton.data(),dest,numPrimitivesGen,settings); - - bvh->set(root.ref,LBBox3fa(root.bounds),numPrimitives); - -#if ROTATE_TREE - if (N == 4) - { - for (int i=0; i::rotate(bvh->root); - bvh->clearBarrier(bvh->root); - } -#endif - - /* clear temporary data for static geometry */ - if (bvh->scene->isStaticAccel()) { - morton.clear(); - } - bvh->cleanup(); - } - - void clear() { - morton.clear(); - } - - private: - BVH* bvh; - Mesh* mesh; - mvector morton; - BVHBuilderMorton::Settings settings; - unsigned int geomID_ = std::numeric_limits::max(); - unsigned int numPreviousPrimitives = 0; - }; - -#if defined(EMBREE_GEOMETRY_TRIANGLE) - Builder* BVH4Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4> ((BVH4*)bvh,mesh,geomID,4,4); } - Builder* BVH4Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4v>((BVH4*)bvh,mesh,geomID,4,4); } - Builder* BVH4Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4i>((BVH4*)bvh,mesh,geomID,4,4); } -#if defined(__AVX__) - Builder* BVH8Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4> ((BVH8*)bvh,mesh,geomID,4,4); } - Builder* BVH8Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4v>((BVH8*)bvh,mesh,geomID,4,4); } - Builder* BVH8Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4i>((BVH8*)bvh,mesh,geomID,4,4); } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_QUAD) - Builder* BVH4Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,QuadMesh,Quad4v>((BVH4*)bvh,mesh,geomID,4,4); } -#if defined(__AVX__) - Builder* BVH8Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,QuadMesh,Quad4v>((BVH8*)bvh,mesh,geomID,4,4); } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_USER) - Builder* BVH4VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,UserGeometry,Object>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } -#if defined(__AVX__) - Builder* BVH8VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,UserGeometry,Object>((BVH8*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_INSTANCE) - Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } -#if defined(__AVX__) - Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } -#endif -#endif - - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp deleted file mode 100644 index cf5b2eb47f..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp +++ /dev/null @@ -1,640 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh.h" -#include "bvh_builder.h" -#include "../builders/primrefgen.h" -#include "../builders/splitter.h" - -#include "../geometry/linei.h" -#include "../geometry/triangle.h" -#include "../geometry/trianglev.h" -#include "../geometry/trianglev_mb.h" -#include "../geometry/trianglei.h" -#include "../geometry/quadv.h" -#include "../geometry/quadi.h" -#include "../geometry/object.h" -#include "../geometry/instance.h" -#include "../geometry/subgrid.h" - -#include "../common/state.h" -#include "../../common/algorithms/parallel_for_for.h" -#include "../../common/algorithms/parallel_for_for_prefix_sum.h" - -#define PROFILE 0 -#define PROFILE_RUNS 20 - -namespace embree -{ - namespace isa - { - template - struct CreateLeaf - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - - __forceinline CreateLeaf (BVH* bvh) : bvh(bvh) {} - - __forceinline NodeRef operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const - { - size_t n = set.size(); - size_t items = Primitive::blocks(n); - size_t start = set.begin(); - Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); - typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); - for (size_t i=0; iscene); - } - return node; - } - - BVH* bvh; - }; - - - template - struct CreateLeafQuantized - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - - __forceinline CreateLeafQuantized (BVH* bvh) : bvh(bvh) {} - - __forceinline NodeRef operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const - { - size_t n = set.size(); - size_t items = Primitive::blocks(n); - size_t start = set.begin(); - Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); - typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); - for (size_t i=0; iscene); - } - return node; - } - - BVH* bvh; - }; - - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - - template - struct BVHNBuilderSAH : public Builder - { - typedef BVHN BVH; - typedef typename BVHN::NodeRef NodeRef; - - BVH* bvh; - Scene* scene; - Geometry* mesh; - mvector prims; - GeneralBVHBuilder::Settings settings; - Geometry::GTypeMask gtype_; - unsigned int geomID_ = std::numeric_limits::max (); - bool primrefarrayalloc; - unsigned int numPreviousPrimitives = 0; - - BVHNBuilderSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, - const Geometry::GTypeMask gtype, bool primrefarrayalloc = false) - : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), - settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), primrefarrayalloc(primrefarrayalloc) {} - - BVHNBuilderSAH (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) - : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID), primrefarrayalloc(false) {} - - // FIXME: shrink bvh->alloc in destructor here and in other builders too - - void build() - { - /* we reset the allocator when the mesh size changed */ - if (mesh && mesh->numPrimitives != numPreviousPrimitives) { - bvh->alloc.clear(); - } - - /* if we use the primrefarray for allocations we have to take it back from the BVH */ - if (settings.primrefarrayalloc != size_t(inf)) - bvh->alloc.unshare(prims); - - /* skip build for empty scene */ - const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false); - numPreviousPrimitives = numPrimitives; - if (numPrimitives == 0) { - bvh->clear(); - prims.clear(); - return; - } - - double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); - -#if PROFILE - profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { -#endif - - /* create primref array */ - if (primrefarrayalloc) { - settings.primrefarrayalloc = numPrimitives/1000; - if (settings.primrefarrayalloc < 1000) - settings.primrefarrayalloc = inf; - } - - /* enable os_malloc for two level build */ - if (mesh) - bvh->alloc.setOSallocation(true); - - /* initialize allocator */ - const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); - const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); - prims.resize(numPrimitives); - - PrimInfo pinfo = mesh ? - createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) : - createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface); - - /* pinfo might has zero size due to invalid geometry */ - if (unlikely(pinfo.size() == 0)) - { - bvh->clear(); - prims.clear(); - return; - } - - /* call BVH builder */ - NodeRef root = BVHNBuilderVirtual::build(&bvh->alloc,CreateLeaf(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); - bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); - bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); - -#if PROFILE - }); -#endif - - /* if we allocated using the primrefarray we have to keep it alive */ - if (settings.primrefarrayalloc != size_t(inf)) - bvh->alloc.share(prims); - - /* for static geometries we can do some cleanups */ - else if (scene && scene->isStaticAccel()) { - prims.clear(); - } - bvh->cleanup(); - bvh->postBuild(t0); - } - - void clear() { - prims.clear(); - } - }; - - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - - template - struct BVHNBuilderSAHQuantized : public Builder - { - typedef BVHN BVH; - typedef typename BVHN::NodeRef NodeRef; - - BVH* bvh; - Scene* scene; - Geometry* mesh; - mvector prims; - GeneralBVHBuilder::Settings settings; - Geometry::GTypeMask gtype_; - unsigned int geomID_ = std::numeric_limits::max(); - unsigned int numPreviousPrimitives = 0; - - BVHNBuilderSAHQuantized (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) - : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype) {} - - BVHNBuilderSAHQuantized (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) - : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID) {} - - // FIXME: shrink bvh->alloc in destructor here and in other builders too - - void build() - { - /* we reset the allocator when the mesh size changed */ - if (mesh && mesh->numPrimitives != numPreviousPrimitives) { - bvh->alloc.clear(); - } - - /* skip build for empty scene */ - const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false); - numPreviousPrimitives = numPrimitives; - if (numPrimitives == 0) { - prims.clear(); - bvh->clear(); - return; - } - - double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::QBVH" + toString(N) + "BuilderSAH"); - -#if PROFILE - profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { -#endif - /* create primref array */ - prims.resize(numPrimitives); - PrimInfo pinfo = mesh ? - createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) : - createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface); - - /* enable os_malloc for two level build */ - if (mesh) - bvh->alloc.setOSallocation(true); - - /* call BVH builder */ - const size_t node_bytes = numPrimitives*sizeof(typename BVH::QuantizedNode)/(4*N); - const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); - NodeRef root = BVHNBuilderQuantizedVirtual::build(&bvh->alloc,CreateLeafQuantized(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); - bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); - //bvh->layoutLargeNodes(pinfo.size()*0.005f); // FIXME: COPY LAYOUT FOR LARGE NODES !!! -#if PROFILE - }); -#endif - - /* clear temporary data for static geometry */ - if (scene && scene->isStaticAccel()) { - prims.clear(); - } - bvh->cleanup(); - bvh->postBuild(t0); - } - - void clear() { - prims.clear(); - } - }; - - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - - - template - struct CreateLeafGrid - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - - __forceinline CreateLeafGrid (BVH* bvh, const SubGridBuildData * const sgrids) : bvh(bvh),sgrids(sgrids) {} - - __forceinline NodeRef operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const - { - const size_t items = set.size(); //Primitive::blocks(n); - const size_t start = set.begin(); - - /* collect all subsets with unique geomIDs */ - assert(items <= N); - unsigned int geomIDs[N]; - unsigned int num_geomIDs = 1; - geomIDs[0] = prims[start].geomID(); - - for (size_t i=1;i* accel = (SubGridQBVHN*) alloc.malloc1(num_geomIDs*sizeof(SubGridQBVHN),BVH::byteAlignment); - typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,num_geomIDs); - - for (size_t g=0;g(x,y,primID,bounds,geomIDs[g],pos); - } - - return node; - } - - BVH* bvh; - const SubGridBuildData * const sgrids; - }; - - - template - struct BVHNBuilderSAHGrid : public Builder - { - typedef BVHN BVH; - typedef typename BVHN::NodeRef NodeRef; - - BVH* bvh; - Scene* scene; - GridMesh* mesh; - mvector prims; - mvector sgrids; - GeneralBVHBuilder::Settings settings; - unsigned int geomID_ = std::numeric_limits::max(); - unsigned int numPreviousPrimitives = 0; - - BVHNBuilderSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) - : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD) {} - - BVHNBuilderSAHGrid (BVH* bvh, GridMesh* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) - : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), geomID_(geomID) {} - - void build() - { - /* we reset the allocator when the mesh size changed */ - if (mesh && mesh->numPrimitives != numPreviousPrimitives) { - bvh->alloc.clear(); - } - - /* if we use the primrefarray for allocations we have to take it back from the BVH */ - if (settings.primrefarrayalloc != size_t(inf)) - bvh->alloc.unshare(prims); - - const size_t numGridPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(GridMesh::geom_type,false); - numPreviousPrimitives = numGridPrimitives; - - PrimInfo pinfo(empty); - size_t numPrimitives = 0; - - if (!mesh) - { - /* first run to get #primitives */ - - ParallelForForPrefixSumState pstate; - Scene::Iterator iter(scene); - - pstate.init(iter,size_t(1024)); - - /* iterate over all meshes in the scene */ - pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); jvalid(j)) continue; - BBox3fa bounds = empty; - const PrimRef prim(bounds,(unsigned)geomID,(unsigned)j); - if (!mesh->valid(j)) continue; - pinfo.add_center2(prim,mesh->getNumSubGrids(j)); - } - return pinfo; - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - numPrimitives = pinfo.size(); - - /* resize arrays */ - sgrids.resize(numPrimitives); - prims.resize(numPrimitives); - - /* second run to fill primrefs and SubGridBuildData arrays */ - pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { - k = base.size(); - size_t p_index = k; - PrimInfo pinfo(empty); - for (size_t j=r.begin(); jvalid(j)) continue; - const GridMesh::Grid &g = mesh->grid(j); - for (unsigned int y=0; ybuildBounds(g,x,y,bounds)) continue; // get bounds of subgrid - const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index); - pinfo.add_center2(prim); - sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); - prims[p_index++] = prim; - } - } - return pinfo; - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - assert(pinfo.size() == numPrimitives); - } - else - { - ParallelPrefixSumState pstate; - /* iterate over all grids in a single mesh */ - pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); jvalid(j)) continue; - BBox3fa bounds = empty; - const PrimRef prim(bounds,geomID_,unsigned(j)); - pinfo.add_center2(prim,mesh->getNumSubGrids(j)); - } - return pinfo; - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - numPrimitives = pinfo.size(); - /* resize arrays */ - sgrids.resize(numPrimitives); - prims.resize(numPrimitives); - - /* second run to fill primrefs and SubGridBuildData arrays */ - pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo - { - - size_t p_index = base.size(); - PrimInfo pinfo(empty); - for (size_t j=r.begin(); jvalid(j)) continue; - const GridMesh::Grid &g = mesh->grid(j); - for (unsigned int y=0; ybuildBounds(g,x,y,bounds)) continue; // get bounds of subgrid - const PrimRef prim(bounds,geomID_,unsigned(p_index)); - pinfo.add_center2(prim); - sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); - prims[p_index++] = prim; - } - } - return pinfo; - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - - } - - /* no primitives */ - if (numPrimitives == 0) { - bvh->clear(); - prims.clear(); - sgrids.clear(); - return; - } - - double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); - - /* create primref array */ - settings.primrefarrayalloc = numPrimitives/1000; - if (settings.primrefarrayalloc < 1000) - settings.primrefarrayalloc = inf; - - /* enable os_malloc for two level build */ - if (mesh) - bvh->alloc.setOSallocation(true); - - /* initialize allocator */ - const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); - const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN)); - - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); - - /* pinfo might has zero size due to invalid geometry */ - if (unlikely(pinfo.size() == 0)) - { - bvh->clear(); - sgrids.clear(); - prims.clear(); - return; - } - - /* call BVH builder */ - NodeRef root = BVHNBuilderVirtual::build(&bvh->alloc,CreateLeafGrid>(bvh,sgrids.data()),bvh->scene->progressInterface,prims.data(),pinfo,settings); - bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); - bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); - - /* clear temporary array */ - sgrids.clear(); - - /* if we allocated using the primrefarray we have to keep it alive */ - if (settings.primrefarrayalloc != size_t(inf)) - bvh->alloc.share(prims); - - /* for static geometries we can do some cleanups */ - else if (scene && scene->isStaticAccel()) { - prims.clear(); - } - bvh->cleanup(); - bvh->postBuild(t0); - } - - void clear() { - prims.clear(); - } - }; - - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - -#if defined(EMBREE_GEOMETRY_TRIANGLE) - Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } - Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } - Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } - - Builder* BVH4Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } - Builder* BVH4Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } - Builder* BVH4Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } - - - Builder* BVH4QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } -#if defined(__AVX__) - Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } - Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } - Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } - - Builder* BVH8Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } - Builder* BVH8Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } - Builder* BVH8Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } - Builder* BVH8QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } - Builder* BVH8QuantizedTriangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } - -#endif -#endif - -#if defined(EMBREE_GEOMETRY_QUAD) - Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } - Builder* BVH4Quad4iMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } - Builder* BVH4Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } - Builder* BVH4Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } - Builder* BVH4QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } - Builder* BVH4QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } - -#if defined(__AVX__) - Builder* BVH8Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } - Builder* BVH8Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } - Builder* BVH8QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } - Builder* BVH8QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } - Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } - -#endif -#endif - -#if defined(EMBREE_GEOMETRY_USER) - - Builder* BVH4VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { - int minLeafSize = scene->device->object_accel_min_leaf_size; - int maxLeafSize = scene->device->object_accel_max_leaf_size; - return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); - } - - Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { - return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,UserGeometry::geom_type); - } -#if defined(__AVX__) - - Builder* BVH8VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { - int minLeafSize = scene->device->object_accel_min_leaf_size; - int maxLeafSize = scene->device->object_accel_max_leaf_size; - return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); - } - - Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { - return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,UserGeometry::geom_type); - } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_INSTANCE) - Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } - Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { - return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,gtype); - } -#if defined(__AVX__) - Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } - Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { - return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,gtype); - } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_GRID) - Builder* BVH4GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,4,mode); } - Builder* BVH4GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4,mode); } // FIXME: check whether cost factors are correct - -#if defined(__AVX__) - Builder* BVH8GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,mesh,geomID,8,1.0f,8,8,mode); } - Builder* BVH8GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8,mode); } // FIXME: check whether cost factors are correct -#endif -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp deleted file mode 100644 index 9c01553ec6..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp +++ /dev/null @@ -1,705 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh.h" -#include "bvh_builder.h" -#include "../builders/bvh_builder_msmblur.h" - -#include "../builders/primrefgen.h" -#include "../builders/splitter.h" - -#include "../geometry/linei.h" -#include "../geometry/triangle.h" -#include "../geometry/trianglev.h" -#include "../geometry/trianglev_mb.h" -#include "../geometry/trianglei.h" -#include "../geometry/quadv.h" -#include "../geometry/quadi.h" -#include "../geometry/object.h" -#include "../geometry/instance.h" -#include "../geometry/subgrid.h" - -#include "../common/state.h" - -// FIXME: remove after removing BVHNBuilderMBlurRootTimeSplitsSAH -#include "../../common/algorithms/parallel_for_for.h" -#include "../../common/algorithms/parallel_for_for_prefix_sum.h" - - -namespace embree -{ - namespace isa - { - -#if 0 - template - struct CreateMBlurLeaf - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecordMB NodeRecordMB; - - __forceinline CreateMBlurLeaf (BVH* bvh, PrimRef* prims, size_t time) : bvh(bvh), prims(prims), time(time) {} - - __forceinline NodeRecordMB operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const - { - size_t items = Primitive::blocks(set.size()); - size_t start = set.begin(); - for (size_t i=start; iencodeLeaf((char*)accel,items); - - LBBox3fa allBounds = empty; - for (size_t i=0; iscene, time)); - - return NodeRecordMB(node,allBounds); - } - - BVH* bvh; - PrimRef* prims; - size_t time; - }; -#endif - - template - struct CreateMSMBlurLeaf - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; - - __forceinline CreateMSMBlurLeaf (BVH* bvh) : bvh(bvh) {} - - __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const - { - size_t items = Primitive::blocks(current.prims.size()); - size_t start = current.prims.begin(); - size_t end = current.prims.end(); - for (size_t i=start; iencodeLeaf((char*)accel,items); - LBBox3fa allBounds = empty; - for (size_t i=0; idata(), start, current.prims.end(), bvh->scene, current.prims.time_range)); - return NodeRecordMB4D(node,allBounds,current.prims.time_range); - } - - BVH* bvh; - }; - - /* Motion blur BVH with 4D nodes and internal time splits */ - template - struct BVHNBuilderMBlurSAH : public Builder - { - typedef BVHN BVH; - typedef typename BVHN::NodeRef NodeRef; - typedef typename BVHN::NodeRecordMB NodeRecordMB; - typedef typename BVHN::AABBNodeMB AABBNodeMB; - - BVH* bvh; - Scene* scene; - const size_t sahBlockSize; - const float intCost; - const size_t minLeafSize; - const size_t maxLeafSize; - const Geometry::GTypeMask gtype_; - - BVHNBuilderMBlurSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) - : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks)), gtype_(gtype) {} - - void build() - { - /* skip build for empty scene */ - const size_t numPrimitives = scene->getNumPrimitives(gtype_,true); - if (numPrimitives == 0) { bvh->clear(); return; } - - double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAH"); - -#if PROFILE - profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { -#endif - - //const size_t numTimeSteps = scene->getNumTimeSteps(); - //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); - - /*if (numTimeSegments == 1) - buildSingleSegment(numPrimitives); - else*/ - buildMultiSegment(numPrimitives); - -#if PROFILE - }); -#endif - - /* clear temporary data for static geometry */ - bvh->cleanup(); - bvh->postBuild(t0); - } - -#if 0 // No longer compatible when time_ranges are present for geometries. Would have to create temporal nodes sometimes, and put only a single geometry into leaf. - void buildSingleSegment(size_t numPrimitives) - { - /* create primref array */ - mvector prims(scene->device,numPrimitives); - const PrimInfo pinfo = createPrimRefArrayMBlur(scene,gtype_,prims,bvh->scene->progressInterface,0); - /* early out if no valid primitives */ - if (pinfo.size() == 0) { bvh->clear(); return; } - /* estimate acceleration structure size */ - const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); - const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - - /* settings for BVH build */ - GeneralBVHBuilder::Settings settings; - settings.branchingFactor = N; - settings.maxDepth = BVH::maxBuildDepthLeaf; - settings.logBlockSize = bsr(sahBlockSize); - settings.minLeafSize = min(minLeafSize,maxLeafSize); - settings.maxLeafSize = maxLeafSize; - settings.travCost = travCost; - settings.intCost = intCost; - settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); - - /* build hierarchy */ - auto root = BVHBuilderBinnedSAH::build - (typename BVH::CreateAlloc(bvh),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::Set(), - CreateMBlurLeaf(bvh,prims.data(),0),bvh->scene->progressInterface, - prims.data(),pinfo,settings); - - bvh->set(root.ref,root.lbounds,pinfo.size()); - } -#endif - - void buildMultiSegment(size_t numPrimitives) - { - /* create primref array */ - mvector prims(scene->device,numPrimitives); - PrimInfoMB pinfo = createPrimRefArrayMSMBlur(scene,gtype_,prims,bvh->scene->progressInterface); - - /* early out if no valid primitives */ - if (pinfo.size() == 0) { bvh->clear(); return; } - - /* estimate acceleration structure size */ - const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); - const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(Primitive)); - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - - /* settings for BVH build */ - BVHBuilderMSMBlur::Settings settings; - settings.branchingFactor = N; - settings.maxDepth = BVH::maxDepth; - settings.logBlockSize = bsr(sahBlockSize); - settings.minLeafSize = min(minLeafSize,maxLeafSize); - settings.maxLeafSize = maxLeafSize; - settings.travCost = travCost; - settings.intCost = intCost; - settings.singleLeafTimeSegment = Primitive::singleTimeSegment; - settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); - - /* build hierarchy */ - auto root = - BVHBuilderMSMBlur::build(prims,pinfo,scene->device, - RecalculatePrimRef(scene), - typename BVH::CreateAlloc(bvh), - typename BVH::AABBNodeMB4D::Create(), - typename BVH::AABBNodeMB4D::Set(), - CreateMSMBlurLeaf(bvh), - bvh->scene->progressInterface, - settings); - - bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); - } - - void clear() { - } - }; - - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - - struct GridRecalculatePrimRef - { - Scene* scene; - const SubGridBuildData * const sgrids; - - __forceinline GridRecalculatePrimRef (Scene* scene, const SubGridBuildData * const sgrids) - : scene(scene), sgrids(sgrids) {} - - __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const - { - const unsigned int geomID = prim.geomID(); - const GridMesh* mesh = scene->get(geomID); - const unsigned int buildID = prim.primID(); - const SubGridBuildData &subgrid = sgrids[buildID]; - const unsigned int primID = subgrid.primID; - const size_t x = subgrid.x(); - const size_t y = subgrid.y(); - const LBBox3fa lbounds = mesh->linearBounds(mesh->grid(primID),x,y,time_range); - const unsigned num_time_segments = mesh->numTimeSegments(); - const range tbounds = mesh->timeSegmentRange(time_range); - return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, num_time_segments, geomID, buildID); - } - - __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { - const unsigned int geomID = prim.geomID(); - const GridMesh* mesh = scene->get(geomID); - const unsigned int buildID = prim.primID(); - const SubGridBuildData &subgrid = sgrids[buildID]; - const unsigned int primID = subgrid.primID; - const size_t x = subgrid.x(); - const size_t y = subgrid.y(); - return mesh->linearBounds(mesh->grid(primID),x,y,time_range); - } - - }; - - template - struct CreateMSMBlurLeafGrid - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; - - __forceinline CreateMSMBlurLeafGrid (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) : scene(scene), bvh(bvh), sgrids(sgrids) {} - - __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const - { - const size_t items = current.prims.size(); - const size_t start = current.prims.begin(); - - const PrimRefMB* prims = current.prims.prims->data(); - /* collect all subsets with unique geomIDs */ - assert(items <= N); - unsigned int geomIDs[N]; - unsigned int num_geomIDs = 1; - geomIDs[0] = prims[start].geomID(); - - for (size_t i=1;i* accel = (SubGridMBQBVHN*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN),BVH::byteAlignment); - typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); - - LBBox3fa allBounds = empty; - - for (size_t g=0;gget(geomIDs[g]); - unsigned int x[N]; - unsigned int y[N]; - unsigned int primID[N]; - BBox3fa bounds0[N]; - BBox3fa bounds1[N]; - unsigned int pos = 0; - for (size_t i=0;ilinearBounds(mesh->grid(sgrid_bd.primID),x,y,current.prims.time_range); - allBounds.extend(newBounds); - bounds0[pos] = newBounds.bounds0; - bounds1[pos] = newBounds.bounds1; - pos++; - } - assert(pos <= N); - new (&accel[g]) SubGridMBQBVHN(x,y,primID,bounds0,bounds1,geomIDs[g],current.prims.time_range.lower,1.0f/current.prims.time_range.size(),pos); - } - return NodeRecordMB4D(node,allBounds,current.prims.time_range); - } - - Scene *scene; - BVH* bvh; - const SubGridBuildData * const sgrids; - }; - -#if 0 - template - struct CreateLeafGridMB - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::NodeRecordMB NodeRecordMB; - - __forceinline CreateLeafGridMB (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) - : scene(scene), bvh(bvh), sgrids(sgrids) {} - - __forceinline NodeRecordMB operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const - { - const size_t items = set.size(); - const size_t start = set.begin(); - - /* collect all subsets with unique geomIDs */ - assert(items <= N); - unsigned int geomIDs[N]; - unsigned int num_geomIDs = 1; - geomIDs[0] = prims[start].geomID(); - - for (size_t i=1;i* accel = (SubGridMBQBVHN*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN),BVH::byteAlignment); - typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); - - LBBox3fa allBounds = empty; - - for (size_t g=0;gget(geomIDs[g]); - - unsigned int x[N]; - unsigned int y[N]; - unsigned int primID[N]; - BBox3fa bounds0[N]; - BBox3fa bounds1[N]; - unsigned int pos = 0; - for (size_t i=0;ibuildBounds(mesh->grid(sgrid_bd.primID),x,y,0,bounds0[pos]); - bool MAYBE_UNUSED valid1 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,1,bounds1[pos]); - assert(valid0); - assert(valid1); - allBounds.extend(LBBox3fa(bounds0[pos],bounds1[pos])); - pos++; - } - new (&accel[g]) SubGridMBQBVHN(x,y,primID,bounds0,bounds1,geomIDs[g],0.0f,1.0f,pos); - } - return NodeRecordMB(node,allBounds); - } - - Scene *scene; - BVH* bvh; - const SubGridBuildData * const sgrids; - }; -#endif - - - /* Motion blur BVH with 4D nodes and internal time splits */ - template - struct BVHNBuilderMBlurSAHGrid : public Builder - { - typedef BVHN BVH; - typedef typename BVHN::NodeRef NodeRef; - typedef typename BVHN::NodeRecordMB NodeRecordMB; - typedef typename BVHN::AABBNodeMB AABBNodeMB; - - BVH* bvh; - Scene* scene; - const size_t sahBlockSize; - const float intCost; - const size_t minLeafSize; - const size_t maxLeafSize; - mvector sgrids; - - - BVHNBuilderMBlurSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize) - : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,BVH::maxLeafBlocks)), sgrids(scene->device,0) {} - - - PrimInfo createPrimRefArrayMBlurGrid(Scene* scene, mvector& prims, BuildProgressMonitor& progressMonitor, size_t itime) - { - /* first run to get #primitives */ - ParallelForForPrefixSumState pstate; - Scene::Iterator iter(scene); - - pstate.init(iter,size_t(1024)); - - /* iterate over all meshes in the scene */ - PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { - - PrimInfo pinfo(empty); - for (size_t j=r.begin(); jvalid(j,range(0,1))) continue; - BBox3fa bounds = empty; - const PrimRef prim(bounds,unsigned(geomID),unsigned(j)); - pinfo.add_center2(prim,mesh->getNumSubGrids(j)); - } - return pinfo; - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - - size_t numPrimitives = pinfo.size(); - if (numPrimitives == 0) return pinfo; - - /* resize arrays */ - sgrids.resize(numPrimitives); - prims.resize(numPrimitives); - - /* second run to fill primrefs and SubGridBuildData arrays */ - pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { - - k = base.size(); - size_t p_index = k; - PrimInfo pinfo(empty); - for (size_t j=r.begin(); jgrid(j); - if (!mesh->valid(j,range(0,1))) continue; - - for (unsigned int y=0; ybuildBounds(g,x,y,itime,bounds)) continue; // get bounds of subgrid - const PrimRef prim(bounds,unsigned(geomID),unsigned(p_index)); - pinfo.add_center2(prim); - sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); - prims[p_index++] = prim; - } - } - return pinfo; - }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); - - assert(pinfo.size() == numPrimitives); - return pinfo; - } - - PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)) - { - /* first run to get #primitives */ - ParallelForForPrefixSumState pstate; - Scene::Iterator iter(scene); - - pstate.init(iter,size_t(1024)); - /* iterate over all meshes in the scene */ - PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t /*geomID*/) -> PrimInfoMB { - - PrimInfoMB pinfoMB(empty); - for (size_t j=r.begin(); jvalid(j, mesh->timeSegmentRange(t0t1))) continue; - LBBox3fa bounds(empty); - PrimInfoMB gridMB(0,mesh->getNumSubGrids(j)); - pinfoMB.merge(gridMB); - } - return pinfoMB; - }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); - - size_t numPrimitives = pinfoMB.size(); - if (numPrimitives == 0) return pinfoMB; - - /* resize arrays */ - sgrids.resize(numPrimitives); - prims.resize(numPrimitives); - /* second run to fill primrefs and SubGridBuildData arrays */ - pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { - - k = base.size(); - size_t p_index = k; - PrimInfoMB pinfoMB(empty); - for (size_t j=r.begin(); jvalid(j, mesh->timeSegmentRange(t0t1))) continue; - const GridMesh::Grid &g = mesh->grid(j); - - for (unsigned int y=0; ylinearBounds(g,x,y,t0t1),mesh->numTimeSegments(),mesh->time_range,mesh->numTimeSegments(),unsigned(geomID),unsigned(p_index)); - pinfoMB.add_primref(prim); - sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); - prims[p_index++] = prim; - } - } - return pinfoMB; - }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); - - assert(pinfoMB.size() == numPrimitives); - pinfoMB.time_range = t0t1; - return pinfoMB; - } - - void build() - { - /* skip build for empty scene */ - const size_t numPrimitives = scene->getNumPrimitives(GridMesh::geom_type,true); - if (numPrimitives == 0) { bvh->clear(); return; } - - double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAHGrid"); - - //const size_t numTimeSteps = scene->getNumTimeSteps(); - //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); - //if (numTimeSegments == 1) - // buildSingleSegment(numPrimitives); - //else - buildMultiSegment(numPrimitives); - - /* clear temporary data for static geometry */ - bvh->cleanup(); - bvh->postBuild(t0); - } - -#if 0 - void buildSingleSegment(size_t numPrimitives) - { - /* create primref array */ - mvector prims(scene->device,numPrimitives); - const PrimInfo pinfo = createPrimRefArrayMBlurGrid(scene,prims,bvh->scene->progressInterface,0); - /* early out if no valid primitives */ - if (pinfo.size() == 0) { bvh->clear(); return; } - - /* estimate acceleration structure size */ - const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); - //TODO: check leaf_bytes - const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN)); - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - - /* settings for BVH build */ - GeneralBVHBuilder::Settings settings; - settings.branchingFactor = N; - settings.maxDepth = BVH::maxBuildDepthLeaf; - settings.logBlockSize = bsr(sahBlockSize); - settings.minLeafSize = min(minLeafSize,maxLeafSize); - settings.maxLeafSize = maxLeafSize; - settings.travCost = travCost; - settings.intCost = intCost; - settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); - - /* build hierarchy */ - auto root = BVHBuilderBinnedSAH::build - (typename BVH::CreateAlloc(bvh), - typename BVH::AABBNodeMB::Create(), - typename BVH::AABBNodeMB::Set(), - CreateLeafGridMB(scene,bvh,sgrids.data()), - bvh->scene->progressInterface, - prims.data(),pinfo,settings); - - bvh->set(root.ref,root.lbounds,pinfo.size()); - } -#endif - - void buildMultiSegment(size_t numPrimitives) - { - /* create primref array */ - mvector prims(scene->device,numPrimitives); - PrimInfoMB pinfo = createPrimRefArrayMSMBlurGrid(scene,prims,bvh->scene->progressInterface); - - /* early out if no valid primitives */ - if (pinfo.size() == 0) { bvh->clear(); return; } - - - - GridRecalculatePrimRef recalculatePrimRef(scene,sgrids.data()); - - /* estimate acceleration structure size */ - const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); - //FIXME: check leaf_bytes - //const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(SubGridQBVHN)); - const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN)); - - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - - /* settings for BVH build */ - BVHBuilderMSMBlur::Settings settings; - settings.branchingFactor = N; - settings.maxDepth = BVH::maxDepth; - settings.logBlockSize = bsr(sahBlockSize); - settings.minLeafSize = min(minLeafSize,maxLeafSize); - settings.maxLeafSize = maxLeafSize; - settings.travCost = travCost; - settings.intCost = intCost; - settings.singleLeafTimeSegment = false; - settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); - - /* build hierarchy */ - auto root = - BVHBuilderMSMBlur::build(prims,pinfo,scene->device, - recalculatePrimRef, - typename BVH::CreateAlloc(bvh), - typename BVH::AABBNodeMB4D::Create(), - typename BVH::AABBNodeMB4D::Set(), - CreateMSMBlurLeafGrid(scene,bvh,sgrids.data()), - bvh->scene->progressInterface, - settings); - bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); - } - - void clear() { - } - }; - - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - -#if defined(EMBREE_GEOMETRY_TRIANGLE) - Builder* BVH4Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } - Builder* BVH4Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4vMB>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } -#if defined(__AVX__) - Builder* BVH8Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } - Builder* BVH8Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4vMB>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_QUAD) - Builder* BVH4Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,QuadMesh,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } -#if defined(__AVX__) - Builder* BVH8Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,QuadMesh,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_USER) - Builder* BVH4VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { - int minLeafSize = scene->device->object_accel_mb_min_leaf_size; - int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; - return new BVHNBuilderMBlurSAH<4,UserGeometry,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); - } -#if defined(__AVX__) - Builder* BVH8VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { - int minLeafSize = scene->device->object_accel_mb_min_leaf_size; - int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; - return new BVHNBuilderMBlurSAH<8,UserGeometry,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); - } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_INSTANCE) - Builder* BVH4InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } -#if defined(__AVX__) - Builder* BVH8InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_GRID) - Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4); } -#if defined(__AVX__) - Builder* BVH8GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8); } -#endif -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp deleted file mode 100644 index 285b38c39d..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh.h" -#include "bvh_builder.h" - -#include "../builders/primrefgen.h" -#include "../builders/primrefgen_presplit.h" -#include "../builders/splitter.h" - -#include "../geometry/linei.h" -#include "../geometry/triangle.h" -#include "../geometry/trianglev.h" -#include "../geometry/trianglev_mb.h" -#include "../geometry/trianglei.h" -#include "../geometry/quadv.h" -#include "../geometry/quadi.h" -#include "../geometry/object.h" -#include "../geometry/instance.h" -#include "../geometry/subgrid.h" - -#include "../common/state.h" - -namespace embree -{ - namespace isa - { - template - struct CreateLeafSpatial - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - - __forceinline CreateLeafSpatial (BVH* bvh) : bvh(bvh) {} - - __forceinline NodeRef operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const - { - size_t n = set.size(); - size_t items = Primitive::blocks(n); - size_t start = set.begin(); - Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); - typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); - for (size_t i=0; iscene); - } - return node; - } - - BVH* bvh; - }; - - template - struct BVHNBuilderFastSpatialSAH : public Builder - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - BVH* bvh; - Scene* scene; - Mesh* mesh; - mvector prims0; - GeneralBVHBuilder::Settings settings; - const float splitFactor; - unsigned int geomID_ = std::numeric_limits::max(); - unsigned int numPreviousPrimitives = 0; - - BVHNBuilderFastSpatialSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) - : bvh(bvh), scene(scene), mesh(nullptr), prims0(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), - splitFactor(scene->device->max_spatial_split_replications) {} - - BVHNBuilderFastSpatialSAH (BVH* bvh, Mesh* mesh, const unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) - : bvh(bvh), scene(nullptr), mesh(mesh), prims0(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), - splitFactor(scene->device->max_spatial_split_replications), geomID_(geomID) {} - - // FIXME: shrink bvh->alloc in destructor here and in other builders too - - void build() - { - /* we reset the allocator when the mesh size changed */ - if (mesh && mesh->numPrimitives != numPreviousPrimitives) { - bvh->alloc.clear(); - } - - /* skip build for empty scene */ - const size_t numOriginalPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(Mesh::geom_type,false); - numPreviousPrimitives = numOriginalPrimitives; - if (numOriginalPrimitives == 0) { - prims0.clear(); - bvh->clear(); - return; - } - - const unsigned int maxGeomID = mesh ? geomID_ : scene->getMaxGeomID(); - const bool usePreSplits = scene->device->useSpatialPreSplits || (maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))); - double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + (usePreSplits ? "BuilderFastSpatialPresplitSAH" : "BuilderFastSpatialSAH")); - - /* create primref array */ - const size_t numSplitPrimitives = max(numOriginalPrimitives,size_t(splitFactor*numOriginalPrimitives)); - prims0.resize(numSplitPrimitives); - - /* enable os_malloc for two level build */ - if (mesh) - bvh->alloc.setOSallocation(true); - - NodeRef root(0); - PrimInfo pinfo; - - - if (likely(usePreSplits)) - { - /* spatial presplit SAH BVH builder */ - pinfo = mesh ? - createPrimRefArray_presplit(mesh,maxGeomID,numOriginalPrimitives,prims0,bvh->scene->progressInterface) : - createPrimRefArray_presplit(scene,Mesh::geom_type,false,numOriginalPrimitives,prims0,bvh->scene->progressInterface); - - const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); - const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); - - settings.branchingFactor = N; - settings.maxDepth = BVH::maxBuildDepthLeaf; - - /* call BVH builder */ - root = BVHNBuilderVirtual::build(&bvh->alloc,CreateLeafSpatial(bvh),bvh->scene->progressInterface,prims0.data(),pinfo,settings); - } - else - { - /* standard spatial split SAH BVH builder */ - pinfo = mesh ? - createPrimRefArray(mesh,geomID_,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface) : - createPrimRefArray(scene,Mesh::geom_type,false,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface); - - Splitter splitter(scene); - - const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); - const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); - - settings.branchingFactor = N; - settings.maxDepth = BVH::maxBuildDepthLeaf; - - /* call BVH builder */ - root = BVHBuilderBinnedFastSpatialSAH::build( - typename BVH::CreateAlloc(bvh), - typename BVH::AABBNode::Create2(), - typename BVH::AABBNode::Set2(), - CreateLeafSpatial(bvh), - splitter, - bvh->scene->progressInterface, - prims0.data(), - numSplitPrimitives, - pinfo,settings); - - /* ==================== */ - } - - bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); - bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); - - /* clear temporary data for static geometry */ - if (scene && scene->isStaticAccel()) { - prims0.clear(); - } - bvh->cleanup(); - bvh->postBuild(t0); - } - - void clear() { - prims0.clear(); - } - }; - - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - - -#if defined(EMBREE_GEOMETRY_TRIANGLE) - - Builder* BVH4Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } - Builder* BVH4Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } - Builder* BVH4Triangle4iSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4i,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } - -#if defined(__AVX__) - Builder* BVH8Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } - Builder* BVH8Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_QUAD) - Builder* BVH4Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,QuadMesh,Quad4v,QuadSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } - -#if defined(__AVX__) - Builder* BVH8Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,QuadMesh,Quad4v,QuadSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } -#endif - -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp deleted file mode 100644 index 1a78f347ac..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp +++ /dev/null @@ -1,377 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh_builder_twolevel.h" -#include "bvh_statistics.h" -#include "../builders/bvh_builder_sah.h" -#include "../common/scene_line_segments.h" -#include "../common/scene_triangle_mesh.h" -#include "../common/scene_quad_mesh.h" - -#define PROFILE 0 - -namespace embree -{ - namespace isa - { - template - BVHNBuilderTwoLevel::BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder, const size_t singleThreadThreshold) - : bvh(bvh), scene(scene), refs(scene->device,0), prims(scene->device,0), singleThreadThreshold(singleThreadThreshold), gtype(gtype), useMortonBuilder_(useMortonBuilder) {} - - template - BVHNBuilderTwoLevel::~BVHNBuilderTwoLevel () { - } - - // =========================================================================== - // =========================================================================== - // =========================================================================== - - template - void BVHNBuilderTwoLevel::build() - { - /* delete some objects */ - size_t num = scene->size(); - if (num < bvh->objects.size()) { - parallel_for(num, bvh->objects.size(), [&] (const range& r) { - for (size_t i=r.begin(); iobjects[i]; bvh->objects[i] = nullptr; - } - }); - } - -#if PROFILE - while(1) -#endif - { - /* reset memory allocator */ - bvh->alloc.reset(); - - /* skip build for empty scene */ - const size_t numPrimitives = scene->getNumPrimitives(gtype,false); - - if (numPrimitives == 0) { - prims.resize(0); - bvh->set(BVH::emptyNode,empty,0); - return; - } - - /* calculate the size of the entire BVH */ - const size_t numLeafBlocks = Primitive::blocks(numPrimitives); - const size_t node_bytes = 2*numLeafBlocks*sizeof(typename BVH::AABBNode)/N; - const size_t leaf_bytes = size_t(1.2*numLeafBlocks*sizeof(Primitive)); - bvh->alloc.init_estimate(node_bytes+leaf_bytes); - - double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderTwoLevel"); - - /* resize object array if scene got larger */ - if (bvh->objects.size() < num) bvh->objects.resize(num); - if (builders.size() < num) builders.resize(num); - resizeRefsList (); - nextRef.store(0); - - /* create acceleration structures */ - parallel_for(size_t(0), num, [&] (const range& r) - { - for (size_t objectID=r.begin(); objectIDgetSafe(objectID); - - /* ignore meshes we do not support */ - if (mesh == nullptr || mesh->numTimeSteps != 1) - continue; - - if (isSmallGeometry(mesh)) { - setupSmallBuildRefBuilder (objectID, mesh); - } else { - setupLargeBuildRefBuilder (objectID, mesh); - } - } - }); - - /* parallel build of acceleration structures */ - parallel_for(size_t(0), num, [&] (const range& r) - { - for (size_t objectID=r.begin(); objectIDgetSafe(objectID); - if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1) - continue; - - builders[objectID]->attachBuildRefs (this); - } - }); - - -#if PROFILE - double d0 = getSeconds(); -#endif - /* fast path for single geometry scenes */ - if (nextRef == 1) { - bvh->set(refs[0].node,LBBox3fa(refs[0].bounds()),numPrimitives); - } - - else - { - /* open all large nodes */ - refs.resize(nextRef); - - /* this probably needs some more tuning */ - const size_t extSize = max(max((size_t)SPLIT_MIN_EXT_SPACE,refs.size()*SPLIT_MEMORY_RESERVE_SCALE),size_t((float)numPrimitives / SPLIT_MEMORY_RESERVE_FACTOR)); - -#if !ENABLE_DIRECT_SAH_MERGE_BUILDER - -#if ENABLE_OPEN_SEQUENTIAL - open_sequential(extSize); -#endif - /* compute PrimRefs */ - prims.resize(refs.size()); -#endif - -#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL - tbb::task_arena limited(min(32,(int)TaskScheduler::threadCount())); - limited.execute([&] -#endif - { -#if ENABLE_DIRECT_SAH_MERGE_BUILDER - - const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range& r) -> PrimInfo { - - PrimInfo pinfo(empty); - for (size_t i=r.begin(); i& r) -> PrimInfo { - - PrimInfo pinfo(empty); - for (size_t i=r.begin(); iset(BVH::emptyNode,empty,0); - - /* otherwise build toplevel hierarchy */ - else - { - /* settings for BVH build */ - GeneralBVHBuilder::Settings settings; - settings.branchingFactor = N; - settings.maxDepth = BVH::maxBuildDepthLeaf; - settings.logBlockSize = bsr(N); - settings.minLeafSize = 1; - settings.maxLeafSize = 1; - settings.travCost = 1.0f; - settings.intCost = 1.0f; - settings.singleThreadThreshold = singleThreadThreshold; - -#if ENABLE_DIRECT_SAH_MERGE_BUILDER - - refs.resize(extSize); - - NodeRef root = BVHBuilderBinnedOpenMergeSAH::build( - typename BVH::CreateAlloc(bvh), - typename BVH::AABBNode::Create2(), - typename BVH::AABBNode::Set2(), - - [&] (const BuildRef* refs, const range& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { - assert(range.size() == 1); - return (NodeRef) refs[range.begin()].node; - }, - [&] (BuildRef &bref, BuildRef *refs) -> size_t { - return openBuildRef(bref,refs); - }, - [&] (size_t dn) { bvh->scene->progressMonitor(0); }, - refs.data(),extSize,pinfo,settings); -#else - NodeRef root = BVHBuilderBinnedSAH::build( - typename BVH::CreateAlloc(bvh), - typename BVH::AABBNode::Create2(), - typename BVH::AABBNode::Set2(), - - [&] (const PrimRef* prims, const range& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { - assert(range.size() == 1); - return (NodeRef) prims[range.begin()].ID(); - }, - [&] (size_t dn) { bvh->scene->progressMonitor(0); }, - prims.data(),pinfo,settings); -#endif - - - bvh->set(root,LBBox3fa(pinfo.geomBounds),numPrimitives); - } - } -#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL - ); -#endif - - } - - bvh->alloc.cleanup(); - bvh->postBuild(t0); -#if PROFILE - double d1 = getSeconds(); - std::cout << "TOP_LEVEL OPENING/REBUILD TIME " << 1000.0*(d1-d0) << " ms" << std::endl; -#endif - } - - } - - template - void BVHNBuilderTwoLevel::deleteGeometry(size_t geomID) - { - if (geomID >= bvh->objects.size()) return; - if (builders[geomID]) builders[geomID].reset(); - delete bvh->objects [geomID]; bvh->objects [geomID] = nullptr; - } - - template - void BVHNBuilderTwoLevel::clear() - { - for (size_t i=0; iobjects.size(); i++) - if (bvh->objects[i]) bvh->objects[i]->clear(); - - for (size_t i=0; i - void BVHNBuilderTwoLevel::open_sequential(const size_t extSize) - { - if (refs.size() == 0) - return; - - refs.reserve(extSize); - -#if 1 - for (size_t i=0;ichild(i) == BVH::emptyNode) continue; - refs.push_back(BuildRef(node->bounds(i),node->child(i))); - -#if 1 - NodeRef ref_pre = node->child(i); - if (ref_pre.isAABBNode()) - ref_pre.prefetch(); -#endif - std::push_heap (refs.begin(),refs.end()); - } - } - } - - template - void BVHNBuilderTwoLevel::setupSmallBuildRefBuilder (size_t objectID, Mesh const * const /*mesh*/) - { - if (builders[objectID] == nullptr || // new mesh - dynamic_cast(builders[objectID].get()) == nullptr) // size change resulted in large->small change - { - builders[objectID].reset (new RefBuilderSmall(objectID)); - } - } - - template - void BVHNBuilderTwoLevel::setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh) - { - if (bvh->objects[objectID] == nullptr || // new mesh - builders[objectID]->meshQualityChanged (mesh->quality) || // changed build quality - dynamic_cast(builders[objectID].get()) == nullptr) // size change resulted in small->large change - { - Builder* builder = nullptr; - delete bvh->objects[objectID]; - createMeshAccel(objectID, builder); - builders[objectID].reset (new RefBuilderLarge(objectID, builder, mesh->quality)); - } - } - -#if defined(EMBREE_GEOMETRY_TRIANGLE) - Builder* BVH4BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); - } - Builder* BVH4BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4v>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); - } - Builder* BVH4BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); - } -#endif - -#if defined(EMBREE_GEOMETRY_QUAD) - Builder* BVH4BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<4,QuadMesh,Quad4v>((BVH4*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); - } -#endif - -#if defined(EMBREE_GEOMETRY_USER) - Builder* BVH4BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<4,UserGeometry,Object>((BVH4*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); - } -#endif - -#if defined(EMBREE_GEOMETRY_INSTANCE) - Builder* BVH4BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder); - } -#endif - -#if defined(__AVX__) -#if defined(EMBREE_GEOMETRY_TRIANGLE) - Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); - } - Builder* BVH8BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4v>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); - } - Builder* BVH8BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); - } -#endif - -#if defined(EMBREE_GEOMETRY_QUAD) - Builder* BVH8BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<8,QuadMesh,Quad4v>((BVH8*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); - } -#endif - -#if defined(EMBREE_GEOMETRY_USER) - Builder* BVH8BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<8,UserGeometry,Object>((BVH8*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); - } -#endif - -#if defined(EMBREE_GEOMETRY_INSTANCE) - Builder* BVH8BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) { - return new BVHNBuilderTwoLevel<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder); - } -#endif - -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h deleted file mode 100644 index 8f57c3b406..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -#include "bvh_builder_twolevel_internal.h" -#include "bvh.h" -#include "../common/primref.h" -#include "../builders/priminfo.h" -#include "../builders/primrefgen.h" - -/* new open/merge builder */ -#define ENABLE_DIRECT_SAH_MERGE_BUILDER 1 -#define ENABLE_OPEN_SEQUENTIAL 0 -#define SPLIT_MEMORY_RESERVE_FACTOR 1000 -#define SPLIT_MEMORY_RESERVE_SCALE 2 -#define SPLIT_MIN_EXT_SPACE 1000 - -namespace embree -{ - namespace isa - { - template - class BVHNBuilderTwoLevel : public Builder - { - typedef BVHN BVH; - typedef typename BVH::AABBNode AABBNode; - typedef typename BVH::NodeRef NodeRef; - - __forceinline static bool isSmallGeometry(Mesh* mesh) { - return mesh->size() <= 4; - } - - public: - - typedef void (*createMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); - - struct BuildRef : public PrimRef - { - public: - __forceinline BuildRef () {} - - __forceinline BuildRef (const BBox3fa& bounds, NodeRef node) - : PrimRef(bounds,(size_t)node), node(node) - { - if (node.isLeaf()) - bounds_area = 0.0f; - else - bounds_area = area(this->bounds()); - } - - /* used by the open/merge bvh builder */ - __forceinline BuildRef (const BBox3fa& bounds, NodeRef node, const unsigned int geomID, const unsigned int numPrimitives) - : PrimRef(bounds,geomID,numPrimitives), node(node) - { - /* important for relative buildref ordering */ - if (node.isLeaf()) - bounds_area = 0.0f; - else - bounds_area = area(this->bounds()); - } - - __forceinline size_t size() const { - return primID(); - } - - friend bool operator< (const BuildRef& a, const BuildRef& b) { - return a.bounds_area < b.bounds_area; - } - - friend __forceinline embree_ostream operator<<(embree_ostream cout, const BuildRef& ref) { - return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", center2 = " << ref.center2() << ", geomID = " << ref.geomID() << ", numPrimitives = " << ref.numPrimitives() << ", bounds_area = " << ref.bounds_area << " }"; - } - - __forceinline unsigned int numPrimitives() const { return primID(); } - - public: - NodeRef node; - float bounds_area; - }; - - - __forceinline size_t openBuildRef(BuildRef &bref, BuildRef *const refs) { - if (bref.node.isLeaf()) - { - refs[0] = bref; - return 1; - } - NodeRef ref = bref.node; - unsigned int geomID = bref.geomID(); - unsigned int numPrims = max((unsigned int)bref.numPrimitives() / N,(unsigned int)1); - AABBNode* node = ref.getAABBNode(); - size_t n = 0; - for (size_t i=0; ichild(i) == BVH::emptyNode) continue; - refs[i] = BuildRef(node->bounds(i),node->child(i),geomID,numPrims); - n++; - } - assert(n > 1); - return n; - } - - /*! Constructor. */ - BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype = Mesh::geom_type, bool useMortonBuilder = false, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD); - - /*! Destructor */ - ~BVHNBuilderTwoLevel (); - - /*! builder entry point */ - void build(); - void deleteGeometry(size_t geomID); - void clear(); - - void open_sequential(const size_t extSize); - - private: - - class RefBuilderBase { - public: - virtual ~RefBuilderBase () {} - virtual void attachBuildRefs (BVHNBuilderTwoLevel* builder) = 0; - virtual bool meshQualityChanged (RTCBuildQuality currQuality) = 0; - }; - - class RefBuilderSmall : public RefBuilderBase { - public: - - RefBuilderSmall (size_t objectID) - : objectID_ (objectID) {} - - void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) { - - Mesh* mesh = topBuilder->scene->template getSafe(objectID_); - size_t meshSize = mesh->size(); - assert(isSmallGeometry(mesh)); - - mvector prefs(topBuilder->scene->device, meshSize); - auto pinfo = createPrimRefArray(mesh,objectID_,prefs,topBuilder->bvh->scene->progressInterface); - - size_t begin=0; - while (begin < pinfo.size()) - { - Primitive* accel = (Primitive*) topBuilder->bvh->alloc.getCachedAllocator().malloc1(sizeof(Primitive),BVH::byteAlignment); - typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,1); - accel->fill(prefs.data(),begin,pinfo.size(),topBuilder->bvh->scene); - - /* create build primitive */ -#if ENABLE_DIRECT_SAH_MERGE_BUILDER - topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node,(unsigned int)objectID_,1); -#else - topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node); -#endif - } - assert(begin == pinfo.size()); - } - - bool meshQualityChanged (RTCBuildQuality /*currQuality*/) { - return false; - } - - size_t objectID_; - }; - - class RefBuilderLarge : public RefBuilderBase { - public: - - RefBuilderLarge (size_t objectID, const Ref& builder, RTCBuildQuality quality) - : objectID_ (objectID), builder_ (builder), quality_ (quality) {} - - void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) - { - BVH* object = topBuilder->getBVH(objectID_); assert(object); - - /* build object if it got modified */ - if (topBuilder->isGeometryModified(objectID_)) - builder_->build(); - - /* create build primitive */ - if (!object->getBounds().empty()) - { -#if ENABLE_DIRECT_SAH_MERGE_BUILDER - Mesh* mesh = topBuilder->getMesh(objectID_); - topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root,(unsigned int)objectID_,(unsigned int)mesh->size()); -#else - topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root); -#endif - } - } - - bool meshQualityChanged (RTCBuildQuality currQuality) { - return currQuality != quality_; - } - - private: - size_t objectID_; - Ref builder_; - RTCBuildQuality quality_; - }; - - void setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh); - void setupSmallBuildRefBuilder (size_t objectID, Mesh const * const mesh); - - BVH* getBVH (size_t objectID) { - return this->bvh->objects[objectID]; - } - Mesh* getMesh (size_t objectID) { - return this->scene->template getSafe(objectID); - } - bool isGeometryModified (size_t objectID) { - return this->scene->isGeometryModified(objectID); - } - - void resizeRefsList () - { - size_t num = parallel_reduce (size_t(0), scene->size(), size_t(0), - [this](const range& r)->size_t { - size_t c = 0; - for (auto i=r.begin(); igetSafe(i); - if (mesh == nullptr || mesh->numTimeSteps != 1) - continue; - size_t meshSize = mesh->size(); - c += isSmallGeometry(mesh) ? Primitive::blocks(meshSize) : 1; - } - return c; - }, - std::plus() - ); - - if (refs.size() < num) { - refs.resize(num); - } - } - - void createMeshAccel (size_t geomID, Builder*& builder) - { - bvh->objects[geomID] = new BVH(Primitive::type,scene); - BVH* accel = bvh->objects[geomID]; - auto mesh = scene->getSafe(geomID); - if (nullptr == mesh) { - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"geomID does not return correct type"); - return; - } - - __internal_two_level_builder__::MeshBuilder()(accel, mesh, geomID, this->gtype, this->useMortonBuilder_, builder); - } - - using BuilderList = std::vector>; - - BuilderList builders; - BVH* bvh; - Scene* scene; - mvector refs; - mvector prims; - std::atomic nextRef; - const size_t singleThreadThreshold; - Geometry::GTypeMask gtype; - bool useMortonBuilder_ = false; - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h deleted file mode 100644 index 1c1ae8d6a7..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h +++ /dev/null @@ -1,267 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh.h" -#include "../geometry/triangle.h" -#include "../geometry/trianglev.h" -#include "../geometry/trianglei.h" -#include "../geometry/quadv.h" -#include "../geometry/quadi.h" -#include "../geometry/object.h" -#include "../geometry/instance.h" - -namespace embree -{ - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); - DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) - - namespace isa - { - - namespace __internal_two_level_builder__ { - - template - struct MortonBuilder {}; - template<> - struct MortonBuilder<4,TriangleMesh,Triangle4> { - MortonBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<4,TriangleMesh,Triangle4v> { - MortonBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<4,TriangleMesh,Triangle4i> { - MortonBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<4,QuadMesh,Quad4v> { - MortonBuilder () {} - Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<4,UserGeometry,Object> { - MortonBuilder () {} - Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<4,Instance,InstancePrimitive> { - MortonBuilder () {} - Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} - }; - template<> - struct MortonBuilder<8,TriangleMesh,Triangle4> { - MortonBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<8,TriangleMesh,Triangle4v> { - MortonBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<8,TriangleMesh,Triangle4i> { - MortonBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<8,QuadMesh,Quad4v> { - MortonBuilder () {} - Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<8,UserGeometry,Object> { - MortonBuilder () {} - Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} - }; - template<> - struct MortonBuilder<8,Instance,InstancePrimitive> { - MortonBuilder () {} - Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} - }; - - template - struct SAHBuilder {}; - template<> - struct SAHBuilder<4,TriangleMesh,Triangle4> { - SAHBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<4,TriangleMesh,Triangle4v> { - SAHBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<4,TriangleMesh,Triangle4i> { - SAHBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<4,QuadMesh,Quad4v> { - SAHBuilder () {} - Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<4,UserGeometry,Object> { - SAHBuilder () {} - Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<4,Instance,InstancePrimitive> { - SAHBuilder () {} - Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} - }; - template<> - struct SAHBuilder<8,TriangleMesh,Triangle4> { - SAHBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<8,TriangleMesh,Triangle4v> { - SAHBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<8,TriangleMesh,Triangle4i> { - SAHBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<8,QuadMesh,Quad4v> { - SAHBuilder () {} - Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<8,UserGeometry,Object> { - SAHBuilder () {} - Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} - }; - template<> - struct SAHBuilder<8,Instance,InstancePrimitive> { - SAHBuilder () {} - Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} - }; - - template - struct RefitBuilder {}; - template<> - struct RefitBuilder<4,TriangleMesh,Triangle4> { - RefitBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<4,TriangleMesh,Triangle4v> { - RefitBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<4,TriangleMesh,Triangle4i> { - RefitBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<4,QuadMesh,Quad4v> { - RefitBuilder () {} - Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<4,UserGeometry,Object> { - RefitBuilder () {} - Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<4,Instance,InstancePrimitive> { - RefitBuilder () {} - Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} - }; - template<> - struct RefitBuilder<8,TriangleMesh,Triangle4> { - RefitBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<8,TriangleMesh,Triangle4v> { - RefitBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<8,TriangleMesh,Triangle4i> { - RefitBuilder () {} - Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<8,QuadMesh,Quad4v> { - RefitBuilder () {} - Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<8,UserGeometry,Object> { - RefitBuilder () {} - Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshRefitSAH(bvh,mesh,geomID,0);} - }; - template<> - struct RefitBuilder<8,Instance,InstancePrimitive> { - RefitBuilder () {} - Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} - }; - - template - struct MeshBuilder { - MeshBuilder () {} - void operator () (void* bvh, Mesh* mesh, size_t geomID, Geometry::GTypeMask gtype, bool useMortonBuilder, Builder*& builder) { - if(useMortonBuilder) { - builder = MortonBuilder()(bvh,mesh,geomID,gtype); - return; - } - switch (mesh->quality) { - case RTC_BUILD_QUALITY_LOW: builder = MortonBuilder()(bvh,mesh,geomID,gtype); break; - case RTC_BUILD_QUALITY_MEDIUM: - case RTC_BUILD_QUALITY_HIGH: builder = SAHBuilder()(bvh,mesh,geomID,gtype); break; - case RTC_BUILD_QUALITY_REFIT: builder = RefitBuilder()(bvh,mesh,geomID,gtype); break; - default: throw_RTCError(RTC_ERROR_UNKNOWN,"invalid build quality"); - } - } - }; - } - } -} \ No newline at end of file diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp deleted file mode 100644 index a27be8bae8..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp +++ /dev/null @@ -1,375 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh_collider.h" -#include "../geometry/triangle_triangle_intersector.h" - -namespace embree -{ - namespace isa - { -#define CSTAT(x) - - size_t parallel_depth_threshold = 3; - CSTAT(std::atomic bvh_collide_traversal_steps(0)); - CSTAT(std::atomic bvh_collide_leaf_pairs(0)); - CSTAT(std::atomic bvh_collide_leaf_iterations(0)); - CSTAT(std::atomic bvh_collide_prim_intersections1(0)); - CSTAT(std::atomic bvh_collide_prim_intersections2(0)); - CSTAT(std::atomic bvh_collide_prim_intersections3(0)); - CSTAT(std::atomic bvh_collide_prim_intersections4(0)); - CSTAT(std::atomic bvh_collide_prim_intersections5(0)); - CSTAT(std::atomic bvh_collide_prim_intersections(0)); - - struct Collision - { - __forceinline Collision() {} - - __forceinline Collision (unsigned geomID0, unsigned primID0, unsigned geomID1, unsigned primID1) - : geomID0(geomID0), primID0(primID0), geomID1(geomID1), primID1(primID1) {} - - unsigned geomID0; - unsigned primID0; - unsigned geomID1; - unsigned primID1; - }; - - template - __forceinline size_t overlap(const BBox3fa& box0, const typename BVHN::AABBNode& node1) - { - const vfloat lower_x = max(vfloat(box0.lower.x),node1.lower_x); - const vfloat lower_y = max(vfloat(box0.lower.y),node1.lower_y); - const vfloat lower_z = max(vfloat(box0.lower.z),node1.lower_z); - const vfloat upper_x = min(vfloat(box0.upper.x),node1.upper_x); - const vfloat upper_y = min(vfloat(box0.upper.y),node1.upper_y); - const vfloat upper_z = min(vfloat(box0.upper.z),node1.upper_z); - return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); - } - - template - __forceinline size_t overlap(const BBox3fa& box0, const BBox>>& box1) - { - const vfloat lower_x = max(vfloat(box0.lower.x),box1.lower.x); - const vfloat lower_y = max(vfloat(box0.lower.y),box1.lower.y); - const vfloat lower_z = max(vfloat(box0.lower.z),box1.lower.z); - const vfloat upper_x = min(vfloat(box0.upper.x),box1.upper.x); - const vfloat upper_y = min(vfloat(box0.upper.y),box1.upper.y); - const vfloat upper_z = min(vfloat(box0.upper.z),box1.upper.z); - return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); - } - - template - __forceinline size_t overlap(const BBox>>& box0, size_t i, const BBox>>& box1) - { - const vfloat lower_x = max(vfloat(box0.lower.x[i]),box1.lower.x); - const vfloat lower_y = max(vfloat(box0.lower.y[i]),box1.lower.y); - const vfloat lower_z = max(vfloat(box0.lower.z[i]),box1.lower.z); - const vfloat upper_x = min(vfloat(box0.upper.x[i]),box1.upper.x); - const vfloat upper_y = min(vfloat(box0.upper.y[i]),box1.upper.y); - const vfloat upper_z = min(vfloat(box0.upper.z[i]),box1.upper.z); - return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); - } - - bool intersect_triangle_triangle (Scene* scene0, unsigned geomID0, unsigned primID0, Scene* scene1, unsigned geomID1, unsigned primID1) - { - CSTAT(bvh_collide_prim_intersections1++); - const TriangleMesh* mesh0 = scene0->get(geomID0); - const TriangleMesh* mesh1 = scene1->get(geomID1); - const TriangleMesh::Triangle& tri0 = mesh0->triangle(primID0); - const TriangleMesh::Triangle& tri1 = mesh1->triangle(primID1); - - /* special culling for scene intersection with itself */ - if (scene0 == scene1 && geomID0 == geomID1) - { - /* ignore self intersections */ - if (primID0 == primID1) - return false; - } - CSTAT(bvh_collide_prim_intersections2++); - - if (scene0 == scene1 && geomID0 == geomID1) - { - /* ignore intersection with topological neighbors */ - const vint4 t0(tri0.v[0],tri0.v[1],tri0.v[2],tri0.v[2]); - if (any(vint4(tri1.v[0]) == t0)) return false; - if (any(vint4(tri1.v[1]) == t0)) return false; - if (any(vint4(tri1.v[2]) == t0)) return false; - } - CSTAT(bvh_collide_prim_intersections3++); - - const Vec3fa a0 = mesh0->vertex(tri0.v[0]); - const Vec3fa a1 = mesh0->vertex(tri0.v[1]); - const Vec3fa a2 = mesh0->vertex(tri0.v[2]); - const Vec3fa b0 = mesh1->vertex(tri1.v[0]); - const Vec3fa b1 = mesh1->vertex(tri1.v[1]); - const Vec3fa b2 = mesh1->vertex(tri1.v[2]); - - return TriangleTriangleIntersector::intersect_triangle_triangle(a0,a1,a2,b0,b1,b2); - } - - template - __forceinline void BVHNColliderUserGeom::processLeaf(NodeRef node0, NodeRef node1) - { - Collision collisions[16]; - size_t num_collisions = 0; - - size_t N0; Object* leaf0 = (Object*) node0.leaf(N0); - size_t N1; Object* leaf1 = (Object*) node1.leaf(N1); - for (size_t i=0; iscene0 == this->scene1 && geomID0 == geomID1 && primID0 == primID1) continue; - collisions[num_collisions++] = Collision(geomID0,primID0,geomID1,primID1); - if (num_collisions == 16) { - this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); - num_collisions = 0; - } - } - } - if (num_collisions) - this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); - } - - template - void BVHNCollider::collide_recurse(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1, size_t depth0, size_t depth1) - { - CSTAT(bvh_collide_traversal_steps++); - if (unlikely(ref0.isLeaf())) { - if (unlikely(ref1.isLeaf())) { - CSTAT(bvh_collide_leaf_pairs++); - processLeaf(ref0,ref1); - return; - } else goto recurse_node1; - - } else { - if (unlikely(ref1.isLeaf())) { - goto recurse_node0; - } else { - if (area(bounds0) > area(bounds1)) { - goto recurse_node0; - } - else { - goto recurse_node1; - } - } - } - - { - recurse_node0: - AABBNode* node0 = ref0.getAABBNode(); - size_t mask = overlap(bounds1,*node0); - //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { - //for (size_t i=0; i::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); - collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); - } - }); - } - else -#endif - { - for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { - BVHN::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); - collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); - } - } - return; - } - - { - recurse_node1: - AABBNode* node1 = ref1.getAABBNode(); - size_t mask = overlap(bounds0,*node1); - //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { - //for (size_t i=0; i::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); - collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); - } - }); - } - else -#endif - { - for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { - BVHN::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); - collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); - } - } - return; - } - } - - template - void BVHNCollider::split(const CollideJob& job, jobvector& jobs) - { - if (unlikely(job.ref0.isLeaf())) { - if (unlikely(job.ref1.isLeaf())) { - jobs.push_back(job); - return; - } else goto recurse_node1; - } else { - if (unlikely(job.ref1.isLeaf())) { - goto recurse_node0; - } else { - if (area(job.bounds0) > area(job.bounds1)) { - goto recurse_node0; - } - else { - goto recurse_node1; - } - } - } - - { - recurse_node0: - const AABBNode* node0 = job.ref0.getAABBNode(); - size_t mask = overlap(job.bounds1,*node0); - for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { - jobs.push_back(CollideJob(node0->child(i),node0->bounds(i),job.depth0+1,job.ref1,job.bounds1,job.depth1)); - } - return; - } - - { - recurse_node1: - const AABBNode* node1 = job.ref1.getAABBNode(); - size_t mask = overlap(job.bounds0,*node1); - for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { - jobs.push_back(CollideJob(job.ref0,job.bounds0,job.depth0,node1->child(i),node1->bounds(i),job.depth1+1)); - } - return; - } - } - - template - void BVHNCollider::collide_recurse_entry(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1) - { - CSTAT(bvh_collide_traversal_steps = 0); - CSTAT(bvh_collide_leaf_pairs = 0); - CSTAT(bvh_collide_leaf_iterations = 0); - CSTAT(bvh_collide_prim_intersections1 = 0); - CSTAT(bvh_collide_prim_intersections2 = 0); - CSTAT(bvh_collide_prim_intersections3 = 0); - CSTAT(bvh_collide_prim_intersections4 = 0); - CSTAT(bvh_collide_prim_intersections5 = 0); - CSTAT(bvh_collide_prim_intersections = 0); -#if 0 - collide_recurse(ref0,bounds0,ref1,bounds1,0,0); -#else - const int M = 2048; - jobvector jobs[2]; - jobs[0].reserve(M); - jobs[1].reserve(M); - jobs[0].push_back(CollideJob(ref0,bounds0,0,ref1,bounds1,0)); - int source = 0; - int target = 1; - - /* try to split job until job list is full */ - while (jobs[source].size()+8 <= M) - { - for (size_t i=0; i M) { - jobs[target].push_back(job); - } else { - split(job,jobs[target]); - } - } - - /* stop splitting jobs if we reached only leaves and cannot make progress anymore */ - if (jobs[target].size() == jobs[source].size()) - break; - - jobs[source].resize(0); - std::swap(source,target); - } - - /* parallel processing of all jobs */ - parallel_for(size_t(jobs[source].size()), [&] ( size_t i ) { - CollideJob& j = jobs[source][i]; - collide_recurse(j.ref0,j.bounds0,j.ref1,j.bounds1,j.depth0,j.depth1); - }); - - -#endif - CSTAT(PRINT(bvh_collide_traversal_steps)); - CSTAT(PRINT(bvh_collide_leaf_pairs)); - CSTAT(PRINT(bvh_collide_leaf_iterations)); - CSTAT(PRINT(bvh_collide_prim_intersections1)); - CSTAT(PRINT(bvh_collide_prim_intersections2)); - CSTAT(PRINT(bvh_collide_prim_intersections3)); - CSTAT(PRINT(bvh_collide_prim_intersections4)); - CSTAT(PRINT(bvh_collide_prim_intersections5)); - CSTAT(PRINT(bvh_collide_prim_intersections)); - } - - template - void BVHNColliderUserGeom::collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr) - { - BVHNColliderUserGeom(bvh0->scene,bvh1->scene,callback,userPtr). - collide_recurse_entry(bvh0->root,bvh0->bounds.bounds(),bvh1->root,bvh1->bounds.bounds()); - } - -#if defined (EMBREE_LOWEST_ISA) - struct collision_regression_test : public RegressionTest - { - collision_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(-0.008815f, 0.041848f, -2.49875e-06f), Vec3fa(-0.008276f, 0.053318f, -2.49875e-06f), Vec3fa(0.003023f, 0.048969f, -2.49875e-06f), - Vec3fa(0.00245f, 0.037612f, -2.49875e-06f), Vec3fa(0.01434f, 0.042634f, -2.49875e-06f), Vec3fa(0.013499f, 0.031309f, -2.49875e-06f)) == false; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,1),Vec3fa(0,1,1)) == false; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,-0.1f),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,-0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(-0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), - Vec3fa(-1,1,0) + Vec3fa(0,0,0),Vec3fa(-1,1,0) + Vec3fa(0.1f,0,0),Vec3fa(-1,1,0) + Vec3fa(0,0.1f,0)) == false; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), - Vec3fa( 2,0.5f,0) + Vec3fa(0,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0.1f,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0,0.1f,0)) == false; - passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), - Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0.1f,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0.1f,0)) == false; - return passed; - } - }; - - collision_regression_test collision_regression("collision_regression_test"); -#endif - - //////////////////////////////////////////////////////////////////////////////// - /// Collider Definitions - //////////////////////////////////////////////////////////////////////////////// - - DEFINE_COLLIDER(BVH4ColliderUserGeom,BVHNColliderUserGeom<4>); - -#if defined(__AVX__) - DEFINE_COLLIDER(BVH8ColliderUserGeom,BVHNColliderUserGeom<8>); -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h deleted file mode 100644 index ac4f99c96a..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh.h" -#include "../geometry/trianglev.h" -#include "../geometry/object.h" - -namespace embree -{ - namespace isa - { - template - class BVHNCollider - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::AABBNode AABBNode; - - struct CollideJob - { - CollideJob () {} - - CollideJob (NodeRef ref0, const BBox3fa& bounds0, size_t depth0, - NodeRef ref1, const BBox3fa& bounds1, size_t depth1) - : ref0(ref0), bounds0(bounds0), depth0(depth0), ref1(ref1), bounds1(bounds1), depth1(depth1) {} - - NodeRef ref0; - BBox3fa bounds0; - size_t depth0; - NodeRef ref1; - BBox3fa bounds1; - size_t depth1; - }; - - typedef vector_t> jobvector; - - void split(const CollideJob& job, jobvector& jobs); - - public: - __forceinline BVHNCollider (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) - : scene0(scene0), scene1(scene1), callback(callback), userPtr(userPtr) {} - - public: - virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) = 0; - void collide_recurse(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1, size_t depth0, size_t depth1); - void collide_recurse_entry(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1); - - protected: - Scene* scene0; - Scene* scene1; - RTCCollideFunc callback; - void* userPtr; - }; - - template - class BVHNColliderUserGeom : public BVHNCollider - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::AABBNode AABBNode; - - __forceinline BVHNColliderUserGeom (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) - : BVHNCollider(scene0,scene1,callback,userPtr) {} - - virtual void processLeaf(NodeRef leaf0, NodeRef leaf1); - public: - static void collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr); - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h deleted file mode 100644 index 54021ca6eb..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../bvh/bvh.h" -#include "../common/isa.h" -#include "../common/accel.h" -#include "../common/scene.h" -#include "../geometry/curve_intersector_virtual.h" - -namespace embree -{ - /*! BVH instantiations */ - class BVHFactory - { - public: - enum class BuildVariant { STATIC, DYNAMIC, HIGH_QUALITY }; - enum class IntersectVariant { FAST, ROBUST }; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp deleted file mode 100644 index ea6adc2717..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp +++ /dev/null @@ -1,330 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh_intersector1.h" -#include "node_intersector1.h" -#include "bvh_traverser1.h" - -#include "../geometry/intersector_iterators.h" -#include "../geometry/triangle_intersector.h" -#include "../geometry/trianglev_intersector.h" -#include "../geometry/trianglev_mb_intersector.h" -#include "../geometry/trianglei_intersector.h" -#include "../geometry/quadv_intersector.h" -#include "../geometry/quadi_intersector.h" -#include "../geometry/curveNv_intersector.h" -#include "../geometry/curveNi_intersector.h" -#include "../geometry/curveNi_mb_intersector.h" -#include "../geometry/linei_intersector.h" -#include "../geometry/subdivpatch1_intersector.h" -#include "../geometry/object_intersector.h" -#include "../geometry/instance_intersector.h" -#include "../geometry/subgrid_intersector.h" -#include "../geometry/subgrid_mb_intersector.h" -#include "../geometry/curve_intersector_virtual.h" - -namespace embree -{ - namespace isa - { - template - void BVHNIntersector1::intersect(const Accel::Intersectors* __restrict__ This, - RayHit& __restrict__ ray, - IntersectContext* __restrict__ context) - { - const BVH* __restrict__ bvh = (const BVH*)This->ptr; - - /* we may traverse an empty BVH in case all geometry was invalid */ - if (bvh->root == BVH::emptyNode) - return; - - /* perform per ray precalculations required by the primitive intersector */ - Precalculations pre(ray, bvh); - - /* stack state */ - StackItemT stack[stackSize]; // stack of nodes - StackItemT* stackPtr = stack+1; // current stack pointer - StackItemT* stackEnd = stack+stackSize; - stack[0].ptr = bvh->root; - stack[0].dist = neg_inf; - - if (bvh->root == BVH::emptyNode) - return; - - /* filter out invalid rays */ -#if defined(EMBREE_IGNORE_INVALID_RAYS) - if (!ray.valid()) return; -#endif - /* verify correct input */ - assert(ray.valid()); - assert(ray.tnear() >= 0.0f); - assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); - - /* load the ray into SIMD registers */ - TravRay tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); - - /* initialize the node traverser */ - BVHNNodeTraverser1Hit nodeTraverser; - - /* pop loop */ - while (true) pop: - { - /* pop next node */ - if (unlikely(stackPtr == stack)) break; - stackPtr--; - NodeRef cur = NodeRef(stackPtr->ptr); - - /* if popped node is too far, pop next one */ -#if defined(__AVX512ER__) - /* much faster on KNL */ - if (unlikely(any(vfloat(*(float*)&stackPtr->dist) > tray.tfar))) - continue; -#else - if (unlikely(*(float*)&stackPtr->dist > ray.tfar)) - continue; -#endif - - /* downtraversal loop */ - while (true) - { - /* intersect node */ - size_t mask; vfloat tNear; - STAT3(normal.trav_nodes,1,1,1); - bool nodeIntersected = BVHNNodeIntersector1::intersect(cur, tray, ray.time(), tNear, mask); - if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } - - /* if no child is hit, pop next node */ - if (unlikely(mask == 0)) - goto pop; - - /* select next child and push other children */ - nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); - } - - /* this is a leaf node */ - assert(cur != BVH::emptyNode); - STAT3(normal.trav_leaves,1,1,1); - size_t num; Primitive* prim = (Primitive*)cur.leaf(num); - size_t lazy_node = 0; - PrimitiveIntersector1::intersect(This, pre, ray, context, prim, num, tray, lazy_node); - tray.tfar = ray.tfar; - - /* push lazy node onto stack */ - if (unlikely(lazy_node)) { - stackPtr->ptr = lazy_node; - stackPtr->dist = neg_inf; - stackPtr++; - } - } - } - - template - void BVHNIntersector1::occluded(const Accel::Intersectors* __restrict__ This, - Ray& __restrict__ ray, - IntersectContext* __restrict__ context) - { - const BVH* __restrict__ bvh = (const BVH*)This->ptr; - - /* we may traverse an empty BVH in case all geometry was invalid */ - if (bvh->root == BVH::emptyNode) - return; - - /* early out for already occluded rays */ - if (unlikely(ray.tfar < 0.0f)) - return; - - /* perform per ray precalculations required by the primitive intersector */ - Precalculations pre(ray, bvh); - - /* stack state */ - NodeRef stack[stackSize]; // stack of nodes that still need to get traversed - NodeRef* stackPtr = stack+1; // current stack pointer - NodeRef* stackEnd = stack+stackSize; - stack[0] = bvh->root; - - /* filter out invalid rays */ -#if defined(EMBREE_IGNORE_INVALID_RAYS) - if (!ray.valid()) return; -#endif - - /* verify correct input */ - assert(ray.valid()); - assert(ray.tnear() >= 0.0f); - assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); - - /* load the ray into SIMD registers */ - TravRay tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); - - /* initialize the node traverser */ - BVHNNodeTraverser1Hit nodeTraverser; - - /* pop loop */ - while (true) pop: - { - /* pop next node */ - if (unlikely(stackPtr == stack)) break; - stackPtr--; - NodeRef cur = (NodeRef)*stackPtr; - - /* downtraversal loop */ - while (true) - { - /* intersect node */ - size_t mask; vfloat tNear; - STAT3(shadow.trav_nodes,1,1,1); - bool nodeIntersected = BVHNNodeIntersector1::intersect(cur, tray, ray.time(), tNear, mask); - if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } - - /* if no child is hit, pop next node */ - if (unlikely(mask == 0)) - goto pop; - - /* select next child and push other children */ - nodeTraverser.traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd); - } - - /* this is a leaf node */ - assert(cur != BVH::emptyNode); - STAT3(shadow.trav_leaves,1,1,1); - size_t num; Primitive* prim = (Primitive*)cur.leaf(num); - size_t lazy_node = 0; - if (PrimitiveIntersector1::occluded(This, pre, ray, context, prim, num, tray, lazy_node)) { - ray.tfar = neg_inf; - break; - } - - /* push lazy node onto stack */ - if (unlikely(lazy_node)) { - *stackPtr = (NodeRef)lazy_node; - stackPtr++; - } - } - } - - template - struct PointQueryDispatch - { - typedef typename PrimitiveIntersector1::Precalculations Precalculations; - typedef typename PrimitiveIntersector1::Primitive Primitive; - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::AABBNode AABBNode; - typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; - - static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store - - /* right now AVX512KNL SIMD extension only for standard node types */ - static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend::size : N; - - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) - { - const BVH* __restrict__ bvh = (const BVH*)This->ptr; - - /* we may traverse an empty BVH in case all geometry was invalid */ - if (bvh->root == BVH::emptyNode) - return false; - - /* stack state */ - StackItemT stack[stackSize]; // stack of nodes - StackItemT* stackPtr = stack+1; // current stack pointer - StackItemT* stackEnd = stack+stackSize; - stack[0].ptr = bvh->root; - stack[0].dist = neg_inf; - - /* verify correct input */ - assert(!(types & BVH_MB) || (query->time >= 0.0f && query->time <= 1.0f)); - - /* load the point query into SIMD registers */ - TravPointQuery tquery(query->p, context->query_radius); - - /* initialize the node traverser */ - BVHNNodeTraverser1Hit nodeTraverser; - - bool changed = false; - float cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE - ? query->radius * query->radius - : dot(context->query_radius, context->query_radius); - - /* pop loop */ - while (true) pop: - { - /* pop next node */ - if (unlikely(stackPtr == stack)) break; - stackPtr--; - NodeRef cur = NodeRef(stackPtr->ptr); - - /* if popped node is too far, pop next one */ - if (unlikely(*(float*)&stackPtr->dist > cull_radius)) - continue; - - /* downtraversal loop */ - while (true) - { - /* intersect node */ - size_t mask; vfloat tNear; - STAT3(point_query.trav_nodes,1,1,1); - bool nodeIntersected; - if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { - nodeIntersected = BVHNNodePointQuerySphere1::pointQuery(cur, tquery, query->time, tNear, mask); - } else { - nodeIntersected = BVHNNodePointQueryAABB1 ::pointQuery(cur, tquery, query->time, tNear, mask); - } - if (unlikely(!nodeIntersected)) { STAT3(point_query.trav_nodes,-1,-1,-1); break; } - - /* if no child is hit, pop next node */ - if (unlikely(mask == 0)) - goto pop; - - /* select next child and push other children */ - nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); - } - - /* this is a leaf node */ - assert(cur != BVH::emptyNode); - STAT3(point_query.trav_leaves,1,1,1); - size_t num; Primitive* prim = (Primitive*)cur.leaf(num); - size_t lazy_node = 0; - if (PrimitiveIntersector1::pointQuery(This, query, context, prim, num, tquery, lazy_node)) - { - changed = true; - tquery.rad = context->query_radius; - cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE - ? query->radius * query->radius - : dot(context->query_radius, context->query_radius); - } - - /* push lazy node onto stack */ - if (unlikely(lazy_node)) { - stackPtr->ptr = lazy_node; - stackPtr->dist = neg_inf; - stackPtr++; - } - } - return changed; - } - }; - - /* disable point queries for not yet supported geometry types */ - template - struct PointQueryDispatch { - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } - }; - - template - struct PointQueryDispatch { - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } - }; - - template - struct PointQueryDispatch { - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } - }; - - template - bool BVHNIntersector1::pointQuery( - const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) - { - return PointQueryDispatch::pointQuery(This, query, context); - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h deleted file mode 100644 index 1a269c319a..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh.h" -#include "../common/ray.h" -#include "../common/point_query.h" - -namespace embree -{ - namespace isa - { - /*! BVH single ray intersector. */ - template - class BVHNIntersector1 - { - /* shortcuts for frequently used types */ - typedef typename PrimitiveIntersector1::Precalculations Precalculations; - typedef typename PrimitiveIntersector1::Primitive Primitive; - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::AABBNode AABBNode; - typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; - - static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store - - /* right now AVX512KNL SIMD extension only for standard node types */ - static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend::size : N; - - public: - static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context); - static void occluded (const Accel::Intersectors* This, Ray& ray, IntersectContext* context); - static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp deleted file mode 100644 index 989f7354fd..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh_intersector1.cpp" - -namespace embree -{ - namespace isa - { - int getISA() { - return VerifyMultiTargetLinking::getISA(); - } - - //////////////////////////////////////////////////////////////////////////////// - /// BVH4Intersector1 Definitions - //////////////////////////////////////////////////////////////////////////////// - - IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersector1 >)); - IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersector1 >)); - - IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersector1 >)); - IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersector1 >)); - - IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4Intersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1 > >)); - IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1 > >)); - IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1 > >)); - IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1 > >)); - - IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1 > >)); - IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1 > >)); - IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1 > >)); - IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1 > >)); - - IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1 > >)); - IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1 > >)); - IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1 > >)); - IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1 > >)); - - IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1 > >)); - IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1 > >)); - - IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1Intersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector1>)); - IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1MBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubdivPatch1MBIntersector1>)); - - IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1> >)); - IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1> >)); - - IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1 >)); - IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1 >)); - - IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1 > >)); - IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1 > >)); - - IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersector1Moeller<4 COMMA true> >)); - IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); - - IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersector1Pluecker<4 COMMA true> >)); - //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); - - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h deleted file mode 100644 index d764cc928d..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh.h" -#include "../common/ray.h" -#include "../common/stack_item.h" -#include "node_intersector_frustum.h" - -namespace embree -{ - namespace isa - { - template - struct TravRayK; - - /*! BVH hybrid packet intersector. Switches between packet and single ray traversal (optional). */ - template - class BVHNIntersectorKHybrid - { - /* right now AVX512KNL SIMD extension only for standard node types */ - static const size_t Nx = types == BVH_AN1 ? vextend::size : N; - - /* shortcuts for frequently used types */ - typedef typename PrimitiveIntersectorK::Precalculations Precalculations; - typedef typename PrimitiveIntersectorK::Primitive Primitive; - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::BaseNode BaseNode; - typedef typename BVH::AABBNode AABBNode; - - static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store - static const size_t stackSizeChunk = 1+(N-1)*BVH::maxDepth; - - static const size_t switchThresholdIncoherent = \ - (K==4) ? 3 : - (K==8) ? ((N==4) ? 5 : 7) : - (K==16) ? 14 : // 14 seems to work best for KNL due to better ordered chunk traversal - 0; - - private: - static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, - RayHitK& ray, const TravRayK& tray, IntersectContext* context); - static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, - RayK& ray, const TravRayK& tray, IntersectContext* context); - - public: - static void intersect(vint* valid, Accel::Intersectors* This, RayHitK& ray, IntersectContext* context); - static void occluded (vint* valid, Accel::Intersectors* This, RayK& ray, IntersectContext* context); - - static void intersectCoherent(vint* valid, Accel::Intersectors* This, RayHitK& ray, IntersectContext* context); - static void occludedCoherent (vint* valid, Accel::Intersectors* This, RayK& ray, IntersectContext* context); - - }; - - /*! BVH packet intersector. */ - template - class BVHNIntersectorKChunk : public BVHNIntersectorKHybrid {}; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h deleted file mode 100644 index 83d1fb4d3d..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "node_intersector_packet_stream.h" -#include "node_intersector_frustum.h" -#include "bvh_traverser_stream.h" - -namespace embree -{ - namespace isa - { - /*! BVH ray stream intersector. */ - template - class BVHNIntersectorStream - { - static const int Nxd = (Nx == N) ? N : Nx/2; - - /* shortcuts for frequently used types */ - template using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type; - template using PrimitiveK = typename PrimitiveIntersectorK::PrimitiveK; - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::BaseNode BaseNode; - typedef typename BVH::AABBNode AABBNode; - typedef typename BVH::AABBNodeMB AABBNodeMB; - - template - __forceinline static size_t initPacketsAndFrustum(RayK** inputPackets, size_t numOctantRays, - TravRayKStream* packets, Frustum& frustum, bool& commonOctant) - { - const size_t numPackets = (numOctantRays+K-1)/K; - - Vec3vf tmp_min_rdir(pos_inf); - Vec3vf tmp_max_rdir(neg_inf); - Vec3vf tmp_min_org(pos_inf); - Vec3vf tmp_max_org(neg_inf); - vfloat tmp_min_dist(pos_inf); - vfloat tmp_max_dist(neg_inf); - - size_t m_active = 0; - for (size_t i = 0; i < numPackets; i++) - { - const vfloat tnear = inputPackets[i]->tnear(); - const vfloat tfar = inputPackets[i]->tfar; - vbool m_valid = (tnear <= tfar) & (tnear >= 0.0f); - -#if defined(EMBREE_IGNORE_INVALID_RAYS) - m_valid &= inputPackets[i]->valid(); -#endif - - m_active |= (size_t)movemask(m_valid) << (i*K); - - vfloat packet_min_dist = max(tnear, 0.0f); - vfloat packet_max_dist = select(m_valid, tfar, neg_inf); - tmp_min_dist = min(tmp_min_dist, packet_min_dist); - tmp_max_dist = max(tmp_max_dist, packet_max_dist); - - const Vec3vf& org = inputPackets[i]->org; - const Vec3vf& dir = inputPackets[i]->dir; - - new (&packets[i]) TravRayKStream(org, dir, packet_min_dist, packet_max_dist); - - tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf(pos_inf))); - tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf(neg_inf))); - tmp_min_org = min(tmp_min_org , select(m_valid,org , Vec3vf(pos_inf))); - tmp_max_org = max(tmp_max_org , select(m_valid,org , Vec3vf(neg_inf))); - } - - m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1); - - - const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x), - reduce_min(tmp_min_rdir.y), - reduce_min(tmp_min_rdir.z)); - - const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x), - reduce_max(tmp_max_rdir.y), - reduce_max(tmp_max_rdir.z)); - - const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x), - reduce_min(tmp_min_org.y), - reduce_min(tmp_min_org.z)); - - const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x), - reduce_max(tmp_max_org.y), - reduce_max(tmp_max_org.z)); - - commonOctant = - (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) && - (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) && - (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f); - - const float frustum_min_dist = reduce_min(tmp_min_dist); - const float frustum_max_dist = reduce_max(tmp_max_dist); - - frustum.init(reduced_min_origin, reduced_max_origin, - reduced_min_rdir, reduced_max_rdir, - frustum_min_dist, frustum_max_dist, - N); - - return m_active; - } - - template - __forceinline static size_t intersectAABBNodePacket(size_t m_active, - const TravRayKStream* packets, - const AABBNode* __restrict__ node, - size_t boxID, - const NearFarPrecalculations& nf) - { - assert(m_active); - const size_t startPacketID = bsf(m_active) / K; - const size_t endPacketID = bsr(m_active) / K; - size_t m_trav_active = 0; - for (size_t i = startPacketID; i <= endPacketID; i++) - { - const size_t m_hit = intersectNodeK(node, boxID, packets[i], nf); - m_trav_active |= m_hit << (i*K); - } - return m_trav_active; - } - - template - __forceinline static size_t traverseCoherentStream(size_t m_active, - TravRayKStream* packets, - const AABBNode* __restrict__ node, - const Frustum& frustum, - size_t* maskK, - vfloat& dist) - { - size_t m_node_hit = intersectNodeFrustum(node, frustum, dist); - const size_t first_index = bsf(m_active); - const size_t first_packetID = first_index / K; - const size_t first_rayID = first_index % K; - size_t m_first_hit = intersectNode1(node, packets[first_packetID], first_rayID, frustum.nf); - - /* this make traversal independent of the ordering of rays */ - size_t m_node = m_node_hit ^ m_first_hit; - while (unlikely(m_node)) - { - const size_t boxID = bscf(m_node); - const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf); - m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID); - maskK[boxID] = m_current; - } - return m_node_hit; - } - - // TODO: explicit 16-wide path for KNL - template - __forceinline static vint traverseIncoherentStream(size_t m_active, - TravRayKStreamFast* __restrict__ packets, - const AABBNode* __restrict__ node, - const NearFarPrecalculations& nf, - const int shiftTable[32]) - { - const vfloat bminX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearX)); - const vfloat bminY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearY)); - const vfloat bminZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearZ)); - const vfloat bmaxX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farX)); - const vfloat bmaxY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farY)); - const vfloat bmaxZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farZ)); - assert(m_active); - vint vmask(zero); - do - { - STAT3(shadow.trav_nodes,1,1,1); - const size_t rayID = bscf(m_active); - assert(rayID < MAX_INTERNAL_STREAM_SIZE); - TravRayKStream &p = packets[rayID / K]; - const size_t i = rayID % K; - const vint bitmask(shiftTable[rayID]); - -#if defined (__aarch64__) - const vfloat tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]); - const vfloat tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]); - const vfloat tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]); - const vfloat tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]); - const vfloat tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]); - const vfloat tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]); -#else - const vfloat tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); - const vfloat tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); - const vfloat tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); - const vfloat tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); - const vfloat tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); - const vfloat tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); -#endif - - const vfloat tNear = maxi(tNearX, tNearY, tNearZ, vfloat(p.tnear[i])); - const vfloat tFar = mini(tFarX , tFarY , tFarZ, vfloat(p.tfar[i])); - -#if defined(__AVX512ER__) - const vboolx m_node((1 << N)-1); - const vbool hit_mask = le(m_node, tNear, tFar); - vmask = mask_or(hit_mask, vmask, vmask, bitmask); -#else - const vbool hit_mask = tNear <= tFar; -#if defined(__AVX2__) - vmask = vmask | (bitmask & vint(hit_mask)); -#else - vmask = select(hit_mask, vmask | bitmask, vmask); -#endif -#endif - } while(m_active); - return vmask; - } - - template - __forceinline static vint traverseIncoherentStream(size_t m_active, - TravRayKStreamRobust* __restrict__ packets, - const AABBNode* __restrict__ node, - const NearFarPrecalculations& nf, - const int shiftTable[32]) - { - const vfloat bminX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearX)); - const vfloat bminY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearY)); - const vfloat bminZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearZ)); - const vfloat bmaxX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farX)); - const vfloat bmaxY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farY)); - const vfloat bmaxZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farZ)); - assert(m_active); - vint vmask(zero); - do - { - STAT3(shadow.trav_nodes,1,1,1); - const size_t rayID = bscf(m_active); - assert(rayID < MAX_INTERNAL_STREAM_SIZE); - TravRayKStream &p = packets[rayID / K]; - const size_t i = rayID % K; - const vint bitmask(shiftTable[rayID]); - const vfloat tNearX = (bminX - p.org.x[i]) * p.rdir.x[i]; - const vfloat tNearY = (bminY - p.org.y[i]) * p.rdir.y[i]; - const vfloat tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i]; - const vfloat tFarX = (bmaxX - p.org.x[i]) * p.rdir.x[i]; - const vfloat tFarY = (bmaxY - p.org.y[i]) * p.rdir.y[i]; - const vfloat tFarZ = (bmaxZ - p.org.z[i]) * p.rdir.z[i]; - const vfloat tNear = maxi(tNearX, tNearY, tNearZ, vfloat(p.tnear[i])); - const vfloat tFar = mini(tFarX , tFarY , tFarZ, vfloat(p.tfar[i])); - const float round_down = 1.0f-2.0f*float(ulp); - const float round_up = 1.0f+2.0f*float(ulp); -#if defined(__AVX512ER__) - const vboolx m_node((1 << N)-1); - const vbool hit_mask = le(m_node, round_down*tNear, round_up*tFar); - vmask = mask_or(hit_mask, vmask, vmask, bitmask); -#else - const vbool hit_mask = round_down*tNear <= round_up*tFar; -#if defined(__AVX2__) - vmask = vmask | (bitmask & vint(hit_mask)); -#else - vmask = select(hit_mask, vmask | bitmask, vmask); -#endif -#endif - } while(m_active); - return vmask; - } - - - static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth; - - public: - static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); - static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); - - private: - template - static void intersectCoherent(Accel::Intersectors* This, RayHitK** inputRays, size_t numRays, IntersectContext* context); - - template - static void occludedCoherent(Accel::Intersectors* This, RayK** inputRays, size_t numRays, IntersectContext* context); - - template - static void occludedIncoherent(Accel::Intersectors* This, RayK** inputRays, size_t numRays, IntersectContext* context); - }; - - - /*! BVH ray stream intersector with direct fallback to packets. */ - template - class BVHNIntersectorStreamPacketFallback - { - public: - static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); - static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); - - private: - template - static void intersectK(Accel::Intersectors* This, RayHitK** inputRays, size_t numRays, IntersectContext* context); - - template - static void occludedK(Accel::Intersectors* This, RayK** inputRays, size_t numRays, IntersectContext* context); - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h deleted file mode 100644 index cdeb923637..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/default.h" -#include "../common/ray.h" -#include "../common/scene.h" - -namespace embree -{ - namespace isa - { - class RayStreamFilter - { - public: - static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context); - static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context); - static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); - static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context); - - static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context); - static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context); - static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); - static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t N, IntersectContext* context); - - private: - template - static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context); - - template - static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context); - - template - static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); - - template - static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context); - }; - } -}; diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h deleted file mode 100644 index baa4a8d805..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh_node_base.h" - -namespace embree -{ - /*! BVHN AABBNode */ - template - struct AABBNode_t : public BaseNode_t - { - using BaseNode_t::children; - - struct Create - { - __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc, size_t numChildren = 0) const - { - AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t),NodeRef::byteNodeAlignment); node->clear(); - return NodeRef::encodeNode(node); - } - }; - - struct Set - { - __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const BBox3fa& bounds) const { - node.getAABBNode()->setRef(i,child); - node.getAABBNode()->setBounds(i,bounds); - } - }; - - struct Create2 - { - template - __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const - { - AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t), NodeRef::byteNodeAlignment); node->clear(); - for (size_t i=0; isetBounds(i,children[i].bounds()); - return NodeRef::encodeNode(node); - } - }; - - struct Set2 - { - template - __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const - { - AABBNode_t* node = ref.getAABBNode(); - for (size_t i=0; isetRef(i,children[i]); - return ref; - } - }; - - struct Set3 - { - Set3 (FastAllocator* allocator, PrimRef* prims) - : allocator(allocator), prims(prims) {} - - template - __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const - { - AABBNode_t* node = ref.getAABBNode(); - for (size_t i=0; isetRef(i,children[i]); - - if (unlikely(precord.alloc_barrier)) - { - PrimRef* begin = &prims[precord.prims.begin()]; - PrimRef* end = &prims[precord.prims.end()]; // FIXME: extended end for spatial split builder!!!!! - size_t bytes = (size_t)end - (size_t)begin; - allocator->addBlock(begin,bytes); - } - - return ref; - } - - FastAllocator* const allocator; - PrimRef* const prims; - }; - - /*! Clears the node. */ - __forceinline void clear() { - lower_x = lower_y = lower_z = pos_inf; - upper_x = upper_y = upper_z = neg_inf; - BaseNode_t::clear(); - } - - /*! Sets bounding box and ID of child. */ - __forceinline void setRef(size_t i, const NodeRef& ref) { - assert(i < N); - children[i] = ref; - } - - /*! Sets bounding box of child. */ - __forceinline void setBounds(size_t i, const BBox3fa& bounds) - { - assert(i < N); - lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; - upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; - } - - /*! Sets bounding box and ID of child. */ - __forceinline void set(size_t i, const NodeRef& ref, const BBox3fa& bounds) { - setBounds(i,bounds); - children[i] = ref; - } - - /*! Returns bounds of node. */ - __forceinline BBox3fa bounds() const { - const Vec3fa lower(reduce_min(lower_x),reduce_min(lower_y),reduce_min(lower_z)); - const Vec3fa upper(reduce_max(upper_x),reduce_max(upper_y),reduce_max(upper_z)); - return BBox3fa(lower,upper); - } - - /*! Returns bounds of specified child. */ - __forceinline BBox3fa bounds(size_t i) const - { - assert(i < N); - const Vec3fa lower(lower_x[i],lower_y[i],lower_z[i]); - const Vec3fa upper(upper_x[i],upper_y[i],upper_z[i]); - return BBox3fa(lower,upper); - } - - /*! Returns extent of bounds of specified child. */ - __forceinline Vec3fa extend(size_t i) const { - return bounds(i).size(); - } - - /*! Returns bounds of all children (implemented later as specializations) */ - __forceinline void bounds(BBox& bounds0, BBox& bounds1, BBox& bounds2, BBox& bounds3) const; - - /*! swap two children of the node */ - __forceinline void swap(size_t i, size_t j) - { - assert(ichildren[i],b->children[j]); - std::swap(a->lower_x[i],b->lower_x[j]); - std::swap(a->lower_y[i],b->lower_y[j]); - std::swap(a->lower_z[i],b->lower_z[j]); - std::swap(a->upper_x[i],b->upper_x[j]); - std::swap(a->upper_y[i],b->upper_y[j]); - std::swap(a->upper_z[i],b->upper_z[j]); - } - - /*! compacts a node (moves empty children to the end) */ - __forceinline static void compact(AABBNode_t* a) - { - /* find right most filled node */ - ssize_t j=N; - for (j=j-1; j>=0; j--) - if (a->child(j) != NodeRef::emptyNode) - break; - - /* replace empty nodes with filled nodes */ - for (ssize_t i=0; ichild(i) == NodeRef::emptyNode) { - a->swap(i,j); - for (j=j-1; j>i; j--) - if (a->child(j) != NodeRef::emptyNode) - break; - } - } - } - - /*! Returns reference to specified child */ - __forceinline NodeRef& child(size_t i) { assert(i lower_x; //!< X dimension of lower bounds of all N children. - vfloat upper_x; //!< X dimension of upper bounds of all N children. - vfloat lower_y; //!< Y dimension of lower bounds of all N children. - vfloat upper_y; //!< Y dimension of upper bounds of all N children. - vfloat lower_z; //!< Z dimension of lower bounds of all N children. - vfloat upper_z; //!< Z dimension of upper bounds of all N children. - }; - - template<> - __forceinline void AABBNode_t,4>::bounds(BBox& bounds0, BBox& bounds1, BBox& bounds2, BBox& bounds3) const { - transpose(lower_x,lower_y,lower_z,vfloat4(zero),bounds0.lower,bounds1.lower,bounds2.lower,bounds3.lower); - transpose(upper_x,upper_y,upper_z,vfloat4(zero),bounds0.upper,bounds1.upper,bounds2.upper,bounds3.upper); - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h deleted file mode 100644 index 501f4bce5b..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh_node_base.h" - -namespace embree -{ - /*! Motion Blur AABBNode */ - template - struct AABBNodeMB_t : public BaseNode_t - { - using BaseNode_t::children; - typedef BVHNodeRecord NodeRecord; - typedef BVHNodeRecordMB NodeRecordMB; - typedef BVHNodeRecordMB4D NodeRecordMB4D; - - struct Create - { - template - __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const - { - AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); - return NodeRef::encodeNode(node); - } - }; - - struct Set - { - template - __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const - { - AABBNodeMB_t* node = ref.getAABBNodeMB(); - - LBBox3fa bounds = empty; - for (size_t i=0; isetRef(i,children[i].ref); - node->setBounds(i,children[i].lbounds); - bounds.extend(children[i].lbounds); - } - return NodeRecordMB(ref,bounds); - } - }; - - struct SetTimeRange - { - __forceinline SetTimeRange(BBox1f tbounds) : tbounds(tbounds) {} - - template - __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const - { - AABBNodeMB_t* node = ref.getAABBNodeMB(); - - LBBox3fa bounds = empty; - for (size_t i=0; isetRef(i, children[i].ref); - node->setBounds(i, children[i].lbounds, tbounds); - bounds.extend(children[i].lbounds); - } - return NodeRecordMB(ref,bounds); - } - - BBox1f tbounds; - }; - - /*! Clears the node. */ - __forceinline void clear() { - lower_x = lower_y = lower_z = vfloat(pos_inf); - upper_x = upper_y = upper_z = vfloat(neg_inf); - lower_dx = lower_dy = lower_dz = vfloat(0.0f); - upper_dx = upper_dy = upper_dz = vfloat(0.0f); - BaseNode_t::clear(); - } - - /*! Sets ID of child. */ - __forceinline void setRef(size_t i, NodeRef ref) { - children[i] = ref; - } - - /*! Sets bounding box of child. */ - __forceinline void setBounds(size_t i, const BBox3fa& bounds0_i, const BBox3fa& bounds1_i) - { - /*! for empty bounds we have to avoid inf-inf=nan */ - BBox3fa bounds0(min(bounds0_i.lower,Vec3fa(+FLT_MAX)),max(bounds0_i.upper,Vec3fa(-FLT_MAX))); - BBox3fa bounds1(min(bounds1_i.lower,Vec3fa(+FLT_MAX)),max(bounds1_i.upper,Vec3fa(-FLT_MAX))); - bounds0 = bounds0.enlarge_by(4.0f*float(ulp)); - bounds1 = bounds1.enlarge_by(4.0f*float(ulp)); - Vec3fa dlower = bounds1.lower-bounds0.lower; - Vec3fa dupper = bounds1.upper-bounds0.upper; - - lower_x[i] = bounds0.lower.x; lower_y[i] = bounds0.lower.y; lower_z[i] = bounds0.lower.z; - upper_x[i] = bounds0.upper.x; upper_y[i] = bounds0.upper.y; upper_z[i] = bounds0.upper.z; - - lower_dx[i] = dlower.x; lower_dy[i] = dlower.y; lower_dz[i] = dlower.z; - upper_dx[i] = dupper.x; upper_dy[i] = dupper.y; upper_dz[i] = dupper.z; - } - - /*! Sets bounding box of child. */ - __forceinline void setBounds(size_t i, const LBBox3fa& bounds) { - setBounds(i, bounds.bounds0, bounds.bounds1); - } - - /*! Sets bounding box of child. */ - __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) { - setBounds(i, bounds.global(tbounds)); - } - - /*! Sets bounding box and ID of child. */ - __forceinline void set(size_t i, NodeRef ref, const BBox3fa& bounds) { - lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; - upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; - children[i] = ref; - } - - /*! Sets bounding box and ID of child. */ - __forceinline void set(size_t i, const NodeRecordMB4D& child) - { - setRef(i, child.ref); - setBounds(i, child.lbounds, child.dt); - } - - /*! Return bounding box for time 0 */ - __forceinline BBox3fa bounds0(size_t i) const { - return BBox3fa(Vec3fa(lower_x[i],lower_y[i],lower_z[i]), - Vec3fa(upper_x[i],upper_y[i],upper_z[i])); - } - - /*! Return bounding box for time 1 */ - __forceinline BBox3fa bounds1(size_t i) const { - return BBox3fa(Vec3fa(lower_x[i]+lower_dx[i],lower_y[i]+lower_dy[i],lower_z[i]+lower_dz[i]), - Vec3fa(upper_x[i]+upper_dx[i],upper_y[i]+upper_dy[i],upper_z[i]+upper_dz[i])); - } - - /*! Returns bounds of node. */ - __forceinline BBox3fa bounds() const { - return BBox3fa(Vec3fa(reduce_min(min(lower_x,lower_x+lower_dx)), - reduce_min(min(lower_y,lower_y+lower_dy)), - reduce_min(min(lower_z,lower_z+lower_dz))), - Vec3fa(reduce_max(max(upper_x,upper_x+upper_dx)), - reduce_max(max(upper_y,upper_y+upper_dy)), - reduce_max(max(upper_z,upper_z+upper_dz)))); - } - - /*! Return bounding box of child i */ - __forceinline BBox3fa bounds(size_t i) const { - return merge(bounds0(i),bounds1(i)); - } - - /*! Return linear bounding box of child i */ - __forceinline LBBox3fa lbounds(size_t i) const { - return LBBox3fa(bounds0(i),bounds1(i)); - } - - /*! Return bounding box of child i at specified time */ - __forceinline BBox3fa bounds(size_t i, float time) const { - return lerp(bounds0(i),bounds1(i),time); - } - - /*! Returns the expected surface area when randomly sampling the time. */ - __forceinline float expectedHalfArea(size_t i) const { - return lbounds(i).expectedHalfArea(); - } - - /*! Returns the expected surface area when randomly sampling the time. */ - __forceinline float expectedHalfArea(size_t i, const BBox1f& t0t1) const { - return lbounds(i).expectedHalfArea(t0t1); - } - - /*! swap two children of the node */ - __forceinline void swap(size_t i, size_t j) - { - assert(i=0; j--) - if (a->child(j) != NodeRef::emptyNode) - break; - - /* replace empty nodes with filled nodes */ - for (ssize_t i=0; ichild(i) == NodeRef::emptyNode) { - a->swap(i,j); - for (j=j-1; j>i; j--) - if (a->child(j) != NodeRef::emptyNode) - break; - } - } - } - - /*! Returns reference to specified child */ - __forceinline NodeRef& child(size_t i) { assert(i lower_x; //!< X dimension of lower bounds of all N children. - vfloat upper_x; //!< X dimension of upper bounds of all N children. - vfloat lower_y; //!< Y dimension of lower bounds of all N children. - vfloat upper_y; //!< Y dimension of upper bounds of all N children. - vfloat lower_z; //!< Z dimension of lower bounds of all N children. - vfloat upper_z; //!< Z dimension of upper bounds of all N children. - - vfloat lower_dx; //!< X dimension of lower bounds of all N children. - vfloat upper_dx; //!< X dimension of upper bounds of all N children. - vfloat lower_dy; //!< Y dimension of lower bounds of all N children. - vfloat upper_dy; //!< Y dimension of upper bounds of all N children. - vfloat lower_dz; //!< Z dimension of lower bounds of all N children. - vfloat upper_dz; //!< Z dimension of upper bounds of all N children. - }; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h deleted file mode 100644 index e968bbbc39..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh_node_aabb_mb.h" - -namespace embree -{ - /*! Aligned 4D Motion Blur Node */ - template - struct AABBNodeMB4D_t : public AABBNodeMB_t - { - using BaseNode_t::children; - using AABBNodeMB_t::set; - - typedef BVHNodeRecord NodeRecord; - typedef BVHNodeRecordMB NodeRecordMB; - typedef BVHNodeRecordMB4D NodeRecordMB4D; - - struct Create - { - template - __forceinline NodeRef operator() (BuildRecord*, const size_t, const FastAllocator::CachedAllocator& alloc, bool hasTimeSplits = true) const - { - if (hasTimeSplits) - { - AABBNodeMB4D_t* node = (AABBNodeMB4D_t*) alloc.malloc0(sizeof(AABBNodeMB4D_t),NodeRef::byteNodeAlignment); node->clear(); - return NodeRef::encodeNode(node); - } - else - { - AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); - return NodeRef::encodeNode(node); - } - } - }; - - struct Set - { - template - __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const - { - if (likely(ref.isAABBNodeMB())) { - for (size_t i=0; iset(i, children[i]); - } else { - for (size_t i=0; iset(i, children[i]); - } - } - }; - - /*! Clears the node. */ - __forceinline void clear() { - lower_t = vfloat(pos_inf); - upper_t = vfloat(neg_inf); - AABBNodeMB_t::clear(); - } - - /*! Sets bounding box of child. */ - __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) - { - AABBNodeMB_t::setBounds(i, bounds.global(tbounds)); - lower_t[i] = tbounds.lower; - upper_t[i] = tbounds.upper == 1.0f ? 1.0f+float(ulp) : tbounds.upper; - } - - /*! Sets bounding box and ID of child. */ - __forceinline void set(size_t i, const NodeRecordMB4D& child) { - AABBNodeMB_t::setRef(i,child.ref); - setBounds(i, child.lbounds, child.dt); - } - - /*! Returns the expected surface area when randomly sampling the time. */ - __forceinline float expectedHalfArea(size_t i) const { - return AABBNodeMB_t::lbounds(i).expectedHalfArea(timeRange(i)); - } - - /*! returns time range for specified child */ - __forceinline BBox1f timeRange(size_t i) const { - return BBox1f(lower_t[i],upper_t[i]); - } - - /*! stream output operator */ - friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB4D_t& n) - { - cout << "AABBNodeMB4D {" << embree_endl; - for (size_t i=0; i lower_t; //!< time dimension of lower bounds of all N children - vfloat upper_t; //!< time dimension of upper bounds of all N children - }; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h deleted file mode 100644 index 8268f3b932..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh_node_ref.h" - -namespace embree -{ - - /*! BVHN Base Node */ - template - struct BaseNode_t - { - /*! Clears the node. */ - __forceinline void clear() - { - for (size_t i=0; i - struct OBBNode_t : public BaseNode_t - { - using BaseNode_t::children; - - struct Create - { - __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const - { - OBBNode_t* node = (OBBNode_t*) alloc.malloc0(sizeof(OBBNode_t),NodeRef::byteNodeAlignment); node->clear(); - return NodeRef::encodeNode(node); - } - }; - - struct Set - { - __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const OBBox3fa& bounds) const { - node.ungetAABBNode()->setRef(i,child); - node.ungetAABBNode()->setBounds(i,bounds); - } - }; - - /*! Clears the node. */ - __forceinline void clear() - { - naabb.l.vx = Vec3fa(nan); - naabb.l.vy = Vec3fa(nan); - naabb.l.vz = Vec3fa(nan); - naabb.p = Vec3fa(nan); - BaseNode_t::clear(); - } - - /*! Sets bounding box. */ - __forceinline void setBounds(size_t i, const OBBox3fa& b) - { - assert(i < N); - - AffineSpace3fa space = b.space; - space.p -= b.bounds.lower; - space = AffineSpace3fa::scale(1.0f/max(Vec3fa(1E-19f),b.bounds.upper-b.bounds.lower))*space; - - naabb.l.vx.x[i] = space.l.vx.x; - naabb.l.vx.y[i] = space.l.vx.y; - naabb.l.vx.z[i] = space.l.vx.z; - - naabb.l.vy.x[i] = space.l.vy.x; - naabb.l.vy.y[i] = space.l.vy.y; - naabb.l.vy.z[i] = space.l.vy.z; - - naabb.l.vz.x[i] = space.l.vz.x; - naabb.l.vz.y[i] = space.l.vz.y; - naabb.l.vz.z[i] = space.l.vz.z; - - naabb.p.x[i] = space.p.x; - naabb.p.y[i] = space.p.y; - naabb.p.z[i] = space.p.z; - } - - /*! Sets ID of child. */ - __forceinline void setRef(size_t i, const NodeRef& ref) { - assert(i < N); - children[i] = ref; - } - - /*! Returns the extent of the bounds of the ith child */ - __forceinline Vec3fa extent(size_t i) const { - assert(i naabb; //!< non-axis aligned bounding boxes (bounds are [0,1] in specified space) - }; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h deleted file mode 100644 index 834cf5ec28..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh_node_base.h" - -namespace embree -{ - template - struct OBBNodeMB_t : public BaseNode_t - { - using BaseNode_t::children; - - struct Create - { - __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const - { - OBBNodeMB_t* node = (OBBNodeMB_t*) alloc.malloc0(sizeof(OBBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); - return NodeRef::encodeNode(node); - } - }; - - struct Set - { - __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const LinearSpace3fa& space, const LBBox3fa& lbounds, const BBox1f dt) const { - node.ungetAABBNodeMB()->setRef(i,child); - node.ungetAABBNodeMB()->setBounds(i,space,lbounds.global(dt)); - } - }; - - /*! Clears the node. */ - __forceinline void clear() - { - space0 = one; - //b0.lower = b0.upper = Vec3fa(nan); - b1.lower = b1.upper = Vec3fa(nan); - BaseNode_t::clear(); - } - - /*! Sets space and bounding boxes. */ - __forceinline void setBounds(size_t i, const AffineSpace3fa& space, const LBBox3fa& lbounds) { - setBounds(i,space,lbounds.bounds0,lbounds.bounds1); - } - - /*! Sets space and bounding boxes. */ - __forceinline void setBounds(size_t i, const AffineSpace3fa& s0, const BBox3fa& a, const BBox3fa& c) - { - assert(i < N); - - AffineSpace3fa space = s0; - space.p -= a.lower; - Vec3fa scale = 1.0f/max(Vec3fa(1E-19f),a.upper-a.lower); - space = AffineSpace3fa::scale(scale)*space; - BBox3fa a1((a.lower-a.lower)*scale,(a.upper-a.lower)*scale); - BBox3fa c1((c.lower-a.lower)*scale,(c.upper-a.lower)*scale); - - space0.l.vx.x[i] = space.l.vx.x; space0.l.vx.y[i] = space.l.vx.y; space0.l.vx.z[i] = space.l.vx.z; - space0.l.vy.x[i] = space.l.vy.x; space0.l.vy.y[i] = space.l.vy.y; space0.l.vy.z[i] = space.l.vy.z; - space0.l.vz.x[i] = space.l.vz.x; space0.l.vz.y[i] = space.l.vz.y; space0.l.vz.z[i] = space.l.vz.z; - space0.p .x[i] = space.p .x; space0.p .y[i] = space.p .y; space0.p .z[i] = space.p .z; - - /*b0.lower.x[i] = a1.lower.x; b0.lower.y[i] = a1.lower.y; b0.lower.z[i] = a1.lower.z; - b0.upper.x[i] = a1.upper.x; b0.upper.y[i] = a1.upper.y; b0.upper.z[i] = a1.upper.z;*/ - - b1.lower.x[i] = c1.lower.x; b1.lower.y[i] = c1.lower.y; b1.lower.z[i] = c1.lower.z; - b1.upper.x[i] = c1.upper.x; b1.upper.y[i] = c1.upper.y; b1.upper.z[i] = c1.upper.z; - } - - /*! Sets ID of child. */ - __forceinline void setRef(size_t i, const NodeRef& ref) { - assert(i < N); - children[i] = ref; - } - - /*! Returns the extent of the bounds of the ith child */ - __forceinline Vec3fa extent0(size_t i) const { - assert(i < N); - const Vec3fa vx(space0.l.vx.x[i],space0.l.vx.y[i],space0.l.vx.z[i]); - const Vec3fa vy(space0.l.vy.x[i],space0.l.vy.y[i],space0.l.vy.z[i]); - const Vec3fa vz(space0.l.vz.x[i],space0.l.vz.y[i],space0.l.vz.z[i]); - return rsqrt(vx*vx + vy*vy + vz*vz); - } - - public: - AffineSpace3vf space0; - //BBox3vf b0; // these are the unit bounds - BBox3vf b1; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h deleted file mode 100644 index 5212821f3f..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h +++ /dev/null @@ -1,265 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh_node_base.h" - -namespace embree -{ - /*! BVHN Quantized Node */ - template - struct __aligned(8) QuantizedBaseNode_t - { - typedef unsigned char T; - static const T MIN_QUAN = 0; - static const T MAX_QUAN = 255; - - /*! Clears the node. */ - __forceinline void clear() { - for (size_t i=0; i &lower, - const vfloat &upper, - T lower_quant[N], - T upper_quant[N], - float &start, - float &scale) - { - /* quantize bounds */ - const vbool m_valid = lower != vfloat(pos_inf); - const float minF = reduce_min(lower); - const float maxF = reduce_max(upper); - float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); - float decode_scale = diff / float(MAX_QUAN); - if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero - assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF); - const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f; - vint ilower = max(vint(floor((lower - vfloat(minF))*vfloat(encode_scale))),MIN_QUAN); - vint iupper = min(vint(ceil ((upper - vfloat(minF))*vfloat(encode_scale))),MAX_QUAN); - - /* lower/upper correction */ - vbool m_lower_correction = (madd(vfloat(ilower),decode_scale,minF)) > lower; - vbool m_upper_correction = (madd(vfloat(iupper),decode_scale,minF)) < upper; - ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN); - iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN); - - /* disable invalid lanes */ - ilower = select(m_valid,ilower,MAX_QUAN); - iupper = select(m_valid,iupper,MIN_QUAN); - - /* store as uchar to memory */ - vint::store(lower_quant,ilower); - vint::store(upper_quant,iupper); - start = minF; - scale = decode_scale; - -#if defined(DEBUG) - vfloat extract_lower( vint::loadu(lower_quant) ); - vfloat extract_upper( vint::loadu(upper_quant) ); - vfloat final_extract_lower = madd(extract_lower,decode_scale,minF); - vfloat final_extract_upper = madd(extract_upper,decode_scale,minF); - assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid)); - assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid)); -#endif - } - - __forceinline void init_dim(AABBNode_t,N>& node) - { - init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x); - init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y); - init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z); - } - - __forceinline vbool validMask() const { return vint::loadu(lower_x) <= vint::loadu(upper_x); } - -#if defined(__AVX512F__) // KNL - __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); } -#endif - __forceinline vfloat dequantizeLowerX() const { return madd(vfloat(vint::loadu(lower_x)),scale.x,vfloat(start.x)); } - - __forceinline vfloat dequantizeUpperX() const { return madd(vfloat(vint::loadu(upper_x)),scale.x,vfloat(start.x)); } - - __forceinline vfloat dequantizeLowerY() const { return madd(vfloat(vint::loadu(lower_y)),scale.y,vfloat(start.y)); } - - __forceinline vfloat dequantizeUpperY() const { return madd(vfloat(vint::loadu(upper_y)),scale.y,vfloat(start.y)); } - - __forceinline vfloat dequantizeLowerZ() const { return madd(vfloat(vint::loadu(lower_z)),scale.z,vfloat(start.z)); } - - __forceinline vfloat dequantizeUpperZ() const { return madd(vfloat(vint::loadu(upper_z)),scale.z,vfloat(start.z)); } - - template - __forceinline vfloat dequantize(const size_t offset) const { return vfloat(vint::loadu(all_planes+offset)); } - -#if defined(__AVX512F__) - __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); } - __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); } - __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); } -#endif - - union { - struct { - T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children - T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children - T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children - T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children - T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children - T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children - }; - T all_planes[6*N]; - }; - - Vec3f start; - Vec3f scale; - - friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) - { - o << "QuantizedBaseNode { " << embree_endl; - o << " start " << n.start << embree_endl; - o << " scale " << n.scale << embree_endl; - o << " lower_x " << vuint::loadu(n.lower_x) << embree_endl; - o << " upper_x " << vuint::loadu(n.upper_x) << embree_endl; - o << " lower_y " << vuint::loadu(n.lower_y) << embree_endl; - o << " upper_y " << vuint::loadu(n.upper_y) << embree_endl; - o << " lower_z " << vuint::loadu(n.lower_z) << embree_endl; - o << " upper_z " << vuint::loadu(n.upper_z) << embree_endl; - o << "}" << embree_endl; - return o; - } - - }; - - template - struct __aligned(8) QuantizedNode_t : public BaseNode_t, QuantizedBaseNode_t - { - using BaseNode_t::children; - using QuantizedBaseNode_t::lower_x; - using QuantizedBaseNode_t::upper_x; - using QuantizedBaseNode_t::lower_y; - using QuantizedBaseNode_t::upper_y; - using QuantizedBaseNode_t::lower_z; - using QuantizedBaseNode_t::upper_z; - using QuantizedBaseNode_t::start; - using QuantizedBaseNode_t::scale; - using QuantizedBaseNode_t::init_dim; - - __forceinline void setRef(size_t i, const NodeRef& ref) { - assert(i < N); - children[i] = ref; - } - - struct Create2 - { - template - __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const - { - __aligned(64) AABBNode_t node; - node.clear(); - for (size_t i=0; iinit(node); - - return (size_t)qnode | NodeRef::tyQuantizedNode; - } - }; - - struct Set2 - { - template - __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const - { - QuantizedNode_t* node = ref.quantizedNode(); - for (size_t i=0; isetRef(i,children[i]); - return ref; - } - }; - - __forceinline void init(AABBNode_t& node) - { - for (size_t i=0;i - struct __aligned(8) QuantizedBaseNodeMB_t - { - QuantizedBaseNode_t node0; - QuantizedBaseNode_t node1; - - /*! Clears the node. */ - __forceinline void clear() { - node0.clear(); - node1.clear(); - } - - /*! Returns bounds of specified child. */ - __forceinline BBox3fa bounds(size_t i) const - { - assert(i < N); - BBox3fa bounds0 = node0.bounds(i); - BBox3fa bounds1 = node1.bounds(i); - bounds0.extend(bounds1); - return bounds0; - } - - /*! Returns extent of bounds of specified child. */ - __forceinline Vec3fa extent(size_t i) const { - return bounds(i).size(); - } - - __forceinline vbool validMask() const { return node0.validMask(); } - - template - __forceinline vfloat dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); } - template - __forceinline vfloat dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); } - template - __forceinline vfloat dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); } - template - __forceinline vfloat dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); } - template - __forceinline vfloat dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); } - template - __forceinline vfloat dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); } - - - template - __forceinline vfloat dequantizeLowerX(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeLowerX()[i]),vfloat(node1.dequantizeLowerX()[i]),t); } - template - __forceinline vfloat dequantizeUpperX(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeUpperX()[i]),vfloat(node1.dequantizeUpperX()[i]),t); } - template - __forceinline vfloat dequantizeLowerY(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeLowerY()[i]),vfloat(node1.dequantizeLowerY()[i]),t); } - template - __forceinline vfloat dequantizeUpperY(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeUpperY()[i]),vfloat(node1.dequantizeUpperY()[i]),t); } - template - __forceinline vfloat dequantizeLowerZ(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeLowerZ()[i]),vfloat(node1.dequantizeLowerZ()[i]),t); } - template - __forceinline vfloat dequantizeUpperZ(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeUpperZ()[i]),vfloat(node1.dequantizeUpperZ()[i]),t); } - - }; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h deleted file mode 100644 index 0f6d4dac7e..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/default.h" -#include "../common/alloc.h" -#include "../common/accel.h" -#include "../common/device.h" -#include "../common/scene.h" -#include "../geometry/primitive.h" -#include "../common/ray.h" - -namespace embree -{ - /* BVH node reference with bounds */ - template - struct BVHNodeRecord - { - __forceinline BVHNodeRecord() {} - __forceinline BVHNodeRecord(NodeRef ref, const BBox3fa& bounds) : ref(ref), bounds((BBox3fx)bounds) {} - __forceinline BVHNodeRecord(NodeRef ref, const BBox3fx& bounds) : ref(ref), bounds(bounds) {} - - NodeRef ref; - BBox3fx bounds; - }; - - template - struct BVHNodeRecordMB - { - __forceinline BVHNodeRecordMB() {} - __forceinline BVHNodeRecordMB(NodeRef ref, const LBBox3fa& lbounds) : ref(ref), lbounds(lbounds) {} - - NodeRef ref; - LBBox3fa lbounds; - }; - - template - struct BVHNodeRecordMB4D - { - __forceinline BVHNodeRecordMB4D() {} - __forceinline BVHNodeRecordMB4D(NodeRef ref, const LBBox3fa& lbounds, const BBox1f& dt) : ref(ref), lbounds(lbounds), dt(dt) {} - - NodeRef ref; - LBBox3fa lbounds; - BBox1f dt; - }; - - template struct BaseNode_t; - template struct AABBNode_t; - template struct AABBNodeMB_t; - template struct AABBNodeMB4D_t; - template struct OBBNode_t; - template struct OBBNodeMB_t; - template struct QuantizedNode_t; - template struct QuantizedNodeMB_t; - - /*! Pointer that points to a node or a list of primitives */ - template - struct NodeRefPtr - { - //template friend class BVHN; - - /*! Number of bytes the nodes and primitives are minimally aligned to.*/ - static const size_t byteAlignment = 16; - static const size_t byteNodeAlignment = 4*N; - - /*! highest address bit is used as barrier for some algorithms */ - static const size_t barrier_mask = (1LL << (8*sizeof(size_t)-1)); - - /*! Masks the bits that store the number of items per leaf. */ - static const size_t align_mask = byteAlignment-1; - static const size_t items_mask = byteAlignment-1; - - /*! different supported node types */ - static const size_t tyAABBNode = 0; - static const size_t tyAABBNodeMB = 1; - static const size_t tyAABBNodeMB4D = 6; - static const size_t tyOBBNode = 2; - static const size_t tyOBBNodeMB = 3; - static const size_t tyQuantizedNode = 5; - static const size_t tyLeaf = 8; - - /*! Empty node */ - static const size_t emptyNode = tyLeaf; - - /*! Invalid node, used as marker in traversal */ - static const size_t invalidNode = (((size_t)-1) & (~items_mask)) | (tyLeaf+0); - static const size_t popRay = (((size_t)-1) & (~items_mask)) | (tyLeaf+1); - - /*! Maximum number of primitive blocks in a leaf. */ - static const size_t maxLeafBlocks = items_mask-tyLeaf; - - /*! Default constructor */ - __forceinline NodeRefPtr () {} - - /*! Construction from integer */ - __forceinline NodeRefPtr (size_t ptr) : ptr(ptr) {} - - /*! Cast to size_t */ - __forceinline operator size_t() const { return ptr; } - - /*! Sets the barrier bit. */ - __forceinline void setBarrier() { -#if defined(__X86_64__) || defined(__aarch64__) - assert(!isBarrier()); - ptr |= barrier_mask; -#else - assert(false); -#endif - } - - /*! Clears the barrier bit. */ - __forceinline void clearBarrier() { -#if defined(__X86_64__) || defined(__aarch64__) - ptr &= ~barrier_mask; -#else - assert(false); -#endif - } - - /*! Checks if this is an barrier. A barrier tells the top level tree rotations how deep to enter the tree. */ - __forceinline bool isBarrier() const { return (ptr & barrier_mask) != 0; } - - /*! checks if this is a leaf */ - __forceinline size_t isLeaf() const { return ptr & tyLeaf; } - - /*! returns node type */ - __forceinline int type() const { return ptr & (size_t)align_mask; } - - /*! checks if this is a node */ - __forceinline int isAABBNode() const { return (ptr & (size_t)align_mask) == tyAABBNode; } - - /*! checks if this is a motion blur node */ - __forceinline int isAABBNodeMB() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB; } - - /*! checks if this is a 4D motion blur node */ - __forceinline int isAABBNodeMB4D() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB4D; } - - /*! checks if this is a node with unaligned bounding boxes */ - __forceinline int isOBBNode() const { return (ptr & (size_t)align_mask) == tyOBBNode; } - - /*! checks if this is a motion blur node with unaligned bounding boxes */ - __forceinline int isOBBNodeMB() const { return (ptr & (size_t)align_mask) == tyOBBNodeMB; } - - /*! checks if this is a quantized node */ - __forceinline int isQuantizedNode() const { return (ptr & (size_t)align_mask) == tyQuantizedNode; } - - /*! Encodes a node */ - static __forceinline NodeRefPtr encodeNode(AABBNode_t* node) { - assert(!((size_t)node & align_mask)); - return NodeRefPtr((size_t) node); - } - - static __forceinline NodeRefPtr encodeNode(AABBNodeMB_t* node) { - assert(!((size_t)node & align_mask)); - return NodeRefPtr((size_t) node | tyAABBNodeMB); - } - - static __forceinline NodeRefPtr encodeNode(AABBNodeMB4D_t* node) { - assert(!((size_t)node & align_mask)); - return NodeRefPtr((size_t) node | tyAABBNodeMB4D); - } - - /*! Encodes an unaligned node */ - static __forceinline NodeRefPtr encodeNode(OBBNode_t* node) { - return NodeRefPtr((size_t) node | tyOBBNode); - } - - /*! Encodes an unaligned motion blur node */ - static __forceinline NodeRefPtr encodeNode(OBBNodeMB_t* node) { - return NodeRefPtr((size_t) node | tyOBBNodeMB); - } - - /*! Encodes a leaf */ - static __forceinline NodeRefPtr encodeLeaf(void* tri, size_t num) { - assert(!((size_t)tri & align_mask)); - assert(num <= maxLeafBlocks); - return NodeRefPtr((size_t)tri | (tyLeaf+min(num,(size_t)maxLeafBlocks))); - } - - /*! Encodes a leaf */ - static __forceinline NodeRefPtr encodeTypedLeaf(void* ptr, size_t ty) { - assert(!((size_t)ptr & align_mask)); - return NodeRefPtr((size_t)ptr | (tyLeaf+ty)); - } - - /*! returns base node pointer */ - __forceinline BaseNode_t* baseNode() - { - assert(!isLeaf()); - return (BaseNode_t*)(ptr & ~(size_t)align_mask); - } - __forceinline const BaseNode_t* baseNode() const - { - assert(!isLeaf()); - return (const BaseNode_t*)(ptr & ~(size_t)align_mask); - } - - /*! returns node pointer */ - __forceinline AABBNode_t* getAABBNode() { assert(isAABBNode()); return ( AABBNode_t*)ptr; } - __forceinline const AABBNode_t* getAABBNode() const { assert(isAABBNode()); return (const AABBNode_t*)ptr; } - - /*! returns motion blur node pointer */ - __forceinline AABBNodeMB_t* getAABBNodeMB() { assert(isAABBNodeMB() || isAABBNodeMB4D()); return ( AABBNodeMB_t*)(ptr & ~(size_t)align_mask); } - __forceinline const AABBNodeMB_t* getAABBNodeMB() const { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (const AABBNodeMB_t*)(ptr & ~(size_t)align_mask); } - - /*! returns 4D motion blur node pointer */ - __forceinline AABBNodeMB4D_t* getAABBNodeMB4D() { assert(isAABBNodeMB4D()); return ( AABBNodeMB4D_t*)(ptr & ~(size_t)align_mask); } - __forceinline const AABBNodeMB4D_t* getAABBNodeMB4D() const { assert(isAABBNodeMB4D()); return (const AABBNodeMB4D_t*)(ptr & ~(size_t)align_mask); } - - /*! returns unaligned node pointer */ - __forceinline OBBNode_t* ungetAABBNode() { assert(isOBBNode()); return ( OBBNode_t*)(ptr & ~(size_t)align_mask); } - __forceinline const OBBNode_t* ungetAABBNode() const { assert(isOBBNode()); return (const OBBNode_t*)(ptr & ~(size_t)align_mask); } - - /*! returns unaligned motion blur node pointer */ - __forceinline OBBNodeMB_t* ungetAABBNodeMB() { assert(isOBBNodeMB()); return ( OBBNodeMB_t*)(ptr & ~(size_t)align_mask); } - __forceinline const OBBNodeMB_t* ungetAABBNodeMB() const { assert(isOBBNodeMB()); return (const OBBNodeMB_t*)(ptr & ~(size_t)align_mask); } - - /*! returns quantized node pointer */ - __forceinline QuantizedNode_t* quantizedNode() { assert(isQuantizedNode()); return ( QuantizedNode_t*)(ptr & ~(size_t)align_mask ); } - __forceinline const QuantizedNode_t* quantizedNode() const { assert(isQuantizedNode()); return (const QuantizedNode_t*)(ptr & ~(size_t)align_mask ); } - - /*! returns leaf pointer */ - __forceinline char* leaf(size_t& num) const { - assert(isLeaf()); - num = (ptr & (size_t)items_mask)-tyLeaf; - return (char*)(ptr & ~(size_t)align_mask); - } - - /*! clear all bit flags */ - __forceinline void clearFlags() { - ptr &= ~(size_t)align_mask; - } - - /*! returns the wideness */ - __forceinline size_t getN() const { return N; } - - public: - size_t ptr; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp deleted file mode 100644 index a273c21e8b..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh_refit.h" -#include "bvh_statistics.h" - -#include "../geometry/linei.h" -#include "../geometry/triangle.h" -#include "../geometry/trianglev.h" -#include "../geometry/trianglei.h" -#include "../geometry/quadv.h" -#include "../geometry/object.h" -#include "../geometry/instance.h" - -namespace embree -{ - namespace isa - { - static const size_t SINGLE_THREAD_THRESHOLD = 4*1024; - - template - __forceinline bool compare(const typename BVHN::NodeRef* a, const typename BVHN::NodeRef* b) - { - size_t sa = *(size_t*)&a->node()->lower_x; - size_t sb = *(size_t*)&b->node()->lower_x; - return sa < sb; - } - - template - BVHNRefitter::BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds) - : bvh(bvh), leafBounds(leafBounds), numSubTrees(0) - { - } - - template - void BVHNRefitter::refit() - { - if (bvh->numPrimitives <= SINGLE_THREAD_THRESHOLD) { - bvh->bounds = LBBox3fa(recurse_bottom(bvh->root)); - } - else - { - BBox3fa subTreeBounds[MAX_NUM_SUB_TREES]; - numSubTrees = 0; - gather_subtree_refs(bvh->root,numSubTrees,0); - if (numSubTrees) - parallel_for(size_t(0), numSubTrees, size_t(1), [&](const range& r) { - for (size_t i=r.begin(); ibounds = LBBox3fa(refit_toplevel(bvh->root,numSubTrees,subTreeBounds,0)); - } - } - - template - void BVHNRefitter::gather_subtree_refs(NodeRef& ref, - size_t &subtrees, - const size_t depth) - { - if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) - { - assert(subtrees < MAX_NUM_SUB_TREES); - subTrees[subtrees++] = ref; - return; - } - - if (ref.isAABBNode()) - { - AABBNode* node = ref.getAABBNode(); - for (size_t i=0; ichild(i); - if (unlikely(child == BVH::emptyNode)) continue; - gather_subtree_refs(child,subtrees,depth+1); - } - } - } - - template - BBox3fa BVHNRefitter::refit_toplevel(NodeRef& ref, - size_t &subtrees, - const BBox3fa *const subTreeBounds, - const size_t depth) - { - if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) - { - assert(subtrees < MAX_NUM_SUB_TREES); - assert(subTrees[subtrees] == ref); - return subTreeBounds[subtrees++]; - } - - if (ref.isAABBNode()) - { - AABBNode* node = ref.getAABBNode(); - BBox3fa bounds[N]; - - for (size_t i=0; ichild(i); - - if (unlikely(child == BVH::emptyNode)) - bounds[i] = BBox3fa(empty); - else - bounds[i] = refit_toplevel(child,subtrees,subTreeBounds,depth+1); - } - - BBox3vf boundsT = transpose(bounds); - - /* set new bounds */ - node->lower_x = boundsT.lower.x; - node->lower_y = boundsT.lower.y; - node->lower_z = boundsT.lower.z; - node->upper_x = boundsT.upper.x; - node->upper_y = boundsT.upper.y; - node->upper_z = boundsT.upper.z; - - return merge(bounds); - } - else - return leafBounds.leafBounds(ref); - } - - // ========================================================= - // ========================================================= - // ========================================================= - - - template - BBox3fa BVHNRefitter::recurse_bottom(NodeRef& ref) - { - /* this is a leaf node */ - if (unlikely(ref.isLeaf())) - return leafBounds.leafBounds(ref); - - /* recurse if this is an internal node */ - AABBNode* node = ref.getAABBNode(); - - /* enable exclusive prefetch for >= AVX platforms */ -#if defined(__AVX__) - BVH::prefetchW(ref); -#endif - BBox3fa bounds[N]; - - for (size_t i=0; ichild(i) == BVH::emptyNode)) - { - bounds[i] = BBox3fa(empty); - } - else - bounds[i] = recurse_bottom(node->child(i)); - - /* AOS to SOA transform */ - BBox3vf boundsT = transpose(bounds); - - /* set new bounds */ - node->lower_x = boundsT.lower.x; - node->lower_y = boundsT.lower.y; - node->lower_z = boundsT.lower.z; - node->upper_x = boundsT.upper.x; - node->upper_y = boundsT.upper.y; - node->upper_z = boundsT.upper.z; - - return merge(bounds); - } - - template - BVHNRefitT::BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode) - : bvh(bvh), builder(builder), refitter(new BVHNRefitter(bvh,*(typename BVHNRefitter::LeafBoundsInterface*)this)), mesh(mesh), topologyVersion(0) {} - - template - void BVHNRefitT::clear() - { - if (builder) - builder->clear(); - } - - template - void BVHNRefitT::build() - { - if (mesh->topologyChanged(topologyVersion)) { - topologyVersion = mesh->getTopologyVersion(); - builder->build(); - } - else - refitter->refit(); - } - - template class BVHNRefitter<4>; -#if defined(__AVX__) - template class BVHNRefitter<8>; -#endif - -#if defined(EMBREE_GEOMETRY_TRIANGLE) - Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); - Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); - Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); - - Builder* BVH4Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4> ((BVH4*)accel,BVH4Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } - Builder* BVH4Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4v>((BVH4*)accel,BVH4Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } - Builder* BVH4Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4i>((BVH4*)accel,BVH4Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } -#if defined(__AVX__) - Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); - Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); - Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); - - Builder* BVH8Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4> ((BVH8*)accel,BVH8Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } - Builder* BVH8Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4v>((BVH8*)accel,BVH8Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } - Builder* BVH8Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4i>((BVH8*)accel,BVH8Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_QUAD) - Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); - Builder* BVH4Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,QuadMesh,Quad4v>((BVH4*)accel,BVH4Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } - -#if defined(__AVX__) - Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); - Builder* BVH8Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,QuadMesh,Quad4v>((BVH8*)accel,BVH8Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } -#endif - -#endif - -#if defined(EMBREE_GEOMETRY_USER) - Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); - Builder* BVH4VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,UserGeometry,Object>((BVH4*)accel,BVH4VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } - -#if defined(__AVX__) - Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); - Builder* BVH8VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,UserGeometry,Object>((BVH8*)accel,BVH8VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } -#endif -#endif - -#if defined(EMBREE_GEOMETRY_INSTANCE) - Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); - Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,Instance,InstancePrimitive>((BVH4*)accel,BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } - -#if defined(__AVX__) - Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); - Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,Instance,InstancePrimitive>((BVH8*)accel,BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } -#endif -#endif - - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h deleted file mode 100644 index 4aa9bdd7cc..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../bvh/bvh.h" - -namespace embree -{ - namespace isa - { - template - class BVHNRefitter - { - public: - - /*! Type shortcuts */ - typedef BVHN BVH; - typedef typename BVH::AABBNode AABBNode; - typedef typename BVH::NodeRef NodeRef; - - struct LeafBoundsInterface { - virtual const BBox3fa leafBounds(NodeRef& ref) const = 0; - }; - - public: - - /*! Constructor. */ - BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds); - - /*! refits the BVH */ - void refit(); - - private: - /* single-threaded subtree extraction based on BVH depth */ - void gather_subtree_refs(NodeRef& ref, - size_t &subtrees, - const size_t depth = 0); - - /* single-threaded top-level refit */ - BBox3fa refit_toplevel(NodeRef& ref, - size_t &subtrees, - const BBox3fa *const subTreeBounds, - const size_t depth = 0); - - /* single-threaded subtree refit */ - BBox3fa recurse_bottom(NodeRef& ref); - - public: - BVH* bvh; //!< BVH to refit - const LeafBoundsInterface& leafBounds; //!< calculates bounds of leaves - - static const size_t MAX_SUB_TREE_EXTRACTION_DEPTH = (N==4) ? 4 : (N==8) ? 3 : 3; - static const size_t MAX_NUM_SUB_TREES = (N==4) ? 256 : (N==8) ? 512 : N*N*N; // N ^ MAX_SUB_TREE_EXTRACTION_DEPTH - size_t numSubTrees; - NodeRef subTrees[MAX_NUM_SUB_TREES]; - }; - - template - class BVHNRefitT : public Builder, public BVHNRefitter::LeafBoundsInterface - { - public: - - /*! Type shortcuts */ - typedef BVHN BVH; - typedef typename BVH::AABBNode AABBNode; - typedef typename BVH::NodeRef NodeRef; - - public: - BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode); - - virtual void build(); - - virtual void clear(); - - virtual const BBox3fa leafBounds (NodeRef& ref) const - { - size_t num; char* prim = ref.leaf(num); - if (unlikely(ref == BVH::emptyNode)) return empty; - - BBox3fa bounds = empty; - for (size_t i=0; i builder; - std::unique_ptr> refitter; - Mesh* mesh; - unsigned int topologyVersion; - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp deleted file mode 100644 index 2bb431bf0e..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh_rotate.h" - -namespace embree -{ - namespace isa - { - /*! Computes half surface area of box. */ - __forceinline float halfArea3f(const BBox& box) { - const vfloat4 d = box.size(); - const vfloat4 a = d*shuffle<1,2,0,3>(d); - return a[0]+a[1]+a[2]; - } - - size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth) - { - /*! nothing to rotate if we reached a leaf node. */ - if (parentRef.isBarrier()) return 0; - if (parentRef.isLeaf()) return 0; - AABBNode* parent = parentRef.getAABBNode(); - - /*! rotate all children first */ - vint4 cdepth; - for (size_t c=0; c<4; c++) - cdepth[c] = (int)rotate(parent->child(c),depth+1); - - /* compute current areas of all children */ - vfloat4 sizeX = parent->upper_x-parent->lower_x; - vfloat4 sizeY = parent->upper_y-parent->lower_y; - vfloat4 sizeZ = parent->upper_z-parent->lower_z; - vfloat4 childArea = madd(sizeX,(sizeY + sizeZ),sizeY*sizeZ); - - /*! get node bounds */ - BBox child1_0,child1_1,child1_2,child1_3; - parent->bounds(child1_0,child1_1,child1_2,child1_3); - - /*! Find best rotation. We pick a first child (child1) and a sub-child - (child2child) of a different second child (child2), and swap child1 - and child2child. We perform the best such swap. */ - float bestArea = 0; - size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1; - for (size_t c2=0; c2<4; c2++) - { - /*! ignore leaf nodes as we cannot descent into them */ - if (parent->child(c2).isBarrier()) continue; - if (parent->child(c2).isLeaf()) continue; - AABBNode* child2 = parent->child(c2).getAABBNode(); - - /*! transpose child bounds */ - BBox child2c0,child2c1,child2c2,child2c3; - child2->bounds(child2c0,child2c1,child2c2,child2c3); - - /*! put child1_0 at each child2 position */ - float cost00 = halfArea3f(merge(child1_0,child2c1,child2c2,child2c3)); - float cost01 = halfArea3f(merge(child2c0,child1_0,child2c2,child2c3)); - float cost02 = halfArea3f(merge(child2c0,child2c1,child1_0,child2c3)); - float cost03 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_0)); - vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03); - vfloat4 min0 = vreduce_min(cost0); - int pos0 = (int)bsf(movemask(min0 == cost0)); - - /*! put child1_1 at each child2 position */ - float cost10 = halfArea3f(merge(child1_1,child2c1,child2c2,child2c3)); - float cost11 = halfArea3f(merge(child2c0,child1_1,child2c2,child2c3)); - float cost12 = halfArea3f(merge(child2c0,child2c1,child1_1,child2c3)); - float cost13 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_1)); - vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13); - vfloat4 min1 = vreduce_min(cost1); - int pos1 = (int)bsf(movemask(min1 == cost1)); - - /*! put child1_2 at each child2 position */ - float cost20 = halfArea3f(merge(child1_2,child2c1,child2c2,child2c3)); - float cost21 = halfArea3f(merge(child2c0,child1_2,child2c2,child2c3)); - float cost22 = halfArea3f(merge(child2c0,child2c1,child1_2,child2c3)); - float cost23 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_2)); - vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23); - vfloat4 min2 = vreduce_min(cost2); - int pos2 = (int)bsf(movemask(min2 == cost2)); - - /*! put child1_3 at each child2 position */ - float cost30 = halfArea3f(merge(child1_3,child2c1,child2c2,child2c3)); - float cost31 = halfArea3f(merge(child2c0,child1_3,child2c2,child2c3)); - float cost32 = halfArea3f(merge(child2c0,child2c1,child1_3,child2c3)); - float cost33 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_3)); - vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33); - vfloat4 min3 = vreduce_min(cost3); - int pos3 = (int)bsf(movemask(min3 == cost3)); - - /*! find best other child */ - vfloat4 area0123 = vfloat4(extract<0>(min0),extract<0>(min1),extract<0>(min2),extract<0>(min3)) - vfloat4(childArea[c2]); - int pos[4] = { pos0,pos1,pos2,pos3 }; - const size_t mbd = BVH4::maxBuildDepth; - vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints - valid &= vint4(int(c2)) != vint4(step); - if (none(valid)) continue; - size_t c1 = select_min(valid,area0123); - float area = area0123[c1]; - if (c1 == c2) continue; // can happen if bounds are NANs - - /*! accept a swap when it reduces cost and is not swapping a node with itself */ - if (area < bestArea) { - bestArea = area; - bestChild1 = c1; - bestChild2 = c2; - bestChild2Child = pos[c1]; - } - } - - /*! if we did not find a swap that improves the SAH then do nothing */ - if (bestChild1 == size_t(-1)) return 1+reduce_max(cdepth); - - /*! perform the best found tree rotation */ - AABBNode* child2 = parent->child(bestChild2).getAABBNode(); - AABBNode::swap(parent,bestChild1,child2,bestChild2Child); - parent->setBounds(bestChild2,child2->bounds()); - AABBNode::compact(parent); - AABBNode::compact(child2); - - /*! This returned depth is conservative as the child that was - * pulled up in the tree could have been on the critical path. */ - cdepth[bestChild1]++; // bestChild1 was pushed down one level - return 1+reduce_max(cdepth); - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h deleted file mode 100644 index 009bef339e..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh.h" - -namespace embree -{ - namespace isa - { - template - class BVHNRotate - { - typedef typename BVHN::NodeRef NodeRef; - - public: - static const bool enabled = false; - - static __forceinline size_t rotate(NodeRef parentRef, size_t depth = 1) { return 0; } - static __forceinline void restructure(NodeRef ref, size_t depth = 1) {} - }; - - /* BVH4 tree rotations */ - template<> - class BVHNRotate<4> - { - typedef BVH4::AABBNode AABBNode; - typedef BVH4::NodeRef NodeRef; - - public: - static const bool enabled = true; - - static size_t rotate(NodeRef parentRef, size_t depth = 1); - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp deleted file mode 100644 index aa56035026..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "bvh_statistics.h" -#include "../../common/algorithms/parallel_reduce.h" - -namespace embree -{ - template - BVHNStatistics::BVHNStatistics (BVH* bvh) : bvh(bvh) - { - double A = max(0.0f,bvh->getLinearBounds().expectedHalfArea()); - stat = statistics(bvh->root,A,BBox1f(0.0f,1.0f)); - } - - template - std::string BVHNStatistics::str() - { - std::ostringstream stream; - stream.setf(std::ios::fixed, std::ios::floatfield); - stream << " primitives = " << bvh->numPrimitives << ", vertices = " << bvh->numVertices << ", depth = " << stat.depth << std::endl; - size_t totalBytes = stat.bytes(bvh); - double totalSAH = stat.sah(bvh); - stream << " total : sah = " << std::setw(7) << std::setprecision(3) << totalSAH << " (100.00%), "; - stream << "#bytes = " << std::setw(7) << std::setprecision(2) << totalBytes/1E6 << " MB (100.00%), "; - stream << "#nodes = " << std::setw(7) << stat.size() << " (" << std::setw(6) << std::setprecision(2) << 100.0*stat.fillRate(bvh) << "% filled), "; - stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(totalBytes)/double(bvh->numPrimitives) << std::endl; - if (stat.statAABBNodes.numNodes ) stream << " getAABBNodes : " << stat.statAABBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; - if (stat.statOBBNodes.numNodes ) stream << " ungetAABBNodes : " << stat.statOBBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; - if (stat.statAABBNodesMB.numNodes ) stream << " getAABBNodesMB : " << stat.statAABBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; - if (stat.statAABBNodesMB4D.numNodes) stream << " getAABBNodesMB4D : " << stat.statAABBNodesMB4D.toString(bvh,totalSAH,totalBytes) << std::endl; - if (stat.statOBBNodesMB.numNodes) stream << " ungetAABBNodesMB : " << stat.statOBBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; - if (stat.statQuantizedNodes.numNodes ) stream << " quantizedNodes : " << stat.statQuantizedNodes.toString(bvh,totalSAH,totalBytes) << std::endl; - if (true) stream << " leaves : " << stat.statLeaf.toString(bvh,totalSAH,totalBytes) << std::endl; - if (true) stream << " histogram : " << stat.statLeaf.histToString() << std::endl; - return stream.str(); - } - - template - typename BVHNStatistics::Statistics BVHNStatistics::statistics(NodeRef node, const double A, const BBox1f t0t1) - { - Statistics s; - assert(t0t1.size() > 0.0f); - double dt = max(0.0f,t0t1.size()); - if (node.isAABBNode()) - { - AABBNode* n = node.getAABBNode(); - s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { - if (n->child(i) == BVH::emptyNode) return Statistics(); - const double Ai = max(0.0f,halfArea(n->extend(i))); - Statistics s = statistics(n->child(i),Ai,t0t1); - s.statAABBNodes.numChildren++; - return s; - }, Statistics::add); - s.statAABBNodes.numNodes++; - s.statAABBNodes.nodeSAH += dt*A; - s.depth++; - } - else if (node.isOBBNode()) - { - OBBNode* n = node.ungetAABBNode(); - s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { - if (n->child(i) == BVH::emptyNode) return Statistics(); - const double Ai = max(0.0f,halfArea(n->extent(i))); - Statistics s = statistics(n->child(i),Ai,t0t1); - s.statOBBNodes.numChildren++; - return s; - }, Statistics::add); - s.statOBBNodes.numNodes++; - s.statOBBNodes.nodeSAH += dt*A; - s.depth++; - } - else if (node.isAABBNodeMB()) - { - AABBNodeMB* n = node.getAABBNodeMB(); - s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { - if (n->child(i) == BVH::emptyNode) return Statistics(); - const double Ai = max(0.0f,n->expectedHalfArea(i,t0t1)); - Statistics s = statistics(n->child(i),Ai,t0t1); - s.statAABBNodesMB.numChildren++; - return s; - }, Statistics::add); - s.statAABBNodesMB.numNodes++; - s.statAABBNodesMB.nodeSAH += dt*A; - s.depth++; - } - else if (node.isAABBNodeMB4D()) - { - AABBNodeMB4D* n = node.getAABBNodeMB4D(); - s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { - if (n->child(i) == BVH::emptyNode) return Statistics(); - const BBox1f t0t1i = intersect(t0t1,n->timeRange(i)); - assert(!t0t1i.empty()); - const double Ai = n->AABBNodeMB::expectedHalfArea(i,t0t1i); - Statistics s = statistics(n->child(i),Ai,t0t1i); - s.statAABBNodesMB4D.numChildren++; - return s; - }, Statistics::add); - s.statAABBNodesMB4D.numNodes++; - s.statAABBNodesMB4D.nodeSAH += dt*A; - s.depth++; - } - else if (node.isOBBNodeMB()) - { - OBBNodeMB* n = node.ungetAABBNodeMB(); - s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { - if (n->child(i) == BVH::emptyNode) return Statistics(); - const double Ai = max(0.0f,halfArea(n->extent0(i))); - Statistics s = statistics(n->child(i),Ai,t0t1); - s.statOBBNodesMB.numChildren++; - return s; - }, Statistics::add); - s.statOBBNodesMB.numNodes++; - s.statOBBNodesMB.nodeSAH += dt*A; - s.depth++; - } - else if (node.isQuantizedNode()) - { - QuantizedNode* n = node.quantizedNode(); - s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { - if (n->child(i) == BVH::emptyNode) return Statistics(); - const double Ai = max(0.0f,halfArea(n->extent(i))); - Statistics s = statistics(n->child(i),Ai,t0t1); - s.statQuantizedNodes.numChildren++; - return s; - }, Statistics::add); - s.statQuantizedNodes.numNodes++; - s.statQuantizedNodes.nodeSAH += dt*A; - s.depth++; - } - else if (node.isLeaf()) - { - size_t num; const char* tri = node.leaf(num); - if (num) - { - for (size_t i=0; iprimTy->getBytes(tri); - s.statLeaf.numPrimsActive += bvh->primTy->sizeActive(tri); - s.statLeaf.numPrimsTotal += bvh->primTy->sizeTotal(tri); - s.statLeaf.numBytes += bytes; - tri+=bytes; - } - s.statLeaf.numLeaves++; - s.statLeaf.numPrimBlocks += num; - s.statLeaf.leafSAH += dt*A*num; - if (num-1 < Statistics::LeafStat::NHIST) { - s.statLeaf.numPrimBlocksHistogram[num-1]++; - } - } - } - else { - // -- GODOT start -- - // throw std::runtime_error("not supported node type in bvh_statistics"); - abort(); - // -- GODOT end -- - } - return s; - } - -#if defined(__AVX__) - template class BVHNStatistics<8>; -#endif - -#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__) - template class BVHNStatistics<4>; -#endif -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h deleted file mode 100644 index 73dfc6fbcc..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh.h" -#include - -namespace embree -{ - template - class BVHNStatistics - { - typedef BVHN BVH; - typedef typename BVH::AABBNode AABBNode; - typedef typename BVH::OBBNode OBBNode; - typedef typename BVH::AABBNodeMB AABBNodeMB; - typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; - typedef typename BVH::OBBNodeMB OBBNodeMB; - typedef typename BVH::QuantizedNode QuantizedNode; - - typedef typename BVH::NodeRef NodeRef; - - struct Statistics - { - template - struct NodeStat - { - NodeStat ( double nodeSAH = 0, - size_t numNodes = 0, - size_t numChildren = 0) - : nodeSAH(nodeSAH), - numNodes(numNodes), - numChildren(numChildren) {} - - double sah(BVH* bvh) const { - return nodeSAH/bvh->getLinearBounds().expectedHalfArea(); - } - - size_t bytes() const { - return numNodes*sizeof(Node); - } - - size_t size() const { - return numNodes; - } - - double fillRateNom () const { return double(numChildren); } - double fillRateDen () const { return double(numNodes*N); } - double fillRate () const { return fillRateNom()/fillRateDen(); } - - __forceinline friend NodeStat operator+ ( const NodeStat& a, const NodeStat& b) - { - return NodeStat(a.nodeSAH + b.nodeSAH, - a.numNodes+b.numNodes, - a.numChildren+b.numChildren); - } - - std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const - { - std::ostringstream stream; - stream.setf(std::ios::fixed, std::ios::floatfield); - stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh); - stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), "; - stream << "#bytes = " << std::setw(7) << std::setprecision(2) << bytes()/1E6 << " MB "; - stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes())/double(bytesTotal) << "%), "; - stream << "#nodes = " << std::setw(7) << numNodes << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate() << "% filled), "; - stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes())/double(bvh->numPrimitives); - return stream.str(); - } - - public: - double nodeSAH; - size_t numNodes; - size_t numChildren; - }; - - struct LeafStat - { - static const int NHIST = 8; - - LeafStat ( double leafSAH = 0.0f, - size_t numLeaves = 0, - size_t numPrimsActive = 0, - size_t numPrimsTotal = 0, - size_t numPrimBlocks = 0, - size_t numBytes = 0) - : leafSAH(leafSAH), - numLeaves(numLeaves), - numPrimsActive(numPrimsActive), - numPrimsTotal(numPrimsTotal), - numPrimBlocks(numPrimBlocks), - numBytes(numBytes) - { - for (size_t i=0; igetLinearBounds().expectedHalfArea(); - } - - size_t bytes(BVH* bvh) const { - return numBytes; - } - - size_t size() const { - return numLeaves; - } - - double fillRateNom (BVH* bvh) const { return double(numPrimsActive); } - double fillRateDen (BVH* bvh) const { return double(numPrimsTotal); } - double fillRate (BVH* bvh) const { return fillRateNom(bvh)/fillRateDen(bvh); } - - __forceinline friend LeafStat operator+ ( const LeafStat& a, const LeafStat& b) - { - LeafStat stat(a.leafSAH + b.leafSAH, - a.numLeaves+b.numLeaves, - a.numPrimsActive+b.numPrimsActive, - a.numPrimsTotal+b.numPrimsTotal, - a.numPrimBlocks+b.numPrimBlocks, - a.numBytes+b.numBytes); - for (size_t i=0; i statAABBNodes = NodeStat(), - NodeStat statOBBNodes = NodeStat(), - NodeStat statAABBNodesMB = NodeStat(), - NodeStat statAABBNodesMB4D = NodeStat(), - NodeStat statOBBNodesMB = NodeStat(), - NodeStat statQuantizedNodes = NodeStat()) - - : depth(depth), - statLeaf(statLeaf), - statAABBNodes(statAABBNodes), - statOBBNodes(statOBBNodes), - statAABBNodesMB(statAABBNodesMB), - statAABBNodesMB4D(statAABBNodesMB4D), - statOBBNodesMB(statOBBNodesMB), - statQuantizedNodes(statQuantizedNodes) {} - - double sah(BVH* bvh) const - { - return statLeaf.sah(bvh) + - statAABBNodes.sah(bvh) + - statOBBNodes.sah(bvh) + - statAABBNodesMB.sah(bvh) + - statAABBNodesMB4D.sah(bvh) + - statOBBNodesMB.sah(bvh) + - statQuantizedNodes.sah(bvh); - } - - size_t bytes(BVH* bvh) const { - return statLeaf.bytes(bvh) + - statAABBNodes.bytes() + - statOBBNodes.bytes() + - statAABBNodesMB.bytes() + - statAABBNodesMB4D.bytes() + - statOBBNodesMB.bytes() + - statQuantizedNodes.bytes(); - } - - size_t size() const - { - return statLeaf.size() + - statAABBNodes.size() + - statOBBNodes.size() + - statAABBNodesMB.size() + - statAABBNodesMB4D.size() + - statOBBNodesMB.size() + - statQuantizedNodes.size(); - } - - double fillRate (BVH* bvh) const - { - double nom = statLeaf.fillRateNom(bvh) + - statAABBNodes.fillRateNom() + - statOBBNodes.fillRateNom() + - statAABBNodesMB.fillRateNom() + - statAABBNodesMB4D.fillRateNom() + - statOBBNodesMB.fillRateNom() + - statQuantizedNodes.fillRateNom(); - double den = statLeaf.fillRateDen(bvh) + - statAABBNodes.fillRateDen() + - statOBBNodes.fillRateDen() + - statAABBNodesMB.fillRateDen() + - statAABBNodesMB4D.fillRateDen() + - statOBBNodesMB.fillRateDen() + - statQuantizedNodes.fillRateDen(); - return nom/den; - } - - friend Statistics operator+ ( const Statistics& a, const Statistics& b ) - { - return Statistics(max(a.depth,b.depth), - a.statLeaf + b.statLeaf, - a.statAABBNodes + b.statAABBNodes, - a.statOBBNodes + b.statOBBNodes, - a.statAABBNodesMB + b.statAABBNodesMB, - a.statAABBNodesMB4D + b.statAABBNodesMB4D, - a.statOBBNodesMB + b.statOBBNodesMB, - a.statQuantizedNodes + b.statQuantizedNodes); - } - - static Statistics add ( const Statistics& a, const Statistics& b ) { - return a+b; - } - - public: - size_t depth; - LeafStat statLeaf; - NodeStat statAABBNodes; - NodeStat statOBBNodes; - NodeStat statAABBNodesMB; - NodeStat statAABBNodesMB4D; - NodeStat statOBBNodesMB; - NodeStat statQuantizedNodes; - }; - - public: - - /* Constructor gathers statistics. */ - BVHNStatistics (BVH* bvh); - - /*! Convert statistics into a string */ - std::string str(); - - double sah() const { - return stat.sah(bvh); - } - - size_t bytesUsed() const { - return stat.bytes(bvh); - } - - private: - Statistics statistics(NodeRef node, const double A, const BBox1f dt); - - private: - BVH* bvh; - Statistics stat; - }; - - typedef BVHNStatistics<4> BVH4Statistics; - typedef BVHNStatistics<8> BVH8Statistics; -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h deleted file mode 100644 index 7f17084b81..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h +++ /dev/null @@ -1,676 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh.h" -#include "node_intersector1.h" -#include "../common/stack_item.h" - -#define NEW_SORTING_CODE 1 - -namespace embree -{ - namespace isa - { - /*! BVH regular node traversal for single rays. */ - template - class BVHNNodeTraverser1Hit; - - /*! Helper functions for fast sorting using AVX512 instructions. */ -#if defined(__AVX512ER__) - - /* KNL code path */ - __forceinline void isort_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p) - { - const vfloat16 dist_shift = align_shift_right<15>(dist,dist); - const vllong8 ptr_shift = align_shift_right<7>(ptr,ptr); - const vbool16 m_geq = d >= dist; - const vbool16 m_geq_shift = m_geq << 1; - dist = select(m_geq,d,dist); - ptr = select(vboold8(m_geq),p,ptr); - dist = select(m_geq_shift,dist_shift,dist); - ptr = select(vboold8(m_geq_shift),ptr_shift,ptr); - } - - __forceinline void isort_quick_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p) - { - //dist = align_shift_right<15>(dist,d); - //ptr = align_shift_right<7>(ptr,p); - dist = align_shift_right<15>(dist,permute(d,vint16(zero))); - ptr = align_shift_right<7>(ptr,permute(p,vllong8(zero))); - } - - template - __forceinline void traverseClosestHitAVX512(NodeRef& cur, - size_t mask, - const vfloat& tNear, - StackItemT*& stackPtr, - StackItemT* stackEnd) - { - assert(mask != 0); - const BaseNode* node = cur.baseNode(); - - vllong8 children( vllong::loadu((void*)node->children) ); - children = vllong8::compact((int)mask,children); - vfloat16 distance = tNear; - distance = vfloat16::compact((int)mask,distance,tNear); - - cur = toScalar(children); - BVHN::prefetch(cur,types); - - mask &= mask-1; - if (likely(mask == 0)) return; - - /* 2 hits: order A0 B0 */ - const vllong8 c0(children); - const vfloat16 d0(distance); - children = align_shift_right<1>(children,children); - distance = align_shift_right<1>(distance,distance); - const vllong8 c1(children); - const vfloat16 d1(distance); - - cur = toScalar(children); - BVHN::prefetch(cur,types); - - /* a '<' keeps the order for equal distances, scenes like powerplant largely benefit from it */ - const vboolf16 m_dist = d0 < d1; - const vfloat16 dist_A0 = select(m_dist, d0, d1); - const vfloat16 dist_B0 = select(m_dist, d1, d0); - const vllong8 ptr_A0 = select(vboold8(m_dist), c0, c1); - const vllong8 ptr_B0 = select(vboold8(m_dist), c1, c0); - - mask &= mask-1; - if (likely(mask == 0)) { - cur = toScalar(ptr_A0); - stackPtr[0].ptr = toScalar(ptr_B0); - *(float*)&stackPtr[0].dist = toScalar(dist_B0); - stackPtr++; - return; - } - - /* 3 hits: order A1 B1 C1 */ - - children = align_shift_right<1>(children,children); - distance = align_shift_right<1>(distance,distance); - - const vllong8 c2(children); - const vfloat16 d2(distance); - - cur = toScalar(children); - BVHN::prefetch(cur,types); - - const vboolf16 m_dist1 = dist_A0 <= d2; - const vfloat16 dist_tmp_B1 = select(m_dist1, d2, dist_A0); - const vllong8 ptr_A1 = select(vboold8(m_dist1), ptr_A0, c2); - const vllong8 ptr_tmp_B1 = select(vboold8(m_dist1), c2, ptr_A0); - - const vboolf16 m_dist2 = dist_B0 <= dist_tmp_B1; - const vfloat16 dist_B1 = select(m_dist2, dist_B0 , dist_tmp_B1); - const vfloat16 dist_C1 = select(m_dist2, dist_tmp_B1, dist_B0); - const vllong8 ptr_B1 = select(vboold8(m_dist2), ptr_B0, ptr_tmp_B1); - const vllong8 ptr_C1 = select(vboold8(m_dist2), ptr_tmp_B1, ptr_B0); - - mask &= mask-1; - if (likely(mask == 0)) { - cur = toScalar(ptr_A1); - stackPtr[0].ptr = toScalar(ptr_C1); - *(float*)&stackPtr[0].dist = toScalar(dist_C1); - stackPtr[1].ptr = toScalar(ptr_B1); - *(float*)&stackPtr[1].dist = toScalar(dist_B1); - stackPtr+=2; - return; - } - - /* 4 hits: order A2 B2 C2 D2 */ - - const vfloat16 dist_A1 = select(m_dist1, dist_A0, d2); - - children = align_shift_right<1>(children,children); - distance = align_shift_right<1>(distance,distance); - - const vllong8 c3(children); - const vfloat16 d3(distance); - - cur = toScalar(children); - BVHN::prefetch(cur,types); - - const vboolf16 m_dist3 = dist_A1 <= d3; - const vfloat16 dist_tmp_B2 = select(m_dist3, d3, dist_A1); - const vllong8 ptr_A2 = select(vboold8(m_dist3), ptr_A1, c3); - const vllong8 ptr_tmp_B2 = select(vboold8(m_dist3), c3, ptr_A1); - - const vboolf16 m_dist4 = dist_B1 <= dist_tmp_B2; - const vfloat16 dist_B2 = select(m_dist4, dist_B1 , dist_tmp_B2); - const vfloat16 dist_tmp_C2 = select(m_dist4, dist_tmp_B2, dist_B1); - const vllong8 ptr_B2 = select(vboold8(m_dist4), ptr_B1, ptr_tmp_B2); - const vllong8 ptr_tmp_C2 = select(vboold8(m_dist4), ptr_tmp_B2, ptr_B1); - - const vboolf16 m_dist5 = dist_C1 <= dist_tmp_C2; - const vfloat16 dist_C2 = select(m_dist5, dist_C1 , dist_tmp_C2); - const vfloat16 dist_D2 = select(m_dist5, dist_tmp_C2, dist_C1); - const vllong8 ptr_C2 = select(vboold8(m_dist5), ptr_C1, ptr_tmp_C2); - const vllong8 ptr_D2 = select(vboold8(m_dist5), ptr_tmp_C2, ptr_C1); - - mask &= mask-1; - if (likely(mask == 0)) { - cur = toScalar(ptr_A2); - stackPtr[0].ptr = toScalar(ptr_D2); - *(float*)&stackPtr[0].dist = toScalar(dist_D2); - stackPtr[1].ptr = toScalar(ptr_C2); - *(float*)&stackPtr[1].dist = toScalar(dist_C2); - stackPtr[2].ptr = toScalar(ptr_B2); - *(float*)&stackPtr[2].dist = toScalar(dist_B2); - stackPtr+=3; - return; - } - - /* >=5 hits: reverse to descending order for writing to stack */ - - const size_t hits = 4 + popcnt(mask); - const vfloat16 dist_A2 = select(m_dist3, dist_A1, d3); - vfloat16 dist(neg_inf); - vllong8 ptr(zero); - - - isort_quick_update(dist,ptr,dist_A2,ptr_A2); - isort_quick_update(dist,ptr,dist_B2,ptr_B2); - isort_quick_update(dist,ptr,dist_C2,ptr_C2); - isort_quick_update(dist,ptr,dist_D2,ptr_D2); - - do { - - children = align_shift_right<1>(children,children); - distance = align_shift_right<1>(distance,distance); - - cur = toScalar(children); - BVHN::prefetch(cur,types); - - const vfloat16 new_dist(permute(distance,vint16(zero))); - const vllong8 new_ptr(permute(children,vllong8(zero))); - - mask &= mask-1; - isort_update(dist,ptr,new_dist,new_ptr); - - } while(mask); - - const vboold8 m_stack_ptr(0x55); // 10101010 (lsb -> msb) - const vboolf16 m_stack_dist(0x4444); // 0010001000100010 (lsb -> msb) - - /* extract current noderef */ - cur = toScalar(permute(ptr,vllong8(hits-1))); - /* rearrange pointers to beginning of 16 bytes block */ - vllong8 stackElementA0; - stackElementA0 = vllong8::expand(m_stack_ptr,ptr,stackElementA0); - /* put distances in between */ - vuint16 stackElementA1((__m512i)stackElementA0); - stackElementA1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementA1); - /* write out first 4 x 16 bytes block to stack */ - vuint16::storeu(stackPtr,stackElementA1); - /* get upper half of dist and ptr */ - dist = align_shift_right<4>(dist,dist); - ptr = align_shift_right<4>(ptr,ptr); - /* assemble and write out second block */ - vllong8 stackElementB0; - stackElementB0 = vllong8::expand(m_stack_ptr,ptr,stackElementB0); - vuint16 stackElementB1((__m512i)stackElementB0); - stackElementB1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementB1); - vuint16::storeu(stackPtr + 4,stackElementB1); - /* increase stack pointer */ - stackPtr += hits-1; - } -#endif - -#if defined(__AVX512VL__) // SKX - - template - __forceinline void isort_update(vint &dist, const vint &d) - { - const vint dist_shift = align_shift_right(dist,dist); - const vboolf m_geq = d >= dist; - const vboolf m_geq_shift = m_geq << 1; - dist = select(m_geq,d,dist); - dist = select(m_geq_shift,dist_shift,dist); - } - - template - __forceinline void isort_quick_update(vint &dist, const vint &d) { - dist = align_shift_right(dist,permute(d,vint(zero))); - } - - __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) { - return toScalar(permutex2var((__m256i)index,n0,n1)); - } - - __forceinline float permuteExtract(const vint8& index, const vfloat8& n) { - return toScalar(permute(n,index)); - } - -#endif - - /* Specialization for BVH4. */ - template - class BVHNNodeTraverser1Hit<4, Nx, types> - { - typedef BVH4 BVH; - typedef BVH4::NodeRef NodeRef; - typedef BVH4::BaseNode BaseNode; - - - public: - /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */ - static __forceinline void traverseClosestHit(NodeRef& cur, - size_t mask, - const vfloat& tNear, - StackItemT*& stackPtr, - StackItemT* stackEnd) - { - assert(mask != 0); -#if defined(__AVX512ER__) - traverseClosestHitAVX512<4,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); -#else - const BaseNode* node = cur.baseNode(); - - /*! one child is hit, continue with that child */ - size_t r = bscf(mask); - cur = node->child(r); - BVH::prefetch(cur,types); - if (likely(mask == 0)) { - assert(cur != BVH::emptyNode); - return; - } - - /*! two children are hit, push far child, and continue with closer child */ - NodeRef c0 = cur; - const unsigned int d0 = ((unsigned int*)&tNear)[r]; - r = bscf(mask); - NodeRef c1 = node->child(r); - BVH::prefetch(c1,types); - const unsigned int d1 = ((unsigned int*)&tNear)[r]; - assert(c0 != BVH::emptyNode); - assert(c1 != BVH::emptyNode); - if (likely(mask == 0)) { - assert(stackPtr < stackEnd); - if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } - else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } - } - -#if NEW_SORTING_CODE == 1 - vint4 s0((size_t)c0,(size_t)d0); - vint4 s1((size_t)c1,(size_t)d1); - r = bscf(mask); - NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; - vint4 s2((size_t)c2,(size_t)d2); - /* 3 hits */ - if (likely(mask == 0)) { - StackItemT::sort3(s0,s1,s2); - *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; - cur = toSizeT(s2); - stackPtr+=2; - return; - } - r = bscf(mask); - NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; - vint4 s3((size_t)c3,(size_t)d3); - /* 4 hits */ - StackItemT::sort4(s0,s1,s2,s3); - *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; - cur = toSizeT(s3); - stackPtr+=3; -#else - /*! Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. */ - assert(stackPtr < stackEnd); - stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; - assert(stackPtr < stackEnd); - stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; - - /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ - assert(stackPtr < stackEnd); - r = bscf(mask); - NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; - assert(c != BVH::emptyNode); - if (likely(mask == 0)) { - sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); - cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; - return; - } - - /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ - assert(stackPtr < stackEnd); - r = bscf(mask); - c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; - assert(c != BVH::emptyNode); - sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); - cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; -#endif -#endif - } - - /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */ - static __forceinline void traverseAnyHit(NodeRef& cur, - size_t mask, - const vfloat& tNear, - NodeRef*& stackPtr, - NodeRef* stackEnd) - { - const BaseNode* node = cur.baseNode(); - - /*! one child is hit, continue with that child */ - size_t r = bscf(mask); - cur = node->child(r); - BVH::prefetch(cur,types); - - /* simpler in sequence traversal order */ - assert(cur != BVH::emptyNode); - if (likely(mask == 0)) return; - assert(stackPtr < stackEnd); - *stackPtr = cur; stackPtr++; - - for (; ;) - { - r = bscf(mask); - cur = node->child(r); BVH::prefetch(cur,types); - assert(cur != BVH::emptyNode); - if (likely(mask == 0)) return; - assert(stackPtr < stackEnd); - *stackPtr = cur; stackPtr++; - } - } - }; - - /* Specialization for BVH8. */ - template - class BVHNNodeTraverser1Hit<8, Nx, types> - { - typedef BVH8 BVH; - typedef BVH8::NodeRef NodeRef; - typedef BVH8::BaseNode BaseNode; - -#if defined(__AVX512VL__) - template - static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur, - size_t mask, - const vfloat8& tNear, - StackItemT*& stackPtr, - StackItemT* stackEnd) - { - assert(mask != 0); - const BaseNode* node = cur.baseNode(); - const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]); - const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]); - vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step); - distance_i = vint8::compact((int)mask,distance_i,distance_i); - cur = permuteExtract(distance_i,n0,n1); - BVH::prefetch(cur,types); - - mask &= mask-1; - if (likely(mask == 0)) return; - - /* 2 hits: order A0 B0 */ - const vint8 d0(distance_i); - const vint8 d1(shuffle<1>(distance_i)); - cur = permuteExtract(d1,n0,n1); - BVH::prefetch(cur,types); - - const vint8 dist_A0 = min(d0, d1); - const vint8 dist_B0 = max(d0, d1); - assert(dist_A0[0] < dist_B0[0]); - - mask &= mask-1; - if (likely(mask == 0)) { - cur = permuteExtract(dist_A0,n0,n1); - stackPtr[0].ptr = permuteExtract(dist_B0,n0,n1); - *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear); - stackPtr++; - return; - } - - /* 3 hits: order A1 B1 C1 */ - - const vint8 d2(shuffle<2>(distance_i)); - cur = permuteExtract(d2,n0,n1); - BVH::prefetch(cur,types); - - const vint8 dist_A1 = min(dist_A0,d2); - const vint8 dist_tmp_B1 = max(dist_A0,d2); - const vint8 dist_B1 = min(dist_B0,dist_tmp_B1); - const vint8 dist_C1 = max(dist_B0,dist_tmp_B1); - assert(dist_A1[0] < dist_B1[0]); - assert(dist_B1[0] < dist_C1[0]); - - mask &= mask-1; - if (likely(mask == 0)) { - cur = permuteExtract(dist_A1,n0,n1); - stackPtr[0].ptr = permuteExtract(dist_C1,n0,n1); - *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear); - stackPtr[1].ptr = permuteExtract(dist_B1,n0,n1); - *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear); - stackPtr+=2; - return; - } - - /* 4 hits: order A2 B2 C2 D2 */ - - const vint8 d3(shuffle<3>(distance_i)); - cur = permuteExtract(d3,n0,n1); - BVH::prefetch(cur,types); - - const vint8 dist_A2 = min(dist_A1,d3); - const vint8 dist_tmp_B2 = max(dist_A1,d3); - const vint8 dist_B2 = min(dist_B1,dist_tmp_B2); - const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2); - const vint8 dist_C2 = min(dist_C1,dist_tmp_C2); - const vint8 dist_D2 = max(dist_C1,dist_tmp_C2); - assert(dist_A2[0] < dist_B2[0]); - assert(dist_B2[0] < dist_C2[0]); - assert(dist_C2[0] < dist_D2[0]); - - mask &= mask-1; - if (likely(mask == 0)) { - cur = permuteExtract(dist_A2,n0,n1); - stackPtr[0].ptr = permuteExtract(dist_D2,n0,n1); - *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear); - stackPtr[1].ptr = permuteExtract(dist_C2,n0,n1); - *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear); - stackPtr[2].ptr = permuteExtract(dist_B2,n0,n1); - *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear); - stackPtr+=3; - return; - } - - /* >=5 hits: reverse to descending order for writing to stack */ - - distance_i = align_shift_right<3>(distance_i,distance_i); - const size_t hits = 4 + popcnt(mask); - vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert - - isort_quick_update(dist,dist_A2); - isort_quick_update(dist,dist_B2); - isort_quick_update(dist,dist_C2); - isort_quick_update(dist,dist_D2); - - do { - - distance_i = align_shift_right<1>(distance_i,distance_i); - cur = permuteExtract(distance_i,n0,n1); - BVH::prefetch(cur,types); - const vint8 new_dist(permute(distance_i,vint8(zero))); - mask &= mask-1; - isort_update(dist,new_dist); - - } while(mask); - - for (size_t i=0; i<7; i++) - assert(dist[i+0]>=dist[i+1]); - - for (size_t i=0;iptr = permuteExtract(dist,n0,n1); - *(float*)&stackPtr->dist = permuteExtract(dist,tNear); - dist = align_shift_right<1>(dist,dist); - stackPtr++; - } - cur = permuteExtract(dist,n0,n1); - } -#endif - - public: - static __forceinline void traverseClosestHit(NodeRef& cur, - size_t mask, - const vfloat& tNear, - StackItemT*& stackPtr, - StackItemT* stackEnd) - { - assert(mask != 0); -#if defined(__AVX512ER__) - traverseClosestHitAVX512<8,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); -#elif defined(__AVX512VL__) - traverseClosestHitAVX512VL8(cur,mask,tNear,stackPtr,stackEnd); -#else - - const BaseNode* node = cur.baseNode(); - - /*! one child is hit, continue with that child */ - size_t r = bscf(mask); - cur = node->child(r); - BVH::prefetch(cur,types); - if (likely(mask == 0)) { - assert(cur != BVH::emptyNode); - return; - } - - /*! two children are hit, push far child, and continue with closer child */ - NodeRef c0 = cur; - const unsigned int d0 = ((unsigned int*)&tNear)[r]; - r = bscf(mask); - NodeRef c1 = node->child(r); - BVH::prefetch(c1,types); - const unsigned int d1 = ((unsigned int*)&tNear)[r]; - - assert(c0 != BVH::emptyNode); - assert(c1 != BVH::emptyNode); - if (likely(mask == 0)) { - assert(stackPtr < stackEnd); - if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } - else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } - } -#if NEW_SORTING_CODE == 1 - vint4 s0((size_t)c0,(size_t)d0); - vint4 s1((size_t)c1,(size_t)d1); - - r = bscf(mask); - NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; - vint4 s2((size_t)c2,(size_t)d2); - /* 3 hits */ - if (likely(mask == 0)) { - StackItemT::sort3(s0,s1,s2); - *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; - cur = toSizeT(s2); - stackPtr+=2; - return; - } - r = bscf(mask); - NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; - vint4 s3((size_t)c3,(size_t)d3); - /* 4 hits */ - if (likely(mask == 0)) { - StackItemT::sort4(s0,s1,s2,s3); - *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; - cur = toSizeT(s3); - stackPtr+=3; - return; - } - *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3; - /*! fallback case if more than 4 children are hit */ - StackItemT* stackFirst = stackPtr; - stackPtr+=4; - while (1) - { - assert(stackPtr < stackEnd); - r = bscf(mask); - NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r]; - const vint4 s((size_t)c,(size_t)d); - *(vint4*)stackPtr++ = s; - assert(c != BVH::emptyNode); - if (unlikely(mask == 0)) break; - } - sort(stackFirst,stackPtr); - cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; -#else - /*! Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. */ - assert(stackPtr < stackEnd); - stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; - assert(stackPtr < stackEnd); - stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; - - /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ - assert(stackPtr < stackEnd); - r = bscf(mask); - NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; - assert(c != BVH::emptyNode); - if (likely(mask == 0)) { - sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); - cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; - return; - } - - /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ - assert(stackPtr < stackEnd); - r = bscf(mask); - c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; - assert(c != BVH::emptyNode); - if (likely(mask == 0)) { - sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); - cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; - return; - } - /*! fallback case if more than 4 children are hit */ - StackItemT* stackFirst = stackPtr-4; - while (1) - { - assert(stackPtr < stackEnd); - r = bscf(mask); - c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; - assert(c != BVH::emptyNode); - if (unlikely(mask == 0)) break; - } - sort(stackFirst,stackPtr); - cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; -#endif -#endif - } - - static __forceinline void traverseAnyHit(NodeRef& cur, - size_t mask, - const vfloat& tNear, - NodeRef*& stackPtr, - NodeRef* stackEnd) - { - const BaseNode* node = cur.baseNode(); - - /*! one child is hit, continue with that child */ - size_t r = bscf(mask); - cur = node->child(r); - BVH::prefetch(cur,types); - - /* simpler in sequence traversal order */ - assert(cur != BVH::emptyNode); - if (likely(mask == 0)) return; - assert(stackPtr < stackEnd); - *stackPtr = cur; stackPtr++; - - for (; ;) - { - r = bscf(mask); - cur = node->child(r); BVH::prefetch(cur,types); - assert(cur != BVH::emptyNode); - if (likely(mask == 0)) return; - assert(stackPtr < stackEnd); - *stackPtr = cur; stackPtr++; - } - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h deleted file mode 100644 index 9c603babf0..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh.h" -#include "../common/ray.h" -#include "../common/stack_item.h" - -namespace embree -{ - namespace isa - { - template - class BVHNNodeTraverserStreamHitCoherent - { - typedef BVHN BVH; - typedef typename BVH::NodeRef NodeRef; - typedef typename BVH::BaseNode BaseNode; - - public: - template - static __forceinline void traverseClosestHit(NodeRef& cur, - size_t& m_trav_active, - const vbool& vmask, - const vfloat& tNear, - const T* const tMask, - StackItemMaskCoherent*& stackPtr) - { - const NodeRef parent = cur; - size_t mask = movemask(vmask); - assert(mask != 0); - const BaseNode* node = cur.baseNode(); - - /*! one child is hit, continue with that child */ - const size_t r0 = bscf(mask); - assert(r0 < 8); - cur = node->child(r0); - BVHN::prefetch(cur,types); - m_trav_active = tMask[r0]; - assert(cur != BVH::emptyNode); - if (unlikely(mask == 0)) return; - - const unsigned int* const tNear_i = (unsigned int*)&tNear; - - /*! two children are hit, push far child, and continue with closer child */ - NodeRef c0 = cur; - unsigned int d0 = tNear_i[r0]; - const size_t r1 = bscf(mask); - assert(r1 < 8); - NodeRef c1 = node->child(r1); - BVHN::prefetch(c1,types); - unsigned int d1 = tNear_i[r1]; - - assert(c0 != BVH::emptyNode); - assert(c1 != BVH::emptyNode); - if (likely(mask == 0)) { - if (d0 < d1) { - assert(tNear[r1] >= 0.0f); - stackPtr->mask = tMask[r1]; - stackPtr->parent = parent; - stackPtr->child = c1; - stackPtr++; - cur = c0; - m_trav_active = tMask[r0]; - return; - } - else { - assert(tNear[r0] >= 0.0f); - stackPtr->mask = tMask[r0]; - stackPtr->parent = parent; - stackPtr->child = c0; - stackPtr++; - cur = c1; - m_trav_active = tMask[r1]; - return; - } - } - - /*! slow path for more than two hits */ - size_t hits = movemask(vmask); - const vint dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint(step), 0); - #if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL - const vint tmp = extractN(dist_i); - const vint dist_i_sorted = usort_descending(tmp); - #else - const vint dist_i_sorted = usort_descending(dist_i); - #endif - const vint sorted_index = dist_i_sorted & 7; - - size_t i = 0; - for (;;) - { - const unsigned int index = sorted_index[i]; - assert(index < 8); - cur = node->child(index); - m_trav_active = tMask[index]; - assert(m_trav_active); - BVHN::prefetch(cur,types); - bscf(hits); - if (unlikely(hits==0)) break; - i++; - assert(cur != BVH::emptyNode); - assert(tNear[index] >= 0.0f); - stackPtr->mask = m_trav_active; - stackPtr->parent = parent; - stackPtr->child = cur; - stackPtr++; - } - } - - template - static __forceinline void traverseAnyHit(NodeRef& cur, - size_t& m_trav_active, - const vbool& vmask, - const T* const tMask, - StackItemMaskCoherent*& stackPtr) - { - const NodeRef parent = cur; - size_t mask = movemask(vmask); - assert(mask != 0); - const BaseNode* node = cur.baseNode(); - - /*! one child is hit, continue with that child */ - size_t r = bscf(mask); - cur = node->child(r); - BVHN::prefetch(cur,types); - m_trav_active = tMask[r]; - - /* simple in order sequence */ - assert(cur != BVH::emptyNode); - if (likely(mask == 0)) return; - stackPtr->mask = m_trav_active; - stackPtr->parent = parent; - stackPtr->child = cur; - stackPtr++; - - for (; ;) - { - r = bscf(mask); - cur = node->child(r); - BVHN::prefetch(cur,types); - m_trav_active = tMask[r]; - assert(cur != BVH::emptyNode); - if (likely(mask == 0)) return; - stackPtr->mask = m_trav_active; - stackPtr->parent = parent; - stackPtr->child = cur; - stackPtr++; - } - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h deleted file mode 100644 index a978c0c459..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bvh.h" - -namespace embree -{ - namespace isa - { - struct NearFarPrecalculations - { - size_t nearX, nearY, nearZ; - size_t farX, farY, farZ; - - __forceinline NearFarPrecalculations() {} - - __forceinline NearFarPrecalculations(const Vec3fa& dir, size_t N) - { - const size_t size = sizeof(float)*N; - nearX = (dir.x < 0.0f) ? 1*size : 0*size; - nearY = (dir.y < 0.0f) ? 3*size : 2*size; - nearZ = (dir.z < 0.0f) ? 5*size : 4*size; - farX = nearX ^ size; - farY = nearY ^ size; - farZ = nearZ ^ size; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h deleted file mode 100644 index aa0d4ba4d7..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h +++ /dev/null @@ -1,1788 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "node_intersector.h" - -#if defined(__AVX2__) -#define __FMA_X4__ -#endif - -#if defined(__aarch64__) -#define __FMA_X4__ -#endif - - -namespace embree -{ - namespace isa - { - ////////////////////////////////////////////////////////////////////////////////////// - // Ray structure used in single-ray traversal - ////////////////////////////////////////////////////////////////////////////////////// - - template - struct TravRayBase; - - /* Base (without tnear and tfar) */ - template - struct TravRayBase - { - __forceinline TravRayBase() {} - - __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) - : org_xyz(ray_org), dir_xyz(ray_dir) - { - const Vec3fa ray_rdir = rcp_safe(ray_dir); - org = Vec3vf(ray_org.x,ray_org.y,ray_org.z); - dir = Vec3vf(ray_dir.x,ray_dir.y,ray_dir.z); - rdir = Vec3vf(ray_rdir.x,ray_rdir.y,ray_rdir.z); -#if defined(__FMA_X4__) - const Vec3fa ray_org_rdir = ray_org*ray_rdir; -#if !defined(__aarch64__) - org_rdir = Vec3vf(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); -#else - //for aarch64, we do not have msub equal instruction, so we negeate orig and use madd - //x86 will use msub - neg_org_rdir = Vec3vf(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z); -#endif -#endif - nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat) : 1*sizeof(vfloat); - nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat) : 3*sizeof(vfloat); - nearZ = ray_rdir.z >= 0.0f ? 4*sizeof(vfloat) : 5*sizeof(vfloat); - farX = nearX ^ sizeof(vfloat); - farY = nearY ^ sizeof(vfloat); - farZ = nearZ ^ sizeof(vfloat); - -#if defined(__AVX512ER__) // KNL+ - /* optimization works only for 8-wide BVHs with 16-wide SIMD */ - const vint<16> id(step); - const vint<16> id2 = align_shift_right<16/2>(id, id); - permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); - permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); - permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); -#endif - - } - - template - __forceinline TravRayBase(size_t k, const Vec3vf& ray_org, const Vec3vf& ray_dir, - const Vec3vf& ray_rdir, const Vec3vi& nearXYZ, - size_t flip = sizeof(vfloat)) - { - org = Vec3vf(ray_org.x[k], ray_org.y[k], ray_org.z[k]); - dir = Vec3vf(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); - rdir = Vec3vf(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); -#if defined(__FMA_X4__) -#if !defined(__aarch64__) - org_rdir = org*rdir; -#else - neg_org_rdir = -(org*rdir); -#endif -#endif - nearX = nearXYZ.x[k]; - nearY = nearXYZ.y[k]; - nearZ = nearXYZ.z[k]; - farX = nearX ^ flip; - farY = nearY ^ flip; - farZ = nearZ ^ flip; - -#if defined(__AVX512ER__) // KNL+ - /* optimization works only for 8-wide BVHs with 16-wide SIMD */ - const vint<16> id(step); - const vint<16> id2 = align_shift_right<16/2>(id, id); - permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); - permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); - permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); -#endif - } - - Vec3fa org_xyz, dir_xyz; - Vec3vf org, dir, rdir; -#if defined(__FMA_X4__) -#if !defined(__aarch64__) - Vec3vf org_rdir; -#else - //aarch64 version are keeping negation of the org_rdir and use madd - //x86 uses msub - Vec3vf neg_org_rdir; -#endif -#endif -#if defined(__AVX512ER__) // KNL+ - vint16 permX, permY, permZ; -#endif - - size_t nearX, nearY, nearZ; - size_t farX, farY, farZ; - }; - - /* Base (without tnear and tfar) */ - template - struct TravRayBase - { - __forceinline TravRayBase() {} - - __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) - : org_xyz(ray_org), dir_xyz(ray_dir) - { - const float round_down = 1.0f-3.0f*float(ulp); - const float round_up = 1.0f+3.0f*float(ulp); - const Vec3fa ray_rdir = 1.0f/zero_fix(ray_dir); - const Vec3fa ray_rdir_near = round_down*ray_rdir; - const Vec3fa ray_rdir_far = round_up *ray_rdir; - org = Vec3vf(ray_org.x,ray_org.y,ray_org.z); - dir = Vec3vf(ray_dir.x,ray_dir.y,ray_dir.z); - rdir_near = Vec3vf(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z); - rdir_far = Vec3vf(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z); - nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat) : 1*sizeof(vfloat); - nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat) : 3*sizeof(vfloat); - nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat) : 5*sizeof(vfloat); - farX = nearX ^ sizeof(vfloat); - farY = nearY ^ sizeof(vfloat); - farZ = nearZ ^ sizeof(vfloat); - -#if defined(__AVX512ER__) // KNL+ - /* optimization works only for 8-wide BVHs with 16-wide SIMD */ - const vint<16> id(step); - const vint<16> id2 = align_shift_right<16/2>(id, id); - permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); - permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); - permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); -#endif - } - - template - __forceinline TravRayBase(size_t k, const Vec3vf& ray_org, const Vec3vf& ray_dir, - const Vec3vf& ray_rdir, const Vec3vi& nearXYZ, - size_t flip = sizeof(vfloat)) - { - const vfloat round_down = 1.0f-3.0f*float(ulp); - const vfloat round_up = 1.0f+3.0f*float(ulp); - org = Vec3vf(ray_org.x[k], ray_org.y[k], ray_org.z[k]); - dir = Vec3vf(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); - rdir_near = round_down*Vec3vf(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); - rdir_far = round_up *Vec3vf(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); - - nearX = nearXYZ.x[k]; - nearY = nearXYZ.y[k]; - nearZ = nearXYZ.z[k]; - farX = nearX ^ flip; - farY = nearY ^ flip; - farZ = nearZ ^ flip; - -#if defined(__AVX512ER__) // KNL+ - /* optimization works only for 8-wide BVHs with 16-wide SIMD */ - const vint<16> id(step); - const vint<16> id2 = align_shift_right<16/2>(id, id); - permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); - permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); - permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); -#endif - } - - Vec3fa org_xyz, dir_xyz; - Vec3vf org, dir, rdir_near, rdir_far; -#if defined(__AVX512ER__) // KNL+ - vint16 permX, permY, permZ; -#endif - - size_t nearX, nearY, nearZ; - size_t farX, farY, farZ; - }; - - /* Full (with tnear and tfar) */ - template - struct TravRay : TravRayBase - { - __forceinline TravRay() {} - - __forceinline TravRay(const Vec3fa& ray_org, const Vec3fa& ray_dir, float ray_tnear, float ray_tfar) - : TravRayBase(ray_org, ray_dir), - tnear(ray_tnear), tfar(ray_tfar) {} - - template - __forceinline TravRay(size_t k, const Vec3vf& ray_org, const Vec3vf& ray_dir, - const Vec3vf& ray_rdir, const Vec3vi& nearXYZ, - float ray_tnear, float ray_tfar, - size_t flip = sizeof(vfloat)) - : TravRayBase(k, ray_org, ray_dir, ray_rdir, nearXYZ, flip), - tnear(ray_tnear), tfar(ray_tfar) {} - - vfloat tnear; - vfloat tfar; - }; - - ////////////////////////////////////////////////////////////////////////////////////// - // Point Query structure used in single-ray traversal - ////////////////////////////////////////////////////////////////////////////////////// - - template - struct TravPointQuery - { - __forceinline TravPointQuery() {} - - __forceinline TravPointQuery(const Vec3fa& query_org, const Vec3fa& query_rad) - { - org = Vec3vf(query_org.x, query_org.y, query_org.z); - rad = Vec3vf(query_rad.x, query_rad.y, query_rad.z); - } - - __forceinline vfloat const& tfar() const { - return rad.x; - } - - Vec3vf org, rad; - }; - - ////////////////////////////////////////////////////////////////////////////////////// - // point query - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t pointQuerySphereDistAndMask( - const TravPointQuery& query, vfloat& dist, vfloat const& minX, vfloat const& maxX, - vfloat const& minY, vfloat const& maxY, vfloat const& minZ, vfloat const& maxZ) - { - const vfloat vX = min(max(query.org.x, minX), maxX) - query.org.x; - const vfloat vY = min(max(query.org.y, minY), maxY) - query.org.y; - const vfloat vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; - dist = vX * vX + vY * vY + vZ * vZ; - const vbool vmask = dist <= query.tfar()*query.tfar(); - const vbool valid = minX <= maxX; - return movemask(vmask) & movemask(valid); - } - - template - __forceinline size_t pointQueryNodeSphere(const typename BVHN::AABBNode* node, const TravPointQuery& query, vfloat& dist) - { - const vfloat minX = vfloat::load((float*)((const char*)&node->lower_x)); - const vfloat minY = vfloat::load((float*)((const char*)&node->lower_y)); - const vfloat minZ = vfloat::load((float*)((const char*)&node->lower_z)); - const vfloat maxX = vfloat::load((float*)((const char*)&node->upper_x)); - const vfloat maxY = vfloat::load((float*)((const char*)&node->upper_y)); - const vfloat maxZ = vfloat::load((float*)((const char*)&node->upper_z)); - return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); - } - - template - __forceinline size_t pointQueryNodeSphere(const typename BVHN::AABBNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) - { - const vfloat* pMinX = (const vfloat*)((const char*)&node->lower_x); - const vfloat* pMinY = (const vfloat*)((const char*)&node->lower_y); - const vfloat* pMinZ = (const vfloat*)((const char*)&node->lower_z); - const vfloat* pMaxX = (const vfloat*)((const char*)&node->upper_x); - const vfloat* pMaxY = (const vfloat*)((const char*)&node->upper_y); - const vfloat* pMaxZ = (const vfloat*)((const char*)&node->upper_z); - const vfloat minX = madd(time,pMinX[6],vfloat(pMinX[0])); - const vfloat minY = madd(time,pMinY[6],vfloat(pMinY[0])); - const vfloat minZ = madd(time,pMinZ[6],vfloat(pMinZ[0])); - const vfloat maxX = madd(time,pMaxX[6],vfloat(pMaxX[0])); - const vfloat maxY = madd(time,pMaxY[6],vfloat(pMaxY[0])); - const vfloat maxZ = madd(time,pMaxZ[6],vfloat(pMaxZ[0])); - return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); - } - - template - __forceinline size_t pointQueryNodeSphereMB4D(const typename BVHN::NodeRef ref, const TravPointQuery& query, const float time, vfloat& dist) - { - const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); - size_t mask = pointQueryNodeSphere(node, query, time, dist); - - if (unlikely(ref.isAABBNodeMB4D())) { - const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; - const vbool vmask = (node1->lower_t <= time) & (time < node1->upper_t); - mask &= movemask(vmask); - } - - return mask; - } - - template - __forceinline size_t pointQueryNodeSphere(const typename BVHN::QuantizedBaseNode* node, const TravPointQuery& query, vfloat& dist) - { - const vfloat start_x(node->start.x); - const vfloat scale_x(node->scale.x); - const vfloat minX = madd(node->template dequantize((0*sizeof(vfloat)) >> 2),scale_x,start_x); - const vfloat maxX = madd(node->template dequantize((1*sizeof(vfloat)) >> 2),scale_x,start_x); - const vfloat start_y(node->start.y); - const vfloat scale_y(node->scale.y); - const vfloat minY = madd(node->template dequantize((2*sizeof(vfloat)) >> 2),scale_y,start_y); - const vfloat maxY = madd(node->template dequantize((3*sizeof(vfloat)) >> 2),scale_y,start_y); - const vfloat start_z(node->start.z); - const vfloat scale_z(node->scale.z); - const vfloat minZ = madd(node->template dequantize((4*sizeof(vfloat)) >> 2),scale_z,start_z); - const vfloat maxZ = madd(node->template dequantize((5*sizeof(vfloat)) >> 2),scale_z,start_z); - return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); - } - - template - __forceinline size_t pointQueryNodeSphere(const typename BVHN::QuantizedBaseNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) - { - const vfloat minX = node->dequantizeLowerX(time); - const vfloat maxX = node->dequantizeUpperX(time); - const vfloat minY = node->dequantizeLowerY(time); - const vfloat maxY = node->dequantizeUpperY(time); - const vfloat minZ = node->dequantizeLowerZ(time); - const vfloat maxZ = node->dequantizeUpperZ(time); - return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); - } - - template - __forceinline size_t pointQueryNodeSphere(const typename BVHN::OBBNode* node, const TravPointQuery& query, vfloat& dist) - { - // TODO: point query - implement - const vbool vmask = vbool(true); - const size_t mask = movemask(vmask) & ((1<(0.0f); - return mask; - } - - template - __forceinline size_t pointQueryNodeSphere(const typename BVHN::OBBNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) - { - // TODO: point query - implement - const vbool vmask = vbool(true); - const size_t mask = movemask(vmask) & ((1<(0.0f); - return mask; - } - - template - __forceinline size_t pointQueryAABBDistAndMask( - const TravPointQuery& query, vfloat& dist, vfloat const& minX, vfloat const& maxX, - vfloat const& minY, vfloat const& maxY, vfloat const& minZ, vfloat const& maxZ) - { - const vfloat vX = min(max(query.org.x, minX), maxX) - query.org.x; - const vfloat vY = min(max(query.org.y, minY), maxY) - query.org.y; - const vfloat vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; - dist = vX * vX + vY * vY + vZ * vZ; - const vbool valid = minX <= maxX; - const vbool vmask = !((maxX < query.org.x - query.rad.x) | (minX > query.org.x + query.rad.x) | - (maxY < query.org.y - query.rad.y) | (minY > query.org.y + query.rad.y) | - (maxZ < query.org.z - query.rad.z) | (minZ > query.org.z + query.rad.z)); - return movemask(vmask) & movemask(valid); - } - - template - __forceinline size_t pointQueryNodeAABB(const typename BVHN::AABBNode* node, const TravPointQuery& query, vfloat& dist) - { - const vfloat minX = vfloat::load((float*)((const char*)&node->lower_x)); - const vfloat minY = vfloat::load((float*)((const char*)&node->lower_y)); - const vfloat minZ = vfloat::load((float*)((const char*)&node->lower_z)); - const vfloat maxX = vfloat::load((float*)((const char*)&node->upper_x)); - const vfloat maxY = vfloat::load((float*)((const char*)&node->upper_y)); - const vfloat maxZ = vfloat::load((float*)((const char*)&node->upper_z)); - return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); - } - - template - __forceinline size_t pointQueryNodeAABB(const typename BVHN::AABBNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) - { - const vfloat* pMinX = (const vfloat*)((const char*)&node->lower_x); - const vfloat* pMinY = (const vfloat*)((const char*)&node->lower_y); - const vfloat* pMinZ = (const vfloat*)((const char*)&node->lower_z); - const vfloat* pMaxX = (const vfloat*)((const char*)&node->upper_x); - const vfloat* pMaxY = (const vfloat*)((const char*)&node->upper_y); - const vfloat* pMaxZ = (const vfloat*)((const char*)&node->upper_z); - const vfloat minX = madd(time,pMinX[6],vfloat(pMinX[0])); - const vfloat minY = madd(time,pMinY[6],vfloat(pMinY[0])); - const vfloat minZ = madd(time,pMinZ[6],vfloat(pMinZ[0])); - const vfloat maxX = madd(time,pMaxX[6],vfloat(pMaxX[0])); - const vfloat maxY = madd(time,pMaxY[6],vfloat(pMaxY[0])); - const vfloat maxZ = madd(time,pMaxZ[6],vfloat(pMaxZ[0])); - return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); - } - - template - __forceinline size_t pointQueryNodeAABBMB4D(const typename BVHN::NodeRef ref, const TravPointQuery& query, const float time, vfloat& dist) - { - const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); - size_t mask = pointQueryNodeAABB(node, query, time, dist); - - if (unlikely(ref.isAABBNodeMB4D())) { - const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; - const vbool vmask = (node1->lower_t <= time) & (time < node1->upper_t); - mask &= movemask(vmask); - } - - return mask; - } - - template - __forceinline size_t pointQueryNodeAABB(const typename BVHN::QuantizedBaseNode* node, const TravPointQuery& query, vfloat& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat start_x(node->start.x); - const vfloat scale_x(node->scale.x); - const vfloat minX = madd(node->template dequantize((0*sizeof(vfloat)) >> 2),scale_x,start_x); - const vfloat maxX = madd(node->template dequantize((1*sizeof(vfloat)) >> 2),scale_x,start_x); - const vfloat start_y(node->start.y); - const vfloat scale_y(node->scale.y); - const vfloat minY = madd(node->template dequantize((2*sizeof(vfloat)) >> 2),scale_y,start_y); - const vfloat maxY = madd(node->template dequantize((3*sizeof(vfloat)) >> 2),scale_y,start_y); - const vfloat start_z(node->start.z); - const vfloat scale_z(node->scale.z); - const vfloat minZ = madd(node->template dequantize((4*sizeof(vfloat)) >> 2),scale_z,start_z); - const vfloat maxZ = madd(node->template dequantize((5*sizeof(vfloat)) >> 2),scale_z,start_z); - return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; - } - - template - __forceinline size_t pointQueryNodeAABB(const typename BVHN::QuantizedBaseNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat minX = node->dequantizeLowerX(time); - const vfloat maxX = node->dequantizeUpperX(time); - const vfloat minY = node->dequantizeLowerY(time); - const vfloat maxY = node->dequantizeUpperY(time); - const vfloat minZ = node->dequantizeLowerZ(time); - const vfloat maxZ = node->dequantizeUpperZ(time); - return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; - } - - template - __forceinline size_t pointQueryNodeAABB(const typename BVHN::OBBNode* node, const TravPointQuery& query, vfloat& dist) - { - // TODO: point query - implement - const vbool vmask = vbool(true); - const size_t mask = movemask(vmask) & ((1<(0.0f); - return mask; - } - - template - __forceinline size_t pointQueryNodeAABB(const typename BVHN::OBBNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) - { - // TODO: point query - implement - const vbool vmask = vbool(true); - const size_t mask = movemask(vmask) & ((1<(0.0f); - return mask; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast AABBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNode(const typename BVHN::AABBNode* node, const TravRay& ray, vfloat& dist); - - template<> - __forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist) - { -#if defined(__FMA_X4__) -#if defined(__aarch64__) - const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); - const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); - const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); - const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); - const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); - const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); -#else - const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); - const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); - const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); - const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); - const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); - const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); -#endif -#else - const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; - const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; - const vfloat4 tNearZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; - const vfloat4 tFarX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; - const vfloat4 tFarY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; - const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; -#endif - -#if defined(__aarch64__) - const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear); - const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar); - const vbool4 vmask = asInt(tNear) <= asInt(tFar); - const size_t mask = movemask(vmask); -#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW - const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool4 vmask = asInt(tNear) > asInt(tFar); - const size_t mask = movemask(vmask) ^ ((1<<4)-1); -#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool4 vmask = asInt(tNear) <= asInt(tFar); - const size_t mask = movemask(vmask); -#else - const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool4 vmask = tNear <= tFar; - const size_t mask = movemask(vmask); -#endif - dist = tNear; - return mask; - } - -#if defined(__AVX__) - - template<> - __forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist) - { -#if defined(__AVX2__) -#if defined(__aarch64__) - const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); - const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); - const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); - const vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); - const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); - const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); -#else - const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); - const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); - const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); - const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); - const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); - const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); -#endif - -#else - const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; - const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; - const vfloat8 tNearZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; - const vfloat8 tFarX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; - const vfloat8 tFarY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; - const vfloat8 tFarZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; -#endif - -#if defined(__AVX2__) && !defined(__AVX512F__) // HSW - const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool8 vmask = asInt(tNear) > asInt(tFar); - const size_t mask = movemask(vmask) ^ ((1<<8)-1); -#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool8 vmask = asInt(tNear) <= asInt(tFar); - const size_t mask = movemask(vmask); -#else - const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool8 vmask = tNear <= tFar; - const size_t mask = movemask(vmask); -#endif - dist = tNear; - return mask; - } - -#endif - -#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL - - template<> - __forceinline size_t intersectNode<4,16>(const typename BVH4::AABBNode* node, const TravRay<4,16,false>& ray, vfloat16& dist) - { - const vfloat16 tNearX = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); - const vfloat16 tNearY = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); - const vfloat16 tNearZ = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); - const vfloat16 tFarX = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); - const vfloat16 tFarY = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); - const vfloat16 tFarZ = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); - const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool16 vmask = le(vbool16(0xf),tNear,tFar); - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - - template<> - __forceinline size_t intersectNode<8,16>(const typename BVH8::AABBNode* node, const TravRay<8,16,false>& ray, vfloat16& dist) - { - const vllong8 invalid((size_t)BVH8::emptyNode); - const vboold8 m_valid(invalid != vllong8::loadu(node->children)); - const vfloat16 bminmaxX = permute(vfloat16::load((const float*)&node->lower_x), ray.permX); - const vfloat16 bminmaxY = permute(vfloat16::load((const float*)&node->lower_y), ray.permY); - const vfloat16 bminmaxZ = permute(vfloat16::load((const float*)&node->lower_z), ray.permZ); - const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x); - const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y); - const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z); - const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); - const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar); - const vbool16 vmask = le(vboolf16(m_valid),tNear,align_shift_right<8>(tFar, tFar)); - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - -#endif - - ////////////////////////////////////////////////////////////////////////////////////// - // Robust AABBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNodeRobust(const typename BVHN::AABBNode* node, const TravRay& ray, vfloat& dist) - { - const vfloat tNearX = (vfloat::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; - const vfloat tNearY = (vfloat::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; - const vfloat tNearZ = (vfloat::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; - const vfloat tFarX = (vfloat::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; - const vfloat tFarY = (vfloat::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; - const vfloat tFarZ = (vfloat::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; - const vfloat tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool vmask = tNear <= tFar; - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - -#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL - - template<> - __forceinline size_t intersectNodeRobust<4,16>(const typename BVHN<4>::AABBNode* node, const TravRay<4,16,true>& ray, vfloat<16>& dist) - { - const vfloat16 tNearX = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; - const vfloat16 tNearY = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; - const vfloat16 tNearZ = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; - const vfloat16 tFarX = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; - const vfloat16 tFarY = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; - const vfloat16 tFarZ = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; - const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool16 vmask = le((1 << 4)-1,tNear,tFar); - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - - template<> - __forceinline size_t intersectNodeRobust<8,16>(const typename BVHN<8>::AABBNode* node, const TravRay<8,16,true>& ray, vfloat<16>& dist) - { - const vfloat16 tNearX = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; - const vfloat16 tNearY = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; - const vfloat16 tNearZ = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; - const vfloat16 tFarX = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; - const vfloat16 tFarY = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; - const vfloat16 tFarZ = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; - const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool16 vmask = le((1 << 8)-1,tNear,tFar); - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - -#endif - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast AABBNodeMB intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNode(const typename BVHN::AABBNodeMB* node, const TravRay& ray, const float time, vfloat& dist) - { - const vfloat* pNearX = (const vfloat*)((const char*)&node->lower_x+ray.nearX); - const vfloat* pNearY = (const vfloat*)((const char*)&node->lower_x+ray.nearY); - const vfloat* pNearZ = (const vfloat*)((const char*)&node->lower_x+ray.nearZ); - const vfloat* pFarX = (const vfloat*)((const char*)&node->lower_x+ray.farX); - const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); - const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); -#if defined(__FMA_X4__) -#if defined(__aarch64__) - const vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); - const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); - const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); - const vfloat tFarX = madd(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); - const vfloat tFarY = madd(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); - const vfloat tFarZ = madd(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); -#else - const vfloat tNearX = msub(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.org_rdir.x); - const vfloat tNearY = msub(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.org_rdir.y); - const vfloat tNearZ = msub(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); - const vfloat tFarX = msub(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.org_rdir.x); - const vfloat tFarY = msub(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.org_rdir.y); - const vfloat tFarZ = msub(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); -#endif -#else - const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir.x; - const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir.y; - const vfloat tNearZ = (madd(time,pNearZ[6],vfloat(pNearZ[0])) - ray.org.z) * ray.rdir.z; - const vfloat tFarX = (madd(time,pFarX [6],vfloat(pFarX [0])) - ray.org.x) * ray.rdir.x; - const vfloat tFarY = (madd(time,pFarY [6],vfloat(pFarY [0])) - ray.org.y) * ray.rdir.y; - const vfloat tFarZ = (madd(time,pFarZ [6],vfloat(pFarZ [0])) - ray.org.z) * ray.rdir.z; -#endif -#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW - const vfloat tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool vmask = asInt(tNear) > asInt(tFar); - const size_t mask = movemask(vmask) ^ ((1< tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool vmask = asInt(tNear) <= asInt(tFar); - const size_t mask = movemask(vmask); -#else - const vfloat tNear = max(ray.tnear,tNearX,tNearY,tNearZ); - const vfloat tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); - const vbool vmask = tNear <= tFar; - const size_t mask = movemask(vmask); -#endif - dist = tNear; - return mask; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Robust AABBNodeMB intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNodeRobust(const typename BVHN::AABBNodeMB* node, const TravRay& ray, const float time, vfloat& dist) - { - const vfloat* pNearX = (const vfloat*)((const char*)&node->lower_x+ray.nearX); - const vfloat* pNearY = (const vfloat*)((const char*)&node->lower_x+ray.nearY); - const vfloat* pNearZ = (const vfloat*)((const char*)&node->lower_x+ray.nearZ); - const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir_near.x; - const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir_near.y; - const vfloat tNearZ = (madd(time,pNearZ[6],vfloat(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; - const vfloat tNear = max(ray.tnear,tNearX,tNearY,tNearZ); - const vfloat* pFarX = (const vfloat*)((const char*)&node->lower_x+ray.farX); - const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); - const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); - const vfloat tFarX = (madd(time,pFarX[6],vfloat(pFarX[0])) - ray.org.x) * ray.rdir_far.x; - const vfloat tFarY = (madd(time,pFarY[6],vfloat(pFarY[0])) - ray.org.y) * ray.rdir_far.y; - const vfloat tFarZ = (madd(time,pFarZ[6],vfloat(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; - const vfloat tFar = min(ray.tfar,tFarX,tFarY,tFarZ); - const size_t mask = movemask(tNear <= tFar); - dist = tNear; - return mask; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast AABBNodeMB4D intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNodeMB4D(const typename BVHN::NodeRef ref, const TravRay& ray, const float time, vfloat& dist) - { - const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); - - const vfloat* pNearX = (const vfloat*)((const char*)&node->lower_x+ray.nearX); - const vfloat* pNearY = (const vfloat*)((const char*)&node->lower_x+ray.nearY); - const vfloat* pNearZ = (const vfloat*)((const char*)&node->lower_x+ray.nearZ); - const vfloat* pFarX = (const vfloat*)((const char*)&node->lower_x+ray.farX); - const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); - const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); -#if defined (__FMA_X4__) -#if defined(__aarch64__) - const vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); - const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); - const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); - const vfloat tFarX = madd(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); - const vfloat tFarY = madd(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); - const vfloat tFarZ = madd(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); -#else - const vfloat tNearX = msub(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.org_rdir.x); - const vfloat tNearY = msub(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.org_rdir.y); - const vfloat tNearZ = msub(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); - const vfloat tFarX = msub(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.org_rdir.x); - const vfloat tFarY = msub(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.org_rdir.y); - const vfloat tFarZ = msub(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); -#endif -#else - const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir.x; - const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir.y; - const vfloat tNearZ = (madd(time,pNearZ[6],vfloat(pNearZ[0])) - ray.org.z) * ray.rdir.z; - const vfloat tFarX = (madd(time,pFarX [6],vfloat(pFarX [0])) - ray.org.x) * ray.rdir.x; - const vfloat tFarY = (madd(time,pFarY [6],vfloat(pFarY [0])) - ray.org.y) * ray.rdir.y; - const vfloat tFarZ = (madd(time,pFarZ [6],vfloat(pFarZ [0])) - ray.org.z) * ray.rdir.z; -#endif -#if defined(__FMA_X4__) && !defined(__AVX512F__) - const vfloat tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear)); - const vfloat tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar )); -#else - const vfloat tNear = max(ray.tnear,tNearX,tNearY,tNearZ); - const vfloat tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); -#endif - vbool vmask = tNear <= tFar; - if (unlikely(ref.isAABBNodeMB4D())) { - const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; - vmask &= (node1->lower_t <= time) & (time < node1->upper_t); - } - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Robust AABBNodeMB4D intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNodeMB4DRobust(const typename BVHN::NodeRef ref, const TravRay& ray, const float time, vfloat& dist) - { - const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); - - const vfloat* pNearX = (const vfloat*)((const char*)&node->lower_x+ray.nearX); - const vfloat* pNearY = (const vfloat*)((const char*)&node->lower_x+ray.nearY); - const vfloat* pNearZ = (const vfloat*)((const char*)&node->lower_x+ray.nearZ); - const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir_near.x; - const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir_near.y; - const vfloat tNearZ = (madd(time,pNearZ[6],vfloat(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; - const vfloat tNear = max(ray.tnear,tNearX,tNearY,tNearZ); - const vfloat* pFarX = (const vfloat*)((const char*)&node->lower_x+ray.farX); - const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); - const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); - const vfloat tFarX = (madd(time,pFarX[6],vfloat(pFarX[0])) - ray.org.x) * ray.rdir_far.x; - const vfloat tFarY = (madd(time,pFarY[6],vfloat(pFarY[0])) - ray.org.y) * ray.rdir_far.y; - const vfloat tFarZ = (madd(time,pFarZ[6],vfloat(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; - const vfloat tFar = min(ray.tfar,tFarX,tFarY,tFarZ); - vbool vmask = tNear <= tFar; - if (unlikely(ref.isAABBNodeMB4D())) { - const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; - vmask &= (node1->lower_t <= time) & (time < node1->upper_t); - } - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast QuantizedBaseNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNode(const typename BVHN::QuantizedBaseNode* node, const TravRay& ray, vfloat& dist); - - template<> - __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,false>& ray, vfloat4& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat4 start_x(node->start.x); - const vfloat4 scale_x(node->scale.x); - const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); - const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); - const vfloat4 start_y(node->start.y); - const vfloat4 scale_y(node->scale.y); - const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); - const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); - const vfloat4 start_z(node->start.z); - const vfloat4 scale_z(node->scale.z); - const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); - const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); - -#if defined(__FMA_X4__) -#if defined(__aarch64__) - const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); - const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); -#else - const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); - const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); - const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); - const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); - const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); - const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); -#endif -#else - const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x; - const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y; - const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; - const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir.x; - const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir.y; - const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; -#endif - -#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW - const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool4 vmask = asInt(tNear) > asInt(tFar); - const size_t mask = movemask(vmask) ^ ((1<<4)-1); -#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool4 vmask = asInt(tNear) <= asInt(tFar); - const size_t mask = movemask(vmask); -#else - const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool4 vmask = tNear <= tFar; - const size_t mask = movemask(vmask); -#endif - dist = tNear; - return mask & mvalid; - } - - template<> - __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,true>& ray, vfloat4& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat4 start_x(node->start.x); - const vfloat4 scale_x(node->scale.x); - const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); - const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); - const vfloat4 start_y(node->start.y); - const vfloat4 scale_y(node->scale.y); - const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); - const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); - const vfloat4 start_z(node->start.z); - const vfloat4 scale_z(node->scale.z); - const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); - const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); - - const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; - const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; - const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; - const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; - const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; - const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; - - const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool4 vmask = tNear <= tFar; - const size_t mask = movemask(vmask); - dist = tNear; - return mask & mvalid; - } - - -#if defined(__AVX__) - - template<> - __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,false>& ray, vfloat8& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat8 start_x(node->start.x); - const vfloat8 scale_x(node->scale.x); - const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); - const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); - const vfloat8 start_y(node->start.y); - const vfloat8 scale_y(node->scale.y); - const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); - const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); - const vfloat8 start_z(node->start.z); - const vfloat8 scale_z(node->scale.z); - const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); - const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); - -#if defined(__AVX2__) -#if defined(__aarch64__) - const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); - const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); -#else - const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); - const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); - const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); - const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); - const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); - const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); -#endif -#else - const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x; - const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y; - const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; - const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir.x; - const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir.y; - const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; -#endif - -#if defined(__AVX2__) && !defined(__AVX512F__) // HSW - const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool8 vmask = asInt(tNear) > asInt(tFar); - const size_t mask = movemask(vmask) ^ ((1<<8)-1); -#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool8 vmask = asInt(tNear) <= asInt(tFar); - const size_t mask = movemask(vmask); -#else - const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool8 vmask = tNear <= tFar; - const size_t mask = movemask(vmask); -#endif - dist = tNear; - return mask & mvalid; - } - - template<> - __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,true>& ray, vfloat8& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat8 start_x(node->start.x); - const vfloat8 scale_x(node->scale.x); - const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); - const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); - const vfloat8 start_y(node->start.y); - const vfloat8 scale_y(node->scale.y); - const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); - const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); - const vfloat8 start_z(node->start.z); - const vfloat8 scale_z(node->scale.z); - const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); - const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); - - const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; - const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; - const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; - const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; - const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; - const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; - - const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool8 vmask = tNear <= tFar; - const size_t mask = movemask(vmask); - - dist = tNear; - return mask & mvalid; - } - - -#endif - -#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL - - template<> - __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,false>& ray, vfloat16& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat16 start_x(node->start.x); - const vfloat16 scale_x(node->scale.x); - const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x); - const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX >> 2)),scale_x,start_x); - const vfloat16 start_y(node->start.y); - const vfloat16 scale_y(node->scale.y); - const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y); - const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY >> 2)),scale_y,start_y); - const vfloat16 start_z(node->start.z); - const vfloat16 scale_z(node->scale.z); - const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z); - const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ >> 2)),scale_z,start_z); - - const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); - const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); - const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); - const vfloat16 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); - const vfloat16 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); - const vfloat16 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); - const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool16 vmask = le(vbool16(0xf),tNear,tFar); - const size_t mask = movemask(vmask) & mvalid; - dist = tNear; - return mask; - } - - template<> - __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,true>& ray, vfloat16& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat16 start_x(node->start.x); - const vfloat16 scale_x(node->scale.x); - const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x); - const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX >> 2)),scale_x,start_x); - const vfloat16 start_y(node->start.y); - const vfloat16 scale_y(node->scale.y); - const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y); - const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY >> 2)),scale_y,start_y); - const vfloat16 start_z(node->start.z); - const vfloat16 scale_z(node->scale.z); - const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z); - const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ >> 2)),scale_z,start_z); - - const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; - const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; - const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; - const vfloat16 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; - const vfloat16 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; - const vfloat16 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; - - const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); - const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); - const vbool16 vmask = le(vbool16(0xf),tNear,tFar); - const size_t mask = movemask(vmask) & mvalid; - dist = tNear; - return mask; - } - - template<> - __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,false>& ray, vfloat16& dist) - { - const vbool16 m_valid(node->validMask16()); - const vfloat16 bminmaxX = node->dequantizeLowerUpperX(ray.permX); - const vfloat16 bminmaxY = node->dequantizeLowerUpperY(ray.permY); - const vfloat16 bminmaxZ = node->dequantizeLowerUpperZ(ray.permZ); - const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x); - const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y); - const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z); - const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); - const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar); - const vbool16 vmask = le(m_valid,tNear,align_shift_right<8>(tFar, tFar)); - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - - template<> - __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,true>& ray, vfloat16& dist) - { - const vbool16 m_valid(node->validMask16()); - const vfloat16 bminmaxX = node->dequantizeLowerUpperX(ray.permX); - const vfloat16 bminmaxY = node->dequantizeLowerUpperY(ray.permY); - const vfloat16 bminmaxZ = node->dequantizeLowerUpperZ(ray.permZ); - const vfloat16 tNearFarX = (bminmaxX - ray.org.x) * ray.rdir_far.x; // FIXME: this is not conservative !!!!!!!!! - const vfloat16 tNearFarY = (bminmaxY - ray.org.y) * ray.rdir_far.y; - const vfloat16 tNearFarZ = (bminmaxZ - ray.org.z) * ray.rdir_far.z; - const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); - const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar); - const vbool16 vmask = le(m_valid,tNear,align_shift_right<8>(tFar, tFar)); - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - - -#endif - - - template - __forceinline size_t intersectNode(const typename BVHN::QuantizedBaseNodeMB* node, const TravRay& ray, const float time, vfloat& dist) - { - const vboolf mvalid = node->validMask(); - const vfloat lower_x = node->dequantizeLowerX(time); - const vfloat upper_x = node->dequantizeUpperX(time); - const vfloat lower_y = node->dequantizeLowerY(time); - const vfloat upper_y = node->dequantizeUpperY(time); - const vfloat lower_z = node->dequantizeLowerZ(time); - const vfloat upper_z = node->dequantizeUpperZ(time); -#if defined(__FMA_X4__) -#if defined(__aarch64__) - const vfloat tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); - const vfloat tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); -#else - const vfloat tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); - const vfloat tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); - const vfloat tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); - const vfloat tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); - const vfloat tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); - const vfloat tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); -#endif -#else - const vfloat tNearX = (lower_x - ray.org.x) * ray.rdir.x; - const vfloat tNearY = (lower_y - ray.org.y) * ray.rdir.y; - const vfloat tNearZ = (lower_z - ray.org.z) * ray.rdir.z; - const vfloat tFarX = (upper_x - ray.org.x) * ray.rdir.x; - const vfloat tFarY = (upper_y - ray.org.y) * ray.rdir.y; - const vfloat tFarZ = (upper_z - ray.org.z) * ray.rdir.z; -#endif - - const vfloat tminX = mini(tNearX,tFarX); - const vfloat tmaxX = maxi(tNearX,tFarX); - const vfloat tminY = mini(tNearY,tFarY); - const vfloat tmaxY = maxi(tNearY,tFarY); - const vfloat tminZ = mini(tNearZ,tFarZ); - const vfloat tmaxZ = maxi(tNearZ,tFarZ); - const vfloat tNear = maxi(tminX,tminY,tminZ,ray.tnear); - const vfloat tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); -#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - const vbool vmask = le(mvalid,asInt(tNear),asInt(tFar)); -#else - const vbool vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; -#endif - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - - template - __forceinline size_t intersectNode(const typename BVHN::QuantizedBaseNodeMB* node, const TravRay& ray, const float time, vfloat& dist) - { - const vboolf mvalid = node->validMask(); - const vfloat lower_x = node->dequantizeLowerX(time); - const vfloat upper_x = node->dequantizeUpperX(time); - const vfloat lower_y = node->dequantizeLowerY(time); - const vfloat upper_y = node->dequantizeUpperY(time); - const vfloat lower_z = node->dequantizeLowerZ(time); - const vfloat upper_z = node->dequantizeUpperZ(time); - const vfloat tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; - const vfloat tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; - const vfloat tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; - const vfloat tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; - const vfloat tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; - const vfloat tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; - - const vfloat tminX = mini(tNearX,tFarX); - const vfloat tmaxX = maxi(tNearX,tFarX); - const vfloat tminY = mini(tNearY,tFarY); - const vfloat tmaxY = maxi(tNearY,tFarY); - const vfloat tminZ = mini(tNearZ,tFarZ); - const vfloat tmaxZ = maxi(tNearZ,tFarZ); - const vfloat tNear = maxi(tminX,tminY,tminZ,ray.tnear); - const vfloat tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); -#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - const vbool vmask = le(mvalid,asInt(tNear),asInt(tFar)); -#else - const vbool vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; -#endif - const size_t mask = movemask(vmask); - dist = tNear; - return mask; - } - - -#if defined(__AVX512ER__) - // for KNL - template<> - __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,false>& ray, const float time, vfloat<4>& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat16 lower_x = node->dequantizeLowerX(time); - const vfloat16 upper_x = node->dequantizeUpperX(time); - const vfloat16 lower_y = node->dequantizeLowerY(time); - const vfloat16 upper_y = node->dequantizeUpperY(time); - const vfloat16 lower_z = node->dequantizeLowerZ(time); - const vfloat16 upper_z = node->dequantizeUpperZ(time); - - const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); - const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); - const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); - const vfloat16 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); - const vfloat16 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); - const vfloat16 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); - - const vfloat16 tminX = min(tNearX,tFarX); - const vfloat16 tmaxX = max(tNearX,tFarX); - const vfloat16 tminY = min(tNearY,tFarY); - const vfloat16 tmaxY = max(tNearY,tFarY); - const vfloat16 tminZ = min(tNearZ,tFarZ); - const vfloat16 tmaxZ = max(tNearZ,tFarZ); - const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear); - const vfloat16 tFar = min(tmaxX,tmaxY,tmaxZ,ray.tfar ); - const vbool16 vmask = tNear <= tFar; - const size_t mask = movemask(vmask) & mvalid; - dist = extractN<4,0>(tNear); - return mask; - } - - - // for KNL - template<> - __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,true>& ray, const float time, vfloat<4>& dist) - { - const size_t mvalid = movemask(node->validMask()); - const vfloat16 lower_x = node->dequantizeLowerX(time); - const vfloat16 upper_x = node->dequantizeUpperX(time); - const vfloat16 lower_y = node->dequantizeLowerY(time); - const vfloat16 upper_y = node->dequantizeUpperY(time); - const vfloat16 lower_z = node->dequantizeLowerZ(time); - const vfloat16 upper_z = node->dequantizeUpperZ(time); - - const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; - const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; - const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; - const vfloat16 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; - const vfloat16 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; - const vfloat16 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; - - const vfloat16 tminX = min(tNearX,tFarX); - const vfloat16 tmaxX = max(tNearX,tFarX); - const vfloat16 tminY = min(tNearY,tFarY); - const vfloat16 tmaxY = max(tNearY,tFarY); - const vfloat16 tminZ = min(tNearZ,tFarZ); - const vfloat16 tmaxZ = max(tNearZ,tFarZ); - const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear); - const vfloat16 tFar = min(tmaxX,tmaxY,tmaxZ,ray.tfar ); - const vbool16 vmask = tNear <= tFar; - const size_t mask = movemask(vmask) & mvalid; - dist = extractN<4,0>(tNear); - return mask; - } - -#endif - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast OBBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNode(const typename BVHN::OBBNode* node, const TravRay& ray, vfloat& dist) - { - const Vec3vf dir = xfmVector(node->naabb,ray.dir); - //const Vec3vf nrdir = Vec3vf(vfloat(-1.0f))/dir; - const Vec3vf nrdir = Vec3vf(vfloat(-1.0f))*rcp_safe(dir); - const Vec3vf org = xfmPoint(node->naabb,ray.org); - const Vec3vf tLowerXYZ = org * nrdir; // (Vec3fa(zero) - org) * rdir; - const Vec3vf tUpperXYZ = tLowerXYZ - nrdir; // (Vec3fa(one ) - org) * rdir; - - const vfloat tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); - const vfloat tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); - const vfloat tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); - const vfloat tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); - const vfloat tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); - const vfloat tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); - vfloat tNear = max(ray.tnear, tNearX,tNearY,tNearZ); - vfloat tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); - if (robust) { - tNear = tNear*vfloat(1.0f-3.0f*float(ulp)); - tFar = tFar *vfloat(1.0f+3.0f*float(ulp)); - } - const vbool vmask = tNear <= tFar; - dist = tNear; - return movemask(vmask); - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast OBBNodeMB intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNode(const typename BVHN::OBBNodeMB* node, const TravRay& ray, const float time, vfloat& dist) - { - const AffineSpace3vf xfm = node->space0; - const Vec3vf b0_lower = zero; - const Vec3vf b0_upper = one; - const Vec3vf lower = lerp(b0_lower,node->b1.lower,vfloat(time)); - const Vec3vf upper = lerp(b0_upper,node->b1.upper,vfloat(time)); - - const BBox3vf bounds(lower,upper); - const Vec3vf dir = xfmVector(xfm,ray.dir); - const Vec3vf rdir = rcp_safe(dir); - const Vec3vf org = xfmPoint(xfm,ray.org); - - const Vec3vf tLowerXYZ = (bounds.lower - org) * rdir; - const Vec3vf tUpperXYZ = (bounds.upper - org) * rdir; - - const vfloat tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); - const vfloat tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); - const vfloat tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); - const vfloat tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); - const vfloat tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); - const vfloat tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); - vfloat tNear = max(ray.tnear, tNearX,tNearY,tNearZ); - vfloat tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); - if (robust) { - tNear = tNear*vfloat(1.0f-3.0f*float(ulp)); - tFar = tFar *vfloat(1.0f+3.0f*float(ulp)); - } - const vbool vmask = tNear <= tFar; - dist = tNear; - return movemask(vmask); - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Node intersectors used in point query raversal - ////////////////////////////////////////////////////////////////////////////////////// - - /*! Computes traversal information for N nodes with 1 point query */ - template - struct BVHNNodePointQuerySphere1; - - template - struct BVHNNodePointQuerySphere1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); - return true; - } - }; - - template - struct BVHNNodePointQuerySphere1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); - return true; - } - }; - - template - struct BVHNNodePointQuerySphere1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = pointQueryNodeSphereMB4D(node, query, time, dist); - return true; - } - }; - - template - struct BVHNNodePointQuerySphere1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (likely(node.isAABBNode())) mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); - else if (unlikely(node.isOBBNode())) mask = pointQueryNodeSphere(node.ungetAABBNode(), query, dist); - else return false; - return true; - } - }; - - template - struct BVHNNodePointQuerySphere1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (likely(node.isAABBNodeMB())) mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); - else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); - else return false; - return true; - } - }; - - template - struct BVHNNodePointQuerySphere1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); - else mask = pointQueryNodeSphereMB4D(node, query, time, dist); - return true; - } - }; - - template - struct BVHNNodePointQuerySphere1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = pointQueryNodeSphere((const typename BVHN::QuantizedNode*)node.quantizedNode(), query, dist); - return true; - } - }; - - template - struct BVHNQuantizedBaseNodePointQuerySphere1 - { - static __forceinline size_t pointQuery(const typename BVHN::QuantizedBaseNode* node, const TravPointQuery& query, vfloat& dist) - { - return pointQueryNodeSphere(node,query,dist); - } - - static __forceinline size_t pointQuery(const typename BVHN::QuantizedBaseNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) - { - return pointQueryNodeSphere(node,query,time,dist); - } - }; - - /*! Computes traversal information for N nodes with 1 point query */ - template - struct BVHNNodePointQueryAABB1; - - template - struct BVHNNodePointQueryAABB1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); - return true; - } - }; - - template - struct BVHNNodePointQueryAABB1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); - return true; - } - }; - - template - struct BVHNNodePointQueryAABB1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = pointQueryNodeAABBMB4D(node, query, time, dist); - return true; - } - }; - - template - struct BVHNNodePointQueryAABB1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (likely(node.isAABBNode())) mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); - else if (unlikely(node.isOBBNode())) mask = pointQueryNodeAABB(node.ungetAABBNode(), query, dist); - else return false; - return true; - } - }; - - template - struct BVHNNodePointQueryAABB1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (likely(node.isAABBNodeMB())) mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); - else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); - else return false; - return true; - } - }; - - template - struct BVHNNodePointQueryAABB1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); - else mask = pointQueryNodeAABBMB4D(node, query, time, dist); - return true; - } - }; - - template - struct BVHNNodePointQueryAABB1 - { - static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = pointQueryNodeAABB((const typename BVHN::QuantizedNode*)node.quantizedNode(), query, dist); - return true; - } - }; - - template - struct BVHNQuantizedBaseNodePointQueryAABB1 - { - static __forceinline size_t pointQuery(const typename BVHN::QuantizedBaseNode* node, const TravPointQuery& query, vfloat& dist) - { - return pointQueryNodeAABB(node,query,dist); - } - - static __forceinline size_t pointQuery(const typename BVHN::QuantizedBaseNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) - { - return pointQueryNodeAABB(node,query,time,dist); - } - }; - - - ////////////////////////////////////////////////////////////////////////////////////// - // Node intersectors used in ray traversal - ////////////////////////////////////////////////////////////////////////////////////// - - /*! Intersects N nodes with 1 ray */ - template - struct BVHNNodeIntersector1; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = intersectNode(node.getAABBNode(), ray, dist); - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = intersectNodeRobust(node.getAABBNode(), ray, dist); - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = intersectNodeMB4D(node, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = intersectNodeMB4DRobust(node, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (likely(node.isAABBNode())) mask = intersectNode(node.getAABBNode(), ray, dist); - else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); - else return false; - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (likely(node.isAABBNode())) mask = intersectNodeRobust(node.getAABBNode(), ray, dist); - else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); - else return false; - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (likely(node.isAABBNodeMB())) mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); - else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); - else return false; - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (likely(node.isAABBNodeMB())) mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); - else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); - else return false; - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); - else mask = intersectNodeMB4D(node, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); - else mask = intersectNodeMB4DRobust(node, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = intersectNode((const typename BVHN::QuantizedNode*)node.quantizedNode(), ray, dist); - return true; - } - }; - - template - struct BVHNNodeIntersector1 - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) - { - if (unlikely(node.isLeaf())) return false; - mask = intersectNodeRobust((const typename BVHN::QuantizedNode*)node.quantizedNode(), ray, dist); - return true; - } - }; - - /*! Intersects N nodes with K rays */ - template - struct BVHNQuantizedBaseNodeIntersector1; - - template - struct BVHNQuantizedBaseNodeIntersector1 - { - static __forceinline size_t intersect(const typename BVHN::QuantizedBaseNode* node, const TravRay& ray, vfloat& dist) - { - return intersectNode(node,ray,dist); - } - - static __forceinline size_t intersect(const typename BVHN::QuantizedBaseNodeMB* node, const TravRay& ray, const float time, vfloat& dist) - { - return intersectNode(node,ray,time,dist); - } - - }; - - template - struct BVHNQuantizedBaseNodeIntersector1 - { - static __forceinline size_t intersect(const typename BVHN::QuantizedBaseNode* node, const TravRay& ray, vfloat& dist) - { - return intersectNode(node,ray,dist); - } - - static __forceinline size_t intersect(const typename BVHN::QuantizedBaseNodeMB* node, const TravRay& ray, const float time, vfloat& dist) - { - return intersectNode(node,ray,time,dist); - } - - }; - - - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h deleted file mode 100644 index 800ac8b478..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h +++ /dev/null @@ -1,269 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "node_intersector.h" - -namespace embree -{ - namespace isa - { - ////////////////////////////////////////////////////////////////////////////////////// - // Frustum structure used in hybrid and stream traversal - ////////////////////////////////////////////////////////////////////////////////////// - - /* - Optimized frustum test. We calculate t=(p-org)/dir in ray/box - intersection. We assume the rays are split by octant, thus - dir intervals are either positive or negative in each - dimension. - - Case 1: dir.min >= 0 && dir.max >= 0: - t_min = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min - t_max = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max - - Case 2: dir.min < 0 && dir.max < 0: - t_min = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max - t_max = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min - */ - - template - struct Frustum; - - /* Fast variant */ - template<> - struct Frustum - { - __forceinline Frustum() {} - - template - __forceinline Frustum(const vbool& valid, const Vec3vf& org, const Vec3vf& rdir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) - { - init(valid, org, rdir, ray_tnear, ray_tfar, N); - } - - template - __forceinline void init(const vbool& valid, const Vec3vf& org, const Vec3vf& rdir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) - { - const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), - reduce_min(select(valid, org.y, pos_inf)), - reduce_min(select(valid, org.z, pos_inf))); - - const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), - reduce_max(select(valid, org.y, neg_inf)), - reduce_max(select(valid, org.z, neg_inf))); - - const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), - reduce_min(select(valid, rdir.y, pos_inf)), - reduce_min(select(valid, rdir.z, pos_inf))); - - const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), - reduce_max(select(valid, rdir.y, neg_inf)), - reduce_max(select(valid, rdir.z, neg_inf))); - - const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat(pos_inf))); - const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat(neg_inf))); - - init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); - } - - __forceinline void init(const Vec3fa& reduced_min_org, - const Vec3fa& reduced_max_org, - const Vec3fa& reduced_min_rdir, - const Vec3fa& reduced_max_rdir, - float reduced_min_dist, - float reduced_max_dist, - int N) - { - const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); - - min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); - max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); - -#if defined (__aarch64__) - neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org)); - neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org)); -#else - min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); - max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org); -#endif - min_dist = reduced_min_dist; - max_dist = reduced_max_dist; - - nf = NearFarPrecalculations(min_rdir, N); - } - - template - __forceinline void updateMaxDist(const vfloat& ray_tfar) - { - max_dist = reduce_max(ray_tfar); - } - - NearFarPrecalculations nf; - - Vec3fa min_rdir; - Vec3fa max_rdir; - -#if defined (__aarch64__) - Vec3fa neg_min_org_rdir; - Vec3fa neg_max_org_rdir; -#else - Vec3fa min_org_rdir; - Vec3fa max_org_rdir; -#endif - float min_dist; - float max_dist; - }; - - typedef Frustum FrustumFast; - - /* Robust variant */ - template<> - struct Frustum - { - __forceinline Frustum() {} - - template - __forceinline Frustum(const vbool& valid, const Vec3vf& org, const Vec3vf& rdir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) - { - init(valid, org, rdir, ray_tnear, ray_tfar, N); - } - - template - __forceinline void init(const vbool& valid, const Vec3vf& org, const Vec3vf& rdir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) - { - const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), - reduce_min(select(valid, org.y, pos_inf)), - reduce_min(select(valid, org.z, pos_inf))); - - const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), - reduce_max(select(valid, org.y, neg_inf)), - reduce_max(select(valid, org.z, neg_inf))); - - const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), - reduce_min(select(valid, rdir.y, pos_inf)), - reduce_min(select(valid, rdir.z, pos_inf))); - - const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), - reduce_max(select(valid, rdir.y, neg_inf)), - reduce_max(select(valid, rdir.z, neg_inf))); - - const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat(pos_inf))); - const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat(neg_inf))); - - init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); - } - - __forceinline void init(const Vec3fa& reduced_min_org, - const Vec3fa& reduced_max_org, - const Vec3fa& reduced_min_rdir, - const Vec3fa& reduced_max_rdir, - float reduced_min_dist, - float reduced_max_dist, - int N) - { - const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); - min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); - max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); - - min_org = select(pos_rdir, reduced_max_org, reduced_min_org); - max_org = select(pos_rdir, reduced_min_org, reduced_max_org); - - min_dist = reduced_min_dist; - max_dist = reduced_max_dist; - - nf = NearFarPrecalculations(min_rdir, N); - } - - template - __forceinline void updateMaxDist(const vfloat& ray_tfar) - { - max_dist = reduce_max(ray_tfar); - } - - NearFarPrecalculations nf; - - Vec3fa min_rdir; - Vec3fa max_rdir; - - Vec3fa min_org; - Vec3fa max_org; - - float min_dist; - float max_dist; - }; - - typedef Frustum FrustumRobust; - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast AABBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNodeFrustum(const typename BVHN::AABBNode* __restrict__ node, - const FrustumFast& frustum, vfloat& dist) - { - const vfloat bminX = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearX); - const vfloat bminY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearY); - const vfloat bminZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearZ); - const vfloat bmaxX = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farX); - const vfloat bmaxY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farY); - const vfloat bmaxZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farZ); - -#if defined (__aarch64__) - const vfloat fminX = madd(bminX, vfloat(frustum.min_rdir.x), vfloat(frustum.neg_min_org_rdir.x)); - const vfloat fminY = madd(bminY, vfloat(frustum.min_rdir.y), vfloat(frustum.neg_min_org_rdir.y)); - const vfloat fminZ = madd(bminZ, vfloat(frustum.min_rdir.z), vfloat(frustum.neg_min_org_rdir.z)); - const vfloat fmaxX = madd(bmaxX, vfloat(frustum.max_rdir.x), vfloat(frustum.neg_max_org_rdir.x)); - const vfloat fmaxY = madd(bmaxY, vfloat(frustum.max_rdir.y), vfloat(frustum.neg_max_org_rdir.y)); - const vfloat fmaxZ = madd(bmaxZ, vfloat(frustum.max_rdir.z), vfloat(frustum.neg_max_org_rdir.z)); -#else - const vfloat fminX = msub(bminX, vfloat(frustum.min_rdir.x), vfloat(frustum.min_org_rdir.x)); - const vfloat fminY = msub(bminY, vfloat(frustum.min_rdir.y), vfloat(frustum.min_org_rdir.y)); - const vfloat fminZ = msub(bminZ, vfloat(frustum.min_rdir.z), vfloat(frustum.min_org_rdir.z)); - const vfloat fmaxX = msub(bmaxX, vfloat(frustum.max_rdir.x), vfloat(frustum.max_org_rdir.x)); - const vfloat fmaxY = msub(bmaxY, vfloat(frustum.max_rdir.y), vfloat(frustum.max_org_rdir.y)); - const vfloat fmaxZ = msub(bmaxZ, vfloat(frustum.max_rdir.z), vfloat(frustum.max_org_rdir.z)); -#endif - const vfloat fmin = maxi(fminX, fminY, fminZ, vfloat(frustum.min_dist)); - dist = fmin; - const vfloat fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat(frustum.max_dist)); - const vbool vmask_node_hit = fmin <= fmax; - size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); - return m_node; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Robust AABBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNodeFrustum(const typename BVHN::AABBNode* __restrict__ node, - const FrustumRobust& frustum, vfloat& dist) - { - const vfloat bminX = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearX); - const vfloat bminY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearY); - const vfloat bminZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearZ); - const vfloat bmaxX = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farX); - const vfloat bmaxY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farY); - const vfloat bmaxZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farZ); - - const vfloat fminX = (bminX - vfloat(frustum.min_org.x)) * vfloat(frustum.min_rdir.x); - const vfloat fminY = (bminY - vfloat(frustum.min_org.y)) * vfloat(frustum.min_rdir.y); - const vfloat fminZ = (bminZ - vfloat(frustum.min_org.z)) * vfloat(frustum.min_rdir.z); - const vfloat fmaxX = (bmaxX - vfloat(frustum.max_org.x)) * vfloat(frustum.max_rdir.x); - const vfloat fmaxY = (bmaxY - vfloat(frustum.max_org.y)) * vfloat(frustum.max_rdir.y); - const vfloat fmaxZ = (bmaxZ - vfloat(frustum.max_org.z)) * vfloat(frustum.max_rdir.z); - - const float round_down = 1.0f-2.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 - const float round_up = 1.0f+2.0f*float(ulp); - const vfloat fmin = max(fminX, fminY, fminZ, vfloat(frustum.min_dist)); - dist = fmin; - const vfloat fmax = min(fmaxX, fmaxY, fmaxZ, vfloat(frustum.max_dist)); - const vbool vmask_node_hit = (round_down*fmin <= round_up*fmax); - size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); - return m_node; - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h deleted file mode 100644 index 0543e56f8e..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h +++ /dev/null @@ -1,843 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "node_intersector.h" - -namespace embree -{ - namespace isa - { - ////////////////////////////////////////////////////////////////////////////////////// - // Ray packet structure used in hybrid traversal - ////////////////////////////////////////////////////////////////////////////////////// - - template - struct TravRayK; - - /* Fast variant */ - template - struct TravRayK - { - __forceinline TravRayK() {} - - __forceinline TravRayK(const Vec3vf& ray_org, const Vec3vf& ray_dir, int N) - { - init(ray_org, ray_dir, N); - } - - __forceinline TravRayK(const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) - { - init(ray_org, ray_dir, N); - tnear = ray_tnear; - tfar = ray_tfar; - } - - __forceinline void init(const Vec3vf& ray_org, const Vec3vf& ray_dir, int N) - { - org = ray_org; - dir = ray_dir; - rdir = rcp_safe(ray_dir); -#if defined(__aarch64__) - neg_org_rdir = -(org * rdir); -#elif defined(__AVX2__) - org_rdir = org * rdir; -#endif - if (N) - { - const int size = sizeof(float)*N; - nearXYZ.x = select(rdir.x >= 0.0f, vint(0*size), vint(1*size)); - nearXYZ.y = select(rdir.y >= 0.0f, vint(2*size), vint(3*size)); - nearXYZ.z = select(rdir.z >= 0.0f, vint(4*size), vint(5*size)); - } - } - - Vec3vf org; - Vec3vf dir; - Vec3vf rdir; -#if defined(__aarch64__) - Vec3vf neg_org_rdir; -#elif defined(__AVX2__) - Vec3vf org_rdir; -#endif - Vec3vi nearXYZ; - vfloat tnear; - vfloat tfar; - }; - - template - using TravRayKFast = TravRayK; - - /* Robust variant */ - template - struct TravRayK - { - __forceinline TravRayK() {} - - __forceinline TravRayK(const Vec3vf& ray_org, const Vec3vf& ray_dir, int N) - { - init(ray_org, ray_dir, N); - } - - __forceinline TravRayK(const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) - { - init(ray_org, ray_dir, N); - tnear = ray_tnear; - tfar = ray_tfar; - } - - __forceinline void init(const Vec3vf& ray_org, const Vec3vf& ray_dir, int N) - { - org = ray_org; - dir = ray_dir; - rdir = vfloat(1.0f)/(zero_fix(ray_dir)); - - if (N) - { - const int size = sizeof(float)*N; - nearXYZ.x = select(rdir.x >= 0.0f, vint(0*size), vint(1*size)); - nearXYZ.y = select(rdir.y >= 0.0f, vint(2*size), vint(3*size)); - nearXYZ.z = select(rdir.z >= 0.0f, vint(4*size), vint(5*size)); - } - } - - Vec3vf org; - Vec3vf dir; - Vec3vf rdir; - Vec3vi nearXYZ; - vfloat tnear; - vfloat tfar; - }; - - template - using TravRayKRobust = TravRayK; - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast AABBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vbool intersectNodeK(const typename BVHN::AABBNode* node, size_t i, - const TravRayKFast& ray, vfloat& dist) - - { -#if defined(__aarch64__) - const vfloat lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); - const vfloat lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); -#elif defined(__AVX2__) - const vfloat lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z); - const vfloat lclipMaxX = msub(node->upper_x[i], ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMaxY = msub(node->upper_y[i], ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMaxZ = msub(node->upper_z[i], ray.rdir.z, ray.org_rdir.z); - #else - const vfloat lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; - #endif - - #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - if (K == 16) - { - /* use mixed float/int min/max */ - const vfloat lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); - const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); - dist = lnearP; - return lhit; - } - else - #endif - { - const vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); - #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); - #else - const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); - #endif - dist = lnearP; - return lhit; - } - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Robust AABBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vbool intersectNodeKRobust(const typename BVHN::AABBNode* node, size_t i, - const TravRayKRobust& ray, vfloat& dist) - { - // FIXME: use per instruction rounding for AVX512 - const vfloat lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; - const float round_up = 1.0f+3.0f*float(ulp); - const float round_down = 1.0f-3.0f*float(ulp); - const vfloat lnearP = round_down*max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = round_up *min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); - const vbool lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); - dist = lnearP; - return lhit; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast AABBNodeMB intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vbool intersectNodeK(const typename BVHN::AABBNodeMB* node, const size_t i, - const TravRayKFast& ray, const vfloat& time, vfloat& dist) - { - const vfloat vlower_x = madd(time, vfloat(node->lower_dx[i]), vfloat(node->lower_x[i])); - const vfloat vlower_y = madd(time, vfloat(node->lower_dy[i]), vfloat(node->lower_y[i])); - const vfloat vlower_z = madd(time, vfloat(node->lower_dz[i]), vfloat(node->lower_z[i])); - const vfloat vupper_x = madd(time, vfloat(node->upper_dx[i]), vfloat(node->upper_x[i])); - const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); - const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); - -#if defined(__aarch64__) - const vfloat lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); - const vfloat lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); -#elif defined(__AVX2__) - const vfloat lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); - const vfloat lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); -#else - const vfloat lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; -#endif - -#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - if (K == 16) - { - /* use mixed float/int min/max */ - const vfloat lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); - const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); - dist = lnearP; - return lhit; - } - else -#endif - { - const vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); -#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); -#else - const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); -#endif - dist = lnearP; - return lhit; - } - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Robust AABBNodeMB intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vbool intersectNodeKRobust(const typename BVHN::AABBNodeMB* node, const size_t i, - const TravRayKRobust& ray, const vfloat& time, vfloat& dist) - { - const vfloat vlower_x = madd(time, vfloat(node->lower_dx[i]), vfloat(node->lower_x[i])); - const vfloat vlower_y = madd(time, vfloat(node->lower_dy[i]), vfloat(node->lower_y[i])); - const vfloat vlower_z = madd(time, vfloat(node->lower_dz[i]), vfloat(node->lower_z[i])); - const vfloat vupper_x = madd(time, vfloat(node->upper_dx[i]), vfloat(node->upper_x[i])); - const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); - const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); - - const vfloat lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; - - const float round_up = 1.0f+3.0f*float(ulp); - const float round_down = 1.0f-3.0f*float(ulp); - -#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - if (K == 16) - { - const vfloat lnearP = round_down*maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = round_up *mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); - const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); - dist = lnearP; - return lhit; - } - else -#endif - { - const vfloat lnearP = round_down*maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = round_up *mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); - const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); - dist = lnearP; - return lhit; - } - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast AABBNodeMB4D intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vbool intersectNodeKMB4D(const typename BVHN::NodeRef ref, const size_t i, - const TravRayKFast& ray, const vfloat& time, vfloat& dist) - { - const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); - - const vfloat vlower_x = madd(time, vfloat(node->lower_dx[i]), vfloat(node->lower_x[i])); - const vfloat vlower_y = madd(time, vfloat(node->lower_dy[i]), vfloat(node->lower_y[i])); - const vfloat vlower_z = madd(time, vfloat(node->lower_dz[i]), vfloat(node->lower_z[i])); - const vfloat vupper_x = madd(time, vfloat(node->upper_dx[i]), vfloat(node->upper_x[i])); - const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); - const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); - -#if defined(__aarch64__) - const vfloat lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); - const vfloat lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); -#elif defined(__AVX2__) - const vfloat lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); - const vfloat lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); -#else - const vfloat lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; -#endif - - const vfloat lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); - vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); - if (unlikely(ref.isAABBNodeMB4D())) { - const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; - lhit = lhit & (vfloat(node1->lower_t[i]) <= time) & (time < vfloat(node1->upper_t[i])); - } - dist = lnearP; - return lhit; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Robust AABBNodeMB4D intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vbool intersectNodeKMB4DRobust(const typename BVHN::NodeRef ref, const size_t i, - const TravRayKRobust& ray, const vfloat& time, vfloat& dist) - { - const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); - - const vfloat vlower_x = madd(time, vfloat(node->lower_dx[i]), vfloat(node->lower_x[i])); - const vfloat vlower_y = madd(time, vfloat(node->lower_dy[i]), vfloat(node->lower_y[i])); - const vfloat vlower_z = madd(time, vfloat(node->lower_dz[i]), vfloat(node->lower_z[i])); - const vfloat vupper_x = madd(time, vfloat(node->upper_dx[i]), vfloat(node->upper_x[i])); - const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); - const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); - - const vfloat lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; - - const float round_up = 1.0f+3.0f*float(ulp); - const float round_down = 1.0f-3.0f*float(ulp); - const vfloat lnearP = round_down*maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = round_up *mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); - vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); - - if (unlikely(ref.isAABBNodeMB4D())) { - const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; - lhit = lhit & (vfloat(node1->lower_t[i]) <= time) & (time < vfloat(node1->upper_t[i])); - } - dist = lnearP; - return lhit; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast OBBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vbool intersectNodeK(const typename BVHN::OBBNode* node, const size_t i, - const TravRayK& ray, vfloat& dist) - { - const AffineSpace3vf naabb(Vec3f(node->naabb.l.vx.x[i], node->naabb.l.vx.y[i], node->naabb.l.vx.z[i]), - Vec3f(node->naabb.l.vy.x[i], node->naabb.l.vy.y[i], node->naabb.l.vy.z[i]), - Vec3f(node->naabb.l.vz.x[i], node->naabb.l.vz.y[i], node->naabb.l.vz.z[i]), - Vec3f(node->naabb.p .x[i], node->naabb.p .y[i], node->naabb.p .z[i])); - - const Vec3vf dir = xfmVector(naabb, ray.dir); - const Vec3vf nrdir = Vec3vf(vfloat(-1.0f)) * rcp_safe(dir); // FIXME: negate instead of mul with -1? - const Vec3vf org = xfmPoint(naabb, ray.org); - - const vfloat lclipMinX = org.x * nrdir.x; // (Vec3fa(zero) - org) * rdir; - const vfloat lclipMinY = org.y * nrdir.y; - const vfloat lclipMinZ = org.z * nrdir.z; - const vfloat lclipMaxX = lclipMinX - nrdir.x; // (Vec3fa(one) - org) * rdir; - const vfloat lclipMaxY = lclipMinY - nrdir.y; - const vfloat lclipMaxZ = lclipMinZ - nrdir.z; - - vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); - vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); - if (robust) { - lnearP = lnearP*vfloat(1.0f-3.0f*float(ulp)); - lfarP = lfarP *vfloat(1.0f+3.0f*float(ulp)); - } - const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); - dist = lnearP; - return lhit; - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast OBBNodeMB intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vbool intersectNodeK(const typename BVHN::OBBNodeMB* node, const size_t i, - const TravRayK& ray, const vfloat& time, vfloat& dist) - { - const AffineSpace3vf xfm(Vec3f(node->space0.l.vx.x[i], node->space0.l.vx.y[i], node->space0.l.vx.z[i]), - Vec3f(node->space0.l.vy.x[i], node->space0.l.vy.y[i], node->space0.l.vy.z[i]), - Vec3f(node->space0.l.vz.x[i], node->space0.l.vz.y[i], node->space0.l.vz.z[i]), - Vec3f(node->space0.p .x[i], node->space0.p .y[i], node->space0.p .z[i])); - - const Vec3vf b0_lower = zero; - const Vec3vf b0_upper = one; - const Vec3vf b1_lower(node->b1.lower.x[i], node->b1.lower.y[i], node->b1.lower.z[i]); - const Vec3vf b1_upper(node->b1.upper.x[i], node->b1.upper.y[i], node->b1.upper.z[i]); - const Vec3vf lower = lerp(b0_lower, b1_lower, time); - const Vec3vf upper = lerp(b0_upper, b1_upper, time); - - const Vec3vf dir = xfmVector(xfm, ray.dir); - const Vec3vf rdir = rcp_safe(dir); - const Vec3vf org = xfmPoint(xfm, ray.org); - - const vfloat lclipMinX = (lower.x - org.x) * rdir.x; - const vfloat lclipMinY = (lower.y - org.y) * rdir.y; - const vfloat lclipMinZ = (lower.z - org.z) * rdir.z; - const vfloat lclipMaxX = (upper.x - org.x) * rdir.x; - const vfloat lclipMaxY = (upper.y - org.y) * rdir.y; - const vfloat lclipMaxZ = (upper.z - org.z) * rdir.z; - - vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); - vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); - if (robust) { - lnearP = lnearP*vfloat(1.0f-3.0f*float(ulp)); - lfarP = lfarP *vfloat(1.0f+3.0f*float(ulp)); - } - - const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); - dist = lnearP; - return lhit; - } - - - - ////////////////////////////////////////////////////////////////////////////////////// - // QuantizedBaseNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline vbool intersectQuantizedNodeK(const typename BVHN::QuantizedBaseNode* node, size_t i, - const TravRayK& ray, vfloat& dist) - - { - assert(movemask(node->validMask()) & ((size_t)1 << i)); - const vfloat lower_x = node->dequantizeLowerX(); - const vfloat upper_x = node->dequantizeUpperX(); - const vfloat lower_y = node->dequantizeLowerY(); - const vfloat upper_y = node->dequantizeUpperY(); - const vfloat lower_z = node->dequantizeLowerZ(); - const vfloat upper_z = node->dequantizeUpperZ(); - - #if defined(__aarch64__) - const vfloat lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); - const vfloat lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); - #elif defined(__AVX2__) - const vfloat lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z); - const vfloat lclipMaxX = msub(upper_x[i], ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMaxY = msub(upper_y[i], ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMaxZ = msub(upper_z[i], ray.rdir.z, ray.org_rdir.z); - #else - const vfloat lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; - #endif - - #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - if (K == 16) - { - /* use mixed float/int min/max */ - const vfloat lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); - const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); - dist = lnearP; - return lhit; - } - else - #endif - { - const vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); - #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX - const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); - #else - const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); - #endif - dist = lnearP; - return lhit; - } - } - - template - __forceinline vbool intersectQuantizedNodeK(const typename BVHN::QuantizedBaseNode* node, size_t i, - const TravRayK& ray, vfloat& dist) - - { - assert(movemask(node->validMask()) & ((size_t)1 << i)); - const vfloat lower_x = node->dequantizeLowerX(); - const vfloat upper_x = node->dequantizeUpperX(); - const vfloat lower_y = node->dequantizeLowerY(); - const vfloat upper_y = node->dequantizeUpperY(); - const vfloat lower_z = node->dequantizeLowerZ(); - const vfloat upper_z = node->dequantizeUpperZ(); - - const vfloat lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; - - const float round_up = 1.0f+3.0f*float(ulp); - const float round_down = 1.0f-3.0f*float(ulp); - - const vfloat lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); - const vbool lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); - dist = lnearP; - return lhit; - } - - template - __forceinline vbool intersectQuantizedNodeMBK(const typename BVHN::QuantizedBaseNodeMB* node, const size_t i, - const TravRayK& ray, const vfloat& time, vfloat& dist) - - { - assert(movemask(node->validMask()) & ((size_t)1 << i)); - - const vfloat lower_x = node->dequantizeLowerX(i,time); - const vfloat upper_x = node->dequantizeUpperX(i,time); - const vfloat lower_y = node->dequantizeLowerY(i,time); - const vfloat upper_y = node->dequantizeUpperY(i,time); - const vfloat lower_z = node->dequantizeLowerZ(i,time); - const vfloat upper_z = node->dequantizeUpperZ(i,time); - -#if defined(__aarch64__) - const vfloat lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); - const vfloat lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); -#elif defined(__AVX2__) - const vfloat lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); - const vfloat lclipMaxX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); - const vfloat lclipMaxY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); - const vfloat lclipMaxZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); -#else - const vfloat lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; - #endif - const vfloat lnearP = max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); - const vbool lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); - dist = lnearP; - return lhit; - } - - - template - __forceinline vbool intersectQuantizedNodeMBK(const typename BVHN::QuantizedBaseNodeMB* node, const size_t i, - const TravRayK& ray, const vfloat& time, vfloat& dist) - - { - assert(movemask(node->validMask()) & ((size_t)1 << i)); - - const vfloat lower_x = node->dequantizeLowerX(i,time); - const vfloat upper_x = node->dequantizeUpperX(i,time); - const vfloat lower_y = node->dequantizeLowerY(i,time); - const vfloat upper_y = node->dequantizeUpperY(i,time); - const vfloat lower_z = node->dequantizeLowerZ(i,time); - const vfloat upper_z = node->dequantizeUpperZ(i,time); - - const vfloat lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; - const vfloat lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; - const vfloat lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; - const vfloat lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; - - const float round_up = 1.0f+3.0f*float(ulp); - const float round_down = 1.0f-3.0f*float(ulp); - - const vfloat lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); - const vfloat lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); - const vbool lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); - dist = lnearP; - return lhit; - } - - - ////////////////////////////////////////////////////////////////////////////////////// - // Node intersectors used in hybrid traversal - ////////////////////////////////////////////////////////////////////////////////////// - - /*! Intersects N nodes with K rays */ - template - struct BVHNNodeIntersectorK; - - template - struct BVHNNodeIntersectorK - { - /* vmask is both an input and an output parameter! Its initial value should be the parent node - hit mask, which is used for correctly computing the current hit mask. The parent hit mask - is actually required only for motion blur node intersections (because different rays may - have different times), so for regular nodes vmask is simply overwritten. */ - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - vmask = intersectNodeK(node.getAABBNode(), i, ray, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - vmask = intersectNodeKRobust(node.getAABBNode(), i, ray, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - vmask = intersectNodeK(node.getAABBNodeMB(), i, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - vmask = intersectNodeKRobust(node.getAABBNodeMB(), i, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - if (likely(node.isAABBNode())) vmask = intersectNodeK(node.getAABBNode(), i, ray, dist); - else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK(node.ungetAABBNode(), i, ray, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - if (likely(node.isAABBNode())) vmask = intersectNodeKRobust(node.getAABBNode(), i, ray, dist); - else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK(node.ungetAABBNode(), i, ray, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - if (likely(node.isAABBNodeMB())) vmask = intersectNodeK(node.getAABBNodeMB(), i, ray, time, dist); - else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK(node.ungetAABBNodeMB(), i, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - if (likely(node.isAABBNodeMB())) vmask = intersectNodeKRobust(node.getAABBNodeMB(), i, ray, time, dist); - else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK(node.ungetAABBNodeMB(), i, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - vmask &= intersectNodeKMB4D(node, i, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - vmask &= intersectNodeKMB4DRobust(node, i, ray, time, dist); - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { - vmask &= intersectNodeKMB4D(node, i, ray, time, dist); - } else /*if (unlikely(node.isOBBNodeMB()))*/ { - assert(node.isOBBNodeMB()); - vmask &= intersectNodeK(node.ungetAABBNodeMB(), i, ray, time, dist); - } - return true; - } - }; - - template - struct BVHNNodeIntersectorK - { - static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, - const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) - { - if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { - vmask &= intersectNodeKMB4DRobust(node, i, ray, time, dist); - } else /*if (unlikely(node.isOBBNodeMB()))*/ { - assert(node.isOBBNodeMB()); - vmask &= intersectNodeK(node.ungetAABBNodeMB(), i, ray, time, dist); - } - return true; - } - }; - - - /*! Intersects N nodes with K rays */ - template - struct BVHNQuantizedBaseNodeIntersectorK; - - template - struct BVHNQuantizedBaseNodeIntersectorK - { - static __forceinline vbool intersectK(const typename BVHN::QuantizedBaseNode* node, const size_t i, - const TravRayK& ray, vfloat& dist) - { - return intersectQuantizedNodeK(node,i,ray,dist); - } - - static __forceinline vbool intersectK(const typename BVHN::QuantizedBaseNodeMB* node, const size_t i, - const TravRayK& ray, const vfloat& time, vfloat& dist) - { - return intersectQuantizedNodeMBK(node,i,ray,time,dist); - } - - }; - - template - struct BVHNQuantizedBaseNodeIntersectorK - { - static __forceinline vbool intersectK(const typename BVHN::QuantizedBaseNode* node, const size_t i, - const TravRayK& ray, vfloat& dist) - { - return intersectQuantizedNodeK(node,i,ray,dist); - } - - static __forceinline vbool intersectK(const typename BVHN::QuantizedBaseNodeMB* node, const size_t i, - const TravRayK& ray, const vfloat& time, vfloat& dist) - { - return intersectQuantizedNodeMBK(node,i,ray,time,dist); - } - }; - - - } -} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h deleted file mode 100644 index f379b57aea..0000000000 --- a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "node_intersector.h" - -namespace embree -{ - namespace isa - { - ////////////////////////////////////////////////////////////////////////////////////// - // Ray packet structure used in stream traversal - ////////////////////////////////////////////////////////////////////////////////////// - - template - struct TravRayKStream; - - /* Fast variant */ - template - struct TravRayKStream - { - __forceinline TravRayKStream() {} - - __forceinline TravRayKStream(const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& ray_tnear, const vfloat& ray_tfar) - { - init(ray_org, ray_dir); - tnear = ray_tnear; - tfar = ray_tfar; - } - - __forceinline void init(const Vec3vf& ray_org, const Vec3vf& ray_dir) - { - rdir = rcp_safe(ray_dir); -#if defined(__aarch64__) - neg_org_rdir = -(ray_org * rdir); -#else - org_rdir = ray_org * rdir; -#endif - } - - Vec3vf rdir; -#if defined(__aarch64__) - Vec3vf neg_org_rdir; -#else - Vec3vf org_rdir; -#endif - vfloat tnear; - vfloat tfar; - }; - - template - using TravRayKStreamFast = TravRayKStream; - - /* Robust variant */ - template - struct TravRayKStream - { - __forceinline TravRayKStream() {} - - __forceinline TravRayKStream(const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& ray_tnear, const vfloat& ray_tfar) - { - init(ray_org, ray_dir); - tnear = ray_tnear; - tfar = ray_tfar; - } - - __forceinline void init(const Vec3vf& ray_org, const Vec3vf& ray_dir) - { - rdir = vfloat(1.0f)/(zero_fix(ray_dir)); - org = ray_org; - } - - Vec3vf rdir; - Vec3vf org; - vfloat tnear; - vfloat tfar; - }; - - template - using TravRayKStreamRobust = TravRayKStream; - - ////////////////////////////////////////////////////////////////////////////////////// - // Fast AABBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNode1(const typename BVHN::AABBNode* __restrict__ node, - const TravRayKStreamFast& ray, size_t k, const NearFarPrecalculations& nf) - { - const vfloat bminX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearX)); - const vfloat bminY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearY)); - const vfloat bminZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearZ)); - const vfloat bmaxX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farX)); - const vfloat bmaxY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farY)); - const vfloat bmaxZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farZ)); - -#if defined (__aarch64__) - const vfloat rminX = madd(bminX, vfloat(ray.rdir.x[k]), vfloat(ray.neg_org_rdir.x[k])); - const vfloat rminY = madd(bminY, vfloat(ray.rdir.y[k]), vfloat(ray.neg_org_rdir.y[k])); - const vfloat rminZ = madd(bminZ, vfloat(ray.rdir.z[k]), vfloat(ray.neg_org_rdir.z[k])); - const vfloat rmaxX = madd(bmaxX, vfloat(ray.rdir.x[k]), vfloat(ray.neg_org_rdir.x[k])); - const vfloat rmaxY = madd(bmaxY, vfloat(ray.rdir.y[k]), vfloat(ray.neg_org_rdir.y[k])); - const vfloat rmaxZ = madd(bmaxZ, vfloat(ray.rdir.z[k]), vfloat(ray.neg_org_rdir.z[k])); -#else - const vfloat rminX = msub(bminX, vfloat(ray.rdir.x[k]), vfloat(ray.org_rdir.x[k])); - const vfloat rminY = msub(bminY, vfloat(ray.rdir.y[k]), vfloat(ray.org_rdir.y[k])); - const vfloat rminZ = msub(bminZ, vfloat(ray.rdir.z[k]), vfloat(ray.org_rdir.z[k])); - const vfloat rmaxX = msub(bmaxX, vfloat(ray.rdir.x[k]), vfloat(ray.org_rdir.x[k])); - const vfloat rmaxY = msub(bmaxY, vfloat(ray.rdir.y[k]), vfloat(ray.org_rdir.y[k])); - const vfloat rmaxZ = msub(bmaxZ, vfloat(ray.rdir.z[k]), vfloat(ray.org_rdir.z[k])); -#endif - const vfloat rmin = maxi(rminX, rminY, rminZ, vfloat(ray.tnear[k])); - const vfloat rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat(ray.tfar[k])); - - const vbool vmask_first_hit = rmin <= rmax; - - return movemask(vmask_first_hit) & (((size_t)1 << N)-1); - } - - template - __forceinline size_t intersectNodeK(const typename BVHN::AABBNode* __restrict__ node, size_t i, - const TravRayKStreamFast& ray, const NearFarPrecalculations& nf) - { - char* ptr = (char*)&node->lower_x + i*sizeof(float); - const vfloat bminX = *(const float*)(ptr + nf.nearX); - const vfloat bminY = *(const float*)(ptr + nf.nearY); - const vfloat bminZ = *(const float*)(ptr + nf.nearZ); - const vfloat bmaxX = *(const float*)(ptr + nf.farX); - const vfloat bmaxY = *(const float*)(ptr + nf.farY); - const vfloat bmaxZ = *(const float*)(ptr + nf.farZ); - -#if defined (__aarch64__) - const vfloat rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z); - const vfloat rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x); - const vfloat rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y); - const vfloat rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z); -#else - const vfloat rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x); - const vfloat rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y); - const vfloat rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z); - const vfloat rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x); - const vfloat rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y); - const vfloat rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z); -#endif - - const vfloat rmin = maxi(rminX, rminY, rminZ, ray.tnear); - const vfloat rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar); - - const vbool vmask_first_hit = rmin <= rmax; - - return movemask(vmask_first_hit); - } - - ////////////////////////////////////////////////////////////////////////////////////// - // Robust AABBNode intersection - ////////////////////////////////////////////////////////////////////////////////////// - - template - __forceinline size_t intersectNode1(const typename BVHN::AABBNode* __restrict__ node, - const TravRayKStreamRobust& ray, size_t k, const NearFarPrecalculations& nf) - { - const vfloat bminX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearX)); - const vfloat bminY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearY)); - const vfloat bminZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearZ)); - const vfloat bmaxX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farX)); - const vfloat bmaxY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farY)); - const vfloat bmaxZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farZ)); - - const vfloat rminX = (bminX - vfloat(ray.org.x[k])) * vfloat(ray.rdir.x[k]); - const vfloat rminY = (bminY - vfloat(ray.org.y[k])) * vfloat(ray.rdir.y[k]); - const vfloat rminZ = (bminZ - vfloat(ray.org.z[k])) * vfloat(ray.rdir.z[k]); - const vfloat rmaxX = (bmaxX - vfloat(ray.org.x[k])) * vfloat(ray.rdir.x[k]); - const vfloat rmaxY = (bmaxY - vfloat(ray.org.y[k])) * vfloat(ray.rdir.y[k]); - const vfloat rmaxZ = (bmaxZ - vfloat(ray.org.z[k])) * vfloat(ray.rdir.z[k]); - const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 - const vfloat rmin = max(rminX, rminY, rminZ, vfloat(ray.tnear[k])); - const vfloat rmax = round_up *min(rmaxX, rmaxY, rmaxZ, vfloat(ray.tfar[k])); - - const vbool vmask_first_hit = rmin <= rmax; - - return movemask(vmask_first_hit) & (((size_t)1 << N)-1); - } - - template - __forceinline size_t intersectNodeK(const typename BVHN::AABBNode* __restrict__ node, size_t i, - const TravRayKStreamRobust& ray, const NearFarPrecalculations& nf) - { - char *ptr = (char*)&node->lower_x + i*sizeof(float); - const vfloat bminX = *(const float*)(ptr + nf.nearX); - const vfloat bminY = *(const float*)(ptr + nf.nearY); - const vfloat bminZ = *(const float*)(ptr + nf.nearZ); - const vfloat bmaxX = *(const float*)(ptr + nf.farX); - const vfloat bmaxY = *(const float*)(ptr + nf.farY); - const vfloat bmaxZ = *(const float*)(ptr + nf.farZ); - - const vfloat rminX = (bminX - ray.org.x) * ray.rdir.x; - const vfloat rminY = (bminY - ray.org.y) * ray.rdir.y; - const vfloat rminZ = (bminZ - ray.org.z) * ray.rdir.z; - const vfloat rmaxX = (bmaxX - ray.org.x) * ray.rdir.x; - const vfloat rmaxY = (bmaxY - ray.org.y) * ray.rdir.y; - const vfloat rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z; - - const float round_up = 1.0f+3.0f*float(ulp); - const vfloat rmin = max(rminX, rminY, rminZ, vfloat(ray.tnear)); - const vfloat rmax = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat(ray.tfar)); - - const vbool vmask_first_hit = rmin <= rmax; - - return movemask(vmask_first_hit); - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/common/accel.h b/thirdparty/embree-aarch64/kernels/common/accel.h deleted file mode 100644 index c038d3cf21..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/accel.h +++ /dev/null @@ -1,556 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "ray.h" -#include "point_query.h" -#include "context.h" - -namespace embree -{ - class Scene; - - /*! Base class for the acceleration structure data. */ - class AccelData : public RefCount - { - ALIGNED_CLASS_(16); - public: - enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 }; - - public: - AccelData (const Type type) - : bounds(empty), type(type) {} - - /*! notifies the acceleration structure about the deletion of some geometry */ - virtual void deleteGeometry(size_t geomID) {}; - - /*! clears the acceleration structure data */ - virtual void clear() = 0; - - /*! returns normal bounds */ - __forceinline BBox3fa getBounds() const { - return bounds.bounds(); - } - - /*! returns bounds for some time */ - __forceinline BBox3fa getBounds(float t) const { - return bounds.interpolate(t); - } - - /*! returns linear bounds */ - __forceinline LBBox3fa getLinearBounds() const { - return bounds; - } - - /*! checks if acceleration structure is empty */ - __forceinline bool isEmpty() const { - return bounds.bounds0.lower.x == float(pos_inf); - } - - public: - LBBox3fa bounds; // linear bounds - Type type; - }; - - /*! Base class for all intersectable and buildable acceleration structures. */ - class Accel : public AccelData - { - ALIGNED_CLASS_(16); - public: - - struct Intersectors; - - /*! Type of collide function */ - typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr); - - /*! Type of point query function */ - typedef bool(*PointQueryFunc)(Intersectors* This, /*!< this pointer to accel */ - PointQuery* query, /*!< point query for lookup */ - PointQueryContext* context); /*!< point query context */ - - /*! Type of intersect function pointer for single rays. */ - typedef void (*IntersectFunc)(Intersectors* This, /*!< this pointer to accel */ - RTCRayHit& ray, /*!< ray to intersect */ - IntersectContext* context); - - /*! Type of intersect function pointer for ray packets of size 4. */ - typedef void (*IntersectFunc4)(const void* valid, /*!< pointer to valid mask */ - Intersectors* This, /*!< this pointer to accel */ - RTCRayHit4& ray, /*!< ray packet to intersect */ - IntersectContext* context); - - /*! Type of intersect function pointer for ray packets of size 8. */ - typedef void (*IntersectFunc8)(const void* valid, /*!< pointer to valid mask */ - Intersectors* This, /*!< this pointer to accel */ - RTCRayHit8& ray, /*!< ray packet to intersect */ - IntersectContext* context); - - /*! Type of intersect function pointer for ray packets of size 16. */ - typedef void (*IntersectFunc16)(const void* valid, /*!< pointer to valid mask */ - Intersectors* This, /*!< this pointer to accel */ - RTCRayHit16& ray, /*!< ray packet to intersect */ - IntersectContext* context); - - /*! Type of intersect function pointer for ray packets of size N. */ - typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */ - RTCRayHitN** ray, /*!< ray stream to intersect */ - const size_t N, /*!< number of rays in stream */ - IntersectContext* context /*!< layout flags */); - - - /*! Type of occlusion function pointer for single rays. */ - typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */ - RTCRay& ray, /*!< ray to test occlusion */ - IntersectContext* context); - - /*! Type of occlusion function pointer for ray packets of size 4. */ - typedef void (*OccludedFunc4) (const void* valid, /*!< pointer to valid mask */ - Intersectors* This, /*!< this pointer to accel */ - RTCRay4& ray, /*!< ray packet to test occlusion. */ - IntersectContext* context); - - /*! Type of occlusion function pointer for ray packets of size 8. */ - typedef void (*OccludedFunc8) (const void* valid, /*!< pointer to valid mask */ - Intersectors* This, /*!< this pointer to accel */ - RTCRay8& ray, /*!< ray packet to test occlusion. */ - IntersectContext* context); - - /*! Type of occlusion function pointer for ray packets of size 16. */ - typedef void (*OccludedFunc16) (const void* valid, /*!< pointer to valid mask */ - Intersectors* This, /*!< this pointer to accel */ - RTCRay16& ray, /*!< ray packet to test occlusion. */ - IntersectContext* context); - - /*! Type of intersect function pointer for ray packets of size N. */ - typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */ - RTCRayN** ray, /*!< ray stream to test occlusion */ - const size_t N, /*!< number of rays in stream */ - IntersectContext* context /*!< layout flags */); - typedef void (*ErrorFunc) (); - - struct Collider - { - Collider (ErrorFunc error = nullptr) - : collide((CollideFunc)error), name(nullptr) {} - - Collider (CollideFunc collide, const char* name) - : collide(collide), name(name) {} - - operator bool() const { return name; } - - public: - CollideFunc collide; - const char* name; - }; - - struct Intersector1 - { - Intersector1 (ErrorFunc error = nullptr) - : intersect((IntersectFunc)error), occluded((OccludedFunc)error), name(nullptr) {} - - Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name) - : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {} - - Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name) - : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {} - - operator bool() const { return name; } - - public: - static const char* type; - IntersectFunc intersect; - OccludedFunc occluded; - PointQueryFunc pointQuery; - const char* name; - }; - - struct Intersector4 - { - Intersector4 (ErrorFunc error = nullptr) - : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {} - - Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name) - : intersect(intersect), occluded(occluded), name(name) {} - - operator bool() const { return name; } - - public: - static const char* type; - IntersectFunc4 intersect; - OccludedFunc4 occluded; - const char* name; - }; - - struct Intersector8 - { - Intersector8 (ErrorFunc error = nullptr) - : intersect((IntersectFunc8)error), occluded((OccludedFunc8)error), name(nullptr) {} - - Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name) - : intersect(intersect), occluded(occluded), name(name) {} - - operator bool() const { return name; } - - public: - static const char* type; - IntersectFunc8 intersect; - OccludedFunc8 occluded; - const char* name; - }; - - struct Intersector16 - { - Intersector16 (ErrorFunc error = nullptr) - : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {} - - Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name) - : intersect(intersect), occluded(occluded), name(name) {} - - operator bool() const { return name; } - - public: - static const char* type; - IntersectFunc16 intersect; - OccludedFunc16 occluded; - const char* name; - }; - - struct IntersectorN - { - IntersectorN (ErrorFunc error = nullptr) - : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} - - IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) - : intersect(intersect), occluded(occluded), name(name) {} - - operator bool() const { return name; } - - public: - static const char* type; - IntersectFuncN intersect; - OccludedFuncN occluded; - const char* name; - }; - - struct Intersectors - { - Intersectors() - : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {} - - Intersectors (ErrorFunc error) - : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {} - - void print(size_t ident) - { - if (collider.name) { - for (size_t i=0; ibuild(); - bounds = accel->bounds; - } - - void deleteGeometry(size_t geomID) { - if (accel ) accel->deleteGeometry(geomID); - if (builder) builder->deleteGeometry(geomID); - } - - void clear() { - if (accel) accel->clear(); - if (builder) builder->clear(); - } - - private: - std::unique_ptr accel; - std::unique_ptr builder; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.cpp b/thirdparty/embree-aarch64/kernels/common/acceln.cpp deleted file mode 100644 index aadb4a64ef..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/acceln.cpp +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "acceln.h" -#include "ray.h" -#include "../../include/embree3/rtcore_ray.h" -#include "../../common/algorithms/parallel_for.h" - -namespace embree -{ - AccelN::AccelN() - : Accel(AccelData::TY_ACCELN), accels() {} - - AccelN::~AccelN() - { - for (size_t i=0; iptr; - for (size_t i=0; iaccels.size(); i++) - if (!This->accels[i]->isEmpty()) - changed |= This->accels[i]->intersectors.pointQuery(query,context); - return changed; - } - - void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - for (size_t i=0; iaccels.size(); i++) - if (!This->accels[i]->isEmpty()) - This->accels[i]->intersectors.intersect(ray,context); - } - - void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - for (size_t i=0; iaccels.size(); i++) - if (!This->accels[i]->isEmpty()) - This->accels[i]->intersectors.intersect4(valid,ray,context); - } - - void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - for (size_t i=0; iaccels.size(); i++) - if (!This->accels[i]->isEmpty()) - This->accels[i]->intersectors.intersect8(valid,ray,context); - } - - void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - for (size_t i=0; iaccels.size(); i++) - if (!This->accels[i]->isEmpty()) - This->accels[i]->intersectors.intersect16(valid,ray,context); - } - - void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - for (size_t i=0; iaccels.size(); i++) - if (!This->accels[i]->isEmpty()) - This->accels[i]->intersectors.intersectN(ray,N,context); - } - - void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - for (size_t i=0; iaccels.size(); i++) { - if (This->accels[i]->isEmpty()) continue; - This->accels[i]->intersectors.occluded(ray,context); - if (ray.tfar < 0.0f) break; - } - } - - void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - for (size_t i=0; iaccels.size(); i++) { - if (This->accels[i]->isEmpty()) continue; - This->accels[i]->intersectors.occluded4(valid,ray,context); -#if defined(__SSE2__) || defined(__ARM_NEON) - vbool4 valid0 = asBool(((vint4*)valid)[0]); - vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); - if (unlikely(none(valid0 & hit0))) break; -#endif - } - } - - void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - for (size_t i=0; iaccels.size(); i++) { - if (This->accels[i]->isEmpty()) continue; - This->accels[i]->intersectors.occluded8(valid,ray,context); -#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA - vbool4 valid0 = asBool(((vint4*)valid)[0]); - vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); - vbool4 valid1 = asBool(((vint4*)valid)[1]); - vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); - if (unlikely((none((valid0 & hit0) | (valid1 & hit1))))) break; -#endif - } - } - - void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - for (size_t i=0; iaccels.size(); i++) { - if (This->accels[i]->isEmpty()) continue; - This->accels[i]->intersectors.occluded16(valid,ray,context); -#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA - vbool4 valid0 = asBool(((vint4*)valid)[0]); - vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); - vbool4 valid1 = asBool(((vint4*)valid)[1]); - vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); - vbool4 valid2 = asBool(((vint4*)valid)[2]); - vbool4 hit2 = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero); - vbool4 valid3 = asBool(((vint4*)valid)[3]); - vbool4 hit3 = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero); - if (unlikely((none((valid0 & hit0) | (valid1 & hit1) | (valid2 & hit2) | (valid3 & hit3))))) break; -#endif - } - } - - void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context) - { - AccelN* This = (AccelN*)This_in->ptr; - size_t M = N; - for (size_t i=0; iaccels.size(); i++) - if (!This->accels[i]->isEmpty()) - This->accels[i]->intersectors.occludedN(ray,M,context); - } - - void AccelN::accels_print(size_t ident) - { - for (size_t i=0; iintersectors.print(ident+2); - } - } - - void AccelN::accels_immutable() - { - for (size_t i=0; iimmutable(); - } - - void AccelN::accels_build () - { - /* reduce memory consumption */ - accels.shrink_to_fit(); - - /* build all acceleration structures in parallel */ - parallel_for (accels.size(), [&] (size_t i) { - accels[i]->build(); - }); - - /* create list of non-empty acceleration structures */ - bool valid1 = true; - bool valid4 = true; - bool valid8 = true; - bool valid16 = true; - for (size_t i=0; iintersectors.intersector1; - valid4 &= (bool) accels[i]->intersectors.intersector4; - valid8 &= (bool) accels[i]->intersectors.intersector8; - valid16 &= (bool) accels[i]->intersectors.intersector16; - } - - if (accels.size() == 1) { - type = accels[0]->type; // FIXME: should just assign entire Accel - bounds = accels[0]->bounds; - intersectors = accels[0]->intersectors; - } - else - { - type = AccelData::TY_ACCELN; - intersectors.ptr = this; - intersectors.intersector1 = Intersector1(&intersect,&occluded,&pointQuery,valid1 ? "AccelN::intersector1": nullptr); - intersectors.intersector4 = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr); - intersectors.intersector8 = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr); - intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr); - intersectors.intersectorN = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN"); - - /*! calculate bounds */ - bounds = empty; - for (size_t i=0; ibounds); - } - } - - void AccelN::accels_select(bool filter) - { - for (size_t i=0; iintersectors.select(filter); - } - - void AccelN::accels_deleteGeometry(size_t geomID) - { - for (size_t i=0; ideleteGeometry(geomID); - } - - void AccelN::accels_clear() - { - for (size_t i=0; iclear(); - } - } -} - diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.h b/thirdparty/embree-aarch64/kernels/common/acceln.h deleted file mode 100644 index 2edd98f647..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/acceln.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "accel.h" - -namespace embree -{ - /*! merges N acceleration structures together, by processing them in order */ - class AccelN : public Accel - { - public: - AccelN (); - ~AccelN(); - - public: - void accels_add(Accel* accel); - void accels_init(); - - public: - static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); - - public: - static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context); - static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context); - static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context); - static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context); - static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context); - - public: - static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context); - static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context); - static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context); - static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context); - static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context); - - public: - void accels_print(size_t ident); - void accels_immutable(); - void accels_build (); - void accels_select(bool filter); - void accels_deleteGeometry(size_t geomID); - void accels_clear (); - - public: - std::vector accels; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.cpp b/thirdparty/embree-aarch64/kernels/common/accelset.cpp deleted file mode 100644 index 79be1c4301..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/accelset.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "accelset.h" -#include "scene.h" - -namespace embree -{ - AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps) - : Geometry(device,gtype,(unsigned int)numItems,(unsigned int)numTimeSteps), boundsFunc(nullptr) {} - - AccelSet::IntersectorN::IntersectorN (ErrorFunc error) - : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} - - AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) - : intersect(intersect), occluded(occluded), name(name) {} -} diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.h b/thirdparty/embree-aarch64/kernels/common/accelset.h deleted file mode 100644 index 3774b2accb..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/accelset.h +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "builder.h" -#include "geometry.h" -#include "ray.h" -#include "hit.h" - -namespace embree -{ - struct IntersectFunctionNArguments; - struct OccludedFunctionNArguments; - - typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); - typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); - - struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments - { - IntersectContext* internal_context; - Geometry* geometry; - ReportIntersectionFunc report; - }; - - struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments - { - IntersectContext* internal_context; - Geometry* geometry; - ReportOcclusionFunc report; - }; - - /*! Base class for set of acceleration structures. */ - class AccelSet : public Geometry - { - public: - typedef RTCIntersectFunctionN IntersectFuncN; - typedef RTCOccludedFunctionN OccludedFuncN; - typedef void (*ErrorFunc) (); - - struct IntersectorN - { - IntersectorN (ErrorFunc error = nullptr) ; - IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name); - - operator bool() const { return name; } - - public: - static const char* type; - IntersectFuncN intersect; - OccludedFuncN occluded; - const char* name; - }; - - public: - - /*! construction */ - AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps); - - /*! makes the acceleration structure immutable */ - virtual void immutable () {} - - /*! build accel */ - virtual void build () = 0; - - /*! check if the i'th primitive is valid between the specified time range */ - __forceinline bool valid(size_t i, const range& itime_range) const - { - for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) - if (!isvalid_non_empty(bounds(i,itime))) return false; - - return true; - } - - /*! Calculates the bounds of an item */ - __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const - { - BBox3fa box; - assert(i < size()); - RTCBoundsFunctionArguments args; - args.geometryUserPtr = userPtr; - args.primID = (unsigned int)i; - args.timeStep = (unsigned int)itime; - args.bounds_o = (RTCBounds*)&box; - boundsFunc(&args); - return box; - } - - /*! calculates the linear bounds of the i'th item at the itime'th time segment */ - __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const - { - BBox3fa box[2]; - assert(i < size()); - RTCBoundsFunctionArguments args; - args.geometryUserPtr = userPtr; - args.primID = (unsigned int)i; - args.timeStep = (unsigned int)(itime+0); - args.bounds_o = (RTCBounds*)&box[0]; - boundsFunc(&args); - args.timeStep = (unsigned int)(itime+1); - args.bounds_o = (RTCBounds*)&box[1]; - boundsFunc(&args); - return LBBox3fa(box[0],box[1]); - } - - /*! calculates the build bounds of the i'th item, if it's valid */ - __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const - { - const BBox3fa b = bounds(i); - if (bbox) *bbox = b; - return isvalid_non_empty(b); - } - - /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ - __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const - { - const LBBox3fa bounds = linearBounds(i,itime); - bbox = bounds.bounds0; // use bounding box of first timestep to build BVH - return isvalid_non_empty(bounds); - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { - return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const { - if (!valid(i, timeSegmentRange(time_range))) return false; - bbox = linearBounds(i, time_range); - return true; - } - - /* gets version info of topology */ - unsigned int getTopologyVersion() const { - return numPrimitives; - } - - /* returns true if topology changed */ - bool topologyChanged(unsigned int otherVersion) const { - return numPrimitives != otherVersion; - } - - public: - - /*! Intersects a single ray with the scene. */ - __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) - { - assert(primID < size()); - assert(intersectorN.intersect); - - int mask = -1; - IntersectFunctionNArguments args; - args.valid = &mask; - args.geometryUserPtr = userPtr; - args.context = context->user; - args.rayhit = (RTCRayHitN*)&ray; - args.N = 1; - args.geomID = geomID; - args.primID = primID; - args.internal_context = context; - args.geometry = this; - args.report = report; - - intersectorN.intersect(&args); - } - - /*! Tests if single ray is occluded by the scene. */ - __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) - { - assert(primID < size()); - assert(intersectorN.occluded); - - int mask = -1; - OccludedFunctionNArguments args; - args.valid = &mask; - args.geometryUserPtr = userPtr; - args.context = context->user; - args.ray = (RTCRayN*)&ray; - args.N = 1; - args.geomID = geomID; - args.primID = primID; - args.internal_context = context; - args.geometry = this; - args.report = report; - - intersectorN.occluded(&args); - } - - /*! Intersects a packet of K rays with the scene. */ - template - __forceinline void intersect (const vbool& valid, RayHitK& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) - { - assert(primID < size()); - assert(intersectorN.intersect); - - vint mask = valid.mask32(); - IntersectFunctionNArguments args; - args.valid = (int*)&mask; - args.geometryUserPtr = userPtr; - args.context = context->user; - args.rayhit = (RTCRayHitN*)&ray; - args.N = K; - args.geomID = geomID; - args.primID = primID; - args.internal_context = context; - args.geometry = this; - args.report = report; - - intersectorN.intersect(&args); - } - - /*! Tests if a packet of K rays is occluded by the scene. */ - template - __forceinline void occluded (const vbool& valid, RayK& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) - { - assert(primID < size()); - assert(intersectorN.occluded); - - vint mask = valid.mask32(); - OccludedFunctionNArguments args; - args.valid = (int*)&mask; - args.geometryUserPtr = userPtr; - args.context = context->user; - args.ray = (RTCRayN*)&ray; - args.N = K; - args.geomID = geomID; - args.primID = primID; - args.internal_context = context; - args.geometry = this; - args.report = report; - - intersectorN.occluded(&args); - } - - public: - RTCBoundsFunction boundsFunc; - IntersectorN intersectorN; - }; - -#define DEFINE_SET_INTERSECTORN(symbol,intersector) \ - AccelSet::IntersectorN symbol() { \ - return AccelSet::IntersectorN(intersector::intersect, \ - intersector::occluded, \ - TOSTRING(isa) "::" TOSTRING(symbol)); \ - } -} diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.cpp b/thirdparty/embree-aarch64/kernels/common/alloc.cpp deleted file mode 100644 index 6fa406f03a..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/alloc.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "alloc.h" -#include "../../common/sys/thread.h" -#if defined(__aarch64__) && defined(BUILD_IOS) -#include "../../common/sys/barrier.h" -#endif - -namespace embree -{ - __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr; - SpinLock FastAllocator::s_thread_local_allocators_lock; - std::vector> FastAllocator::s_thread_local_allocators; - - struct fast_allocator_regression_test : public RegressionTest - { - BarrierSys barrier; - std::atomic numFailed; - std::unique_ptr alloc; - - fast_allocator_regression_test() - : RegressionTest("fast_allocator_regression_test"), numFailed(0) - { - registerRegressionTest(this); - } - - static void thread_alloc(fast_allocator_regression_test* This) - { - FastAllocator::CachedAllocator threadalloc = This->alloc->getCachedAllocator(); - - size_t* ptrs[1000]; - for (size_t j=0; j<1000; j++) - { - This->barrier.wait(); - for (size_t i=0; i<1000; i++) { - ptrs[i] = (size_t*) threadalloc.malloc0(sizeof(size_t)+(i%32)); - *ptrs[i] = size_t(threadalloc.talloc0) + i; - } - for (size_t i=0; i<1000; i++) { - if (*ptrs[i] != size_t(threadalloc.talloc0) + i) - This->numFailed++; - } - This->barrier.wait(); - } - } - - bool run () - { - alloc = make_unique(new FastAllocator(nullptr,false)); - numFailed.store(0); - - size_t numThreads = getNumberOfLogicalThreads(); - barrier.init(numThreads+1); - - /* create threads */ - std::vector threads; - for (size_t i=0; ireset(); - barrier.wait(); - barrier.wait(); - } - - /* destroy threads */ - for (size_t i=0; i -#endif - -namespace embree -{ - class FastAllocator - { - /*! maximum supported alignment */ - static const size_t maxAlignment = 64; - - /*! maximum allocation size */ - - /* default settings */ - //static const size_t defaultBlockSize = 4096; -#define maxAllocationSize size_t(2*1024*1024-maxAlignment) - - static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8; - - public: - - struct ThreadLocal2; - enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE }; - - /*! Per thread structure holding the current memory block. */ - struct __aligned(64) ThreadLocal - { - ALIGNED_CLASS_(64); - public: - - /*! Constructor for usage with ThreadLocalData */ - __forceinline ThreadLocal (ThreadLocal2* parent) - : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {} - - /*! initialize allocator */ - void init(FastAllocator* alloc) - { - ptr = nullptr; - cur = end = 0; - bytesUsed = 0; - bytesWasted = 0; - allocBlockSize = 0; - if (alloc) allocBlockSize = alloc->defaultBlockSize; - } - - /* Allocate aligned memory from the threads memory block. */ - __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) - { - /* bind the thread local allocator to the proper FastAllocator*/ - parent->bind(alloc); - - assert(align <= maxAlignment); - bytesUsed += bytes; - - /* try to allocate in local block */ - size_t ofs = (align - cur) & (align-1); - cur += bytes + ofs; - if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } - cur -= bytes + ofs; - - /* if allocation is too large allocate with parent allocator */ - if (4*bytes > allocBlockSize) { - return alloc->malloc(bytes,maxAlignment,false); - } - - /* get new partial block if allocation failed */ - size_t blockSize = allocBlockSize; - ptr = (char*) alloc->malloc(blockSize,maxAlignment,true); - bytesWasted += end-cur; - cur = 0; end = blockSize; - - /* retry allocation */ - ofs = (align - cur) & (align-1); - cur += bytes + ofs; - if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } - cur -= bytes + ofs; - - /* get new full block if allocation failed */ - blockSize = allocBlockSize; - ptr = (char*) alloc->malloc(blockSize,maxAlignment,false); - bytesWasted += end-cur; - cur = 0; end = blockSize; - - /* retry allocation */ - ofs = (align - cur) & (align-1); - cur += bytes + ofs; - if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } - cur -= bytes + ofs; - - /* should never happen as large allocations get handled specially above */ - assert(false); - return nullptr; - } - - - /*! returns amount of used bytes */ - __forceinline size_t getUsedBytes() const { return bytesUsed; } - - /*! returns amount of free bytes */ - __forceinline size_t getFreeBytes() const { return end-cur; } - - /*! returns amount of wasted bytes */ - __forceinline size_t getWastedBytes() const { return bytesWasted; } - - private: - ThreadLocal2* parent; - char* ptr; //!< pointer to memory block - size_t cur; //!< current location of the allocator - size_t end; //!< end of the memory block - size_t allocBlockSize; //!< block size for allocations - size_t bytesUsed; //!< number of total bytes allocated - size_t bytesWasted; //!< number of bytes wasted - }; - - /*! Two thread local structures. */ - struct __aligned(64) ThreadLocal2 - { - ALIGNED_CLASS_(64); - public: - - __forceinline ThreadLocal2() - : alloc(nullptr), alloc0(this), alloc1(this) {} - - /*! bind to fast allocator */ - __forceinline void bind(FastAllocator* alloc_i) - { - assert(alloc_i); - if (alloc.load() == alloc_i) return; -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(mutex); -#else - Lock lock(mutex); -#endif - //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind - if (alloc.load()) { - alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); - alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); - alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); - } - alloc0.init(alloc_i); - alloc1.init(alloc_i); - alloc.store(alloc_i); - alloc_i->join(this); - } - - /*! unbind to fast allocator */ - void unbind(FastAllocator* alloc_i) - { - assert(alloc_i); - if (alloc.load() != alloc_i) return; -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(mutex); -#else - Lock lock(mutex); -#endif - if (alloc.load() != alloc_i) return; // required as a different thread calls unbind - alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); - alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); - alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); - alloc0.init(nullptr); - alloc1.init(nullptr); - alloc.store(nullptr); - } - - public: -#if defined(__aarch64__) && defined(BUILD_IOS) - std::mutex mutex; -#else - SpinLock mutex; //!< required as unbind is called from other threads -#endif - std::atomic alloc; //!< parent allocator - ThreadLocal alloc0; - ThreadLocal alloc1; - }; - - FastAllocator (Device* device, bool osAllocation) - : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), - growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC), - primrefarray(device,0) - { - for (size_t i=0; i& primrefarray_i) { - primrefarray = std::move(primrefarray_i); - } - - void unshare(mvector& primrefarray_o) - { - reset(); // this removes blocks that are allocated inside the shared primref array - primrefarray_o = std::move(primrefarray); - } - - /*! returns first fast thread local allocator */ - __forceinline ThreadLocal* _threadLocal() { - return &threadLocal2()->alloc0; - } - - void setOSallocation(bool flag) - { - atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC; - } - - private: - - /*! returns both fast thread local allocators */ - __forceinline ThreadLocal2* threadLocal2() - { - ThreadLocal2* alloc = thread_local_allocator2; - if (alloc == nullptr) { - thread_local_allocator2 = alloc = new ThreadLocal2; -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(s_thread_local_allocators_lock); -#else - Lock lock(s_thread_local_allocators_lock); -#endif - s_thread_local_allocators.push_back(make_unique(alloc)); - } - return alloc; - } - - public: - - __forceinline void join(ThreadLocal2* alloc) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(s_thread_local_allocators_lock); -#else - Lock lock(thread_local_allocators_lock); -#endif - thread_local_allocators.push_back(alloc); - } - - public: - - struct CachedAllocator - { - __forceinline CachedAllocator(void* ptr) - : alloc(nullptr), talloc0(nullptr), talloc1(nullptr) - { - assert(ptr == nullptr); - } - - __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc) - : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {} - - __forceinline operator bool () const { - return alloc != nullptr; - } - - __forceinline void* operator() (size_t bytes, size_t align = 16) const { - return talloc0->malloc(alloc,bytes,align); - } - - __forceinline void* malloc0 (size_t bytes, size_t align = 16) const { - return talloc0->malloc(alloc,bytes,align); - } - - __forceinline void* malloc1 (size_t bytes, size_t align = 16) const { - return talloc1->malloc(alloc,bytes,align); - } - - public: - FastAllocator* alloc; - ThreadLocal* talloc0; - ThreadLocal* talloc1; - }; - - __forceinline CachedAllocator getCachedAllocator() { - return CachedAllocator(this,threadLocal2()); - } - - /*! Builder interface to create thread local allocator */ - struct Create - { - public: - __forceinline Create (FastAllocator* allocator) : allocator(allocator) {} - __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); } - - private: - FastAllocator* allocator; - }; - - void internal_fix_used_blocks() - { - /* move thread local blocks to global block list */ - for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++) - { - while (threadBlocks[i].load() != nullptr) { - Block* nextUsedBlock = threadBlocks[i].load()->next; - threadBlocks[i].load()->next = usedBlocks.load(); - usedBlocks = threadBlocks[i].load(); - threadBlocks[i] = nextUsedBlock; - } - threadBlocks[i] = nullptr; - } - } - - static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks -#if defined(__AVX512ER__) // KNL - static const size_t mainAllocOverheadStatic = 15; //! 15 means 7.5% allocation overhead through unfilled main alloc blocks -#else - static const size_t mainAllocOverheadStatic = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks -#endif - static const size_t mainAllocOverheadDynamic = 8; //! 20 means 12.5% allocation overhead through unfilled main alloc blocks - - /* calculates a single threaded threshold for the builders such - * that for small scenes the overhead of partly allocated blocks - * per thread is low */ - size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated) - { - if (numPrimitives == 0 || bytesEstimated == 0) - return defaultThreshold; - - /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */ - const size_t single_mode_factor = use_single_mode ? 1 : 2; - const size_t threadCount = TaskScheduler::threadCount(); - const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize; - - /* if we do not have to limit number of threads use optimal thresdhold */ - if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) - return defaultThreshold; - - /* otherwise limit number of threads by calculating proper single thread threshold */ - else { - double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives); - return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive)); - } - } - - __forceinline size_t alignSize(size_t i) { - return (i+127)/128*128; - } - - /*! initializes the grow size */ - __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) - { - /* we do not need single thread local allocator mode */ - use_single_mode = false; - - /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */ - size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic; - size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead); - growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize); - - /* if we reached the maxAllocationSize for growSize, we can - * increase the number of allocation slots by still guaranteeing - * the mainAllocationOverhead */ - slotMask = 0x0; - - if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1; - if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3; - if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7; - if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */ - - /* set the thread local alloc block size */ - size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment; - - /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */ -#if 0 // we do not do this as a block size of 4160 if for some reason best for KNL - const size_t threadCount = TaskScheduler::threadCount(); - const size_t single_mode_factor = use_single_mode ? 1 : 2; - const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch; - if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) - defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize); - - /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */ - else -#endif - defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch); - - if (bytesEstimated == 0) { - maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size - defaultBlockSize = defaultBlockSizeSwitch; - } - log2_grow_size_scale = 0; - - if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size; - if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0; - if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1; - if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3; - if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7; - if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size; - if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc; - } - - /*! initializes the allocator */ - void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate) - { - internal_fix_used_blocks(); - /* distribute the allocation to multiple thread block slots */ - slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove - if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } - if (bytesReserve == 0) bytesReserve = bytesAllocate; - freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype); - estimatedSize = bytesEstimate; - initGrowSizeAndNumSlots(bytesEstimate,true); - } - - /*! initializes the allocator */ - void init_estimate(size_t bytesEstimate) - { - internal_fix_used_blocks(); - if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } - /* single allocator mode ? */ - estimatedSize = bytesEstimate; - //initGrowSizeAndNumSlots(bytesEstimate,false); - initGrowSizeAndNumSlots(bytesEstimate,false); - - } - - /*! frees state not required after build */ - __forceinline void cleanup() - { - internal_fix_used_blocks(); - - /* unbind all thread local allocators */ - for (auto alloc : thread_local_allocators) alloc->unbind(this); - thread_local_allocators.clear(); - } - - /*! resets the allocator, memory blocks get reused */ - void reset () - { - internal_fix_used_blocks(); - - bytesUsed.store(0); - bytesFree.store(0); - bytesWasted.store(0); - - /* reset all used blocks and move them to begin of free block list */ - while (usedBlocks.load() != nullptr) { - usedBlocks.load()->reset_block(); - Block* nextUsedBlock = usedBlocks.load()->next; - usedBlocks.load()->next = freeBlocks.load(); - freeBlocks = usedBlocks.load(); - usedBlocks = nextUsedBlock; - } - - /* remove all shared blocks as they are re-added during build */ - freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load())); - - for (size_t i=0; iunbind(this); - thread_local_allocators.clear(); - } - - /*! frees all allocated memory */ - __forceinline void clear() - { - cleanup(); - bytesUsed.store(0); - bytesFree.store(0); - bytesWasted.store(0); - if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr; - if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr; - for (size_t i=0; imalloc(device,bytes,align,partial); - if (ptr) return ptr; - } - - /* throw error if allocation is too large */ - if (bytes > maxAllocationSize) - throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large"); - - /* parallel block creation in case of no freeBlocks, avoids single global mutex */ - if (likely(freeBlocks.load() == nullptr)) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(slotMutex[slot]); -#else - Lock lock(slotMutex[slot]); -#endif - if (myUsedBlocks == threadUsedBlocks[slot]) { - const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); - const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); - assert(allocSize >= bytes); - threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here! - // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail. - } - continue; - } - - /* if this fails allocate new block */ - { -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(mutex); -#else - Lock lock(mutex); -#endif - if (myUsedBlocks == threadUsedBlocks[slot]) - { - if (freeBlocks.load() != nullptr) { - Block* nextFreeBlock = freeBlocks.load()->next; - freeBlocks.load()->next = usedBlocks; - __memory_barrier(); - usedBlocks = freeBlocks.load(); - threadUsedBlocks[slot] = freeBlocks.load(); - freeBlocks = nextFreeBlock; - } else { - const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize); - usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above! - } - } - } - } - } - - /*! add new block */ - void addBlock(void* ptr, ssize_t bytes) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(mutex); -#else - Lock lock(mutex); -#endif - const size_t sizeof_Header = offsetof(Block,data[0]); - void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); - size_t ofs = (size_t) aptr - (size_t) ptr; - bytes -= ofs; - if (bytes < 4096) return; // ignore empty or very small blocks - freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs); - } - - /* special allocation only used from morton builder only a single time for each build */ - void* specialAlloc(size_t bytes) - { - assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes); - return freeBlocks.load()->ptr(); - } - - struct Statistics - { - Statistics () - : bytesUsed(0), bytesFree(0), bytesWasted(0) {} - - Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted) - : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {} - - Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false) - : bytesUsed(0), bytesFree(0), bytesWasted(0) - { - Block* usedBlocks = alloc->usedBlocks.load(); - Block* freeBlocks = alloc->freeBlocks.load(); - if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages); - if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages); - if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages); - if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages); - if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages); - } - - std::string str(size_t numPrimitives) - { - std::stringstream str; - str.setf(std::ios::fixed, std::ios::floatfield); - str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " - << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " - << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " - << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, " - << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives); - return str.str(); - } - - friend Statistics operator+ ( const Statistics& a, const Statistics& b) - { - return Statistics(a.bytesUsed+b.bytesUsed, - a.bytesFree+b.bytesFree, - a.bytesWasted+b.bytesWasted); - } - - size_t bytesAllocatedTotal() const { - return bytesUsed + bytesFree + bytesWasted; - } - - public: - size_t bytesUsed; - size_t bytesFree; - size_t bytesWasted; - }; - - Statistics getStatistics(AllocationType atype, bool huge_pages = false) { - return Statistics(this,atype,huge_pages); - } - - size_t getUsedBytes() { - return bytesUsed; - } - - size_t getWastedBytes() { - return bytesWasted; - } - - struct AllStatistics - { - AllStatistics (FastAllocator* alloc) - - : bytesUsed(alloc->bytesUsed), - bytesFree(alloc->bytesFree), - bytesWasted(alloc->bytesWasted), - stat_all(alloc,ANY_TYPE), - stat_malloc(alloc,ALIGNED_MALLOC), - stat_4K(alloc,EMBREE_OS_MALLOC,false), - stat_2M(alloc,EMBREE_OS_MALLOC,true), - stat_shared(alloc,SHARED) {} - - AllStatistics (size_t bytesUsed, - size_t bytesFree, - size_t bytesWasted, - Statistics stat_all, - Statistics stat_malloc, - Statistics stat_4K, - Statistics stat_2M, - Statistics stat_shared) - - : bytesUsed(bytesUsed), - bytesFree(bytesFree), - bytesWasted(bytesWasted), - stat_all(stat_all), - stat_malloc(stat_malloc), - stat_4K(stat_4K), - stat_2M(stat_2M), - stat_shared(stat_shared) {} - - friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b) - { - return AllStatistics(a.bytesUsed+b.bytesUsed, - a.bytesFree+b.bytesFree, - a.bytesWasted+b.bytesWasted, - a.stat_all + b.stat_all, - a.stat_malloc + b.stat_malloc, - a.stat_4K + b.stat_4K, - a.stat_2M + b.stat_2M, - a.stat_shared + b.stat_shared); - } - - void print(size_t numPrimitives) - { - std::stringstream str0; - str0.setf(std::ios::fixed, std::ios::floatfield); - str0 << " alloc : " - << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " - << " " - << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives); - std::cout << str0.str() << std::endl; - - std::stringstream str1; - str1.setf(std::ios::fixed, std::ios::floatfield); - str1 << " alloc : " - << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " - << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " - << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " - << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, " - << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives); - std::cout << str1.str() << std::endl; - - std::cout << " total : " << stat_all.str(numPrimitives) << std::endl; - std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl; - std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl; - std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl; - std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl; - } - - private: - size_t bytesUsed; - size_t bytesFree; - size_t bytesWasted; - Statistics stat_all; - Statistics stat_malloc; - Statistics stat_4K; - Statistics stat_2M; - Statistics stat_shared; - }; - - void print_blocks() - { - std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl; - - std::cout << " used blocks = "; - if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list(); - std::cout << "[END]" << std::endl; - - std::cout << " free blocks = "; - if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list(); - std::cout << "[END]" << std::endl; - } - - private: - - struct Block - { - static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype) - { - /* We avoid using os_malloc for small blocks as this could - * cause a risk of fragmenting the virtual address space and - * reach the limit of vm.max_map_count = 65k under Linux. */ - if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize) - atype = ALIGNED_MALLOC; - - /* we need to additionally allocate some header */ - const size_t sizeof_Header = offsetof(Block,data[0]); - bytesAllocate = sizeof_Header+bytesAllocate; - bytesReserve = sizeof_Header+bytesReserve; - - /* consume full 4k pages with using os_malloc */ - if (atype == EMBREE_OS_MALLOC) { - bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); - bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); - } - - /* either use alignedMalloc or os_malloc */ - void *ptr = nullptr; - if (atype == ALIGNED_MALLOC) - { - /* special handling for default block size */ - if (bytesAllocate == (2*PAGE_SIZE_2M)) - { - const size_t alignment = maxAlignment; - if (device) device->memoryMonitor(bytesAllocate+alignment,false); - ptr = alignedMalloc(bytesAllocate,alignment); - - /* give hint to transparently convert these pages to 2MB pages */ - const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1); - os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block - os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M); - os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block - - return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); - } - else - { - const size_t alignment = maxAlignment; - if (device) device->memoryMonitor(bytesAllocate+alignment,false); - ptr = alignedMalloc(bytesAllocate,alignment); - return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); - } - } - else if (atype == EMBREE_OS_MALLOC) - { - if (device) device->memoryMonitor(bytesAllocate,false); - bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); - return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); - } - else - assert(false); - - return NULL; - } - - Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false) - : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages) - { - assert((((size_t)&data[0]) & (maxAlignment-1)) == 0); - } - - static Block* remove_shared_blocks(Block* head) - { - Block** prev_next = &head; - for (Block* block = head; block; block = block->next) { - if (block->atype == SHARED) *prev_next = block->next; - else prev_next = &block->next; - } - return head; - } - - void clear_list(MemoryMonitorInterface* device) - { - Block* block = this; - while (block) { - Block* next = block->next; - block->clear_block(device); - block = next; - } - } - - void clear_block (MemoryMonitorInterface* device) - { - const size_t sizeof_Header = offsetof(Block,data[0]); - const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes(); - - if (atype == ALIGNED_MALLOC) { - alignedFree(this); - if (device) device->memoryMonitor(-sizeof_Alloced,true); - } - - else if (atype == EMBREE_OS_MALLOC) { - size_t sizeof_This = sizeof_Header+reserveEnd; - os_free(this,sizeof_This,huge_pages); - if (device) device->memoryMonitor(-sizeof_Alloced,true); - } - - else /* if (atype == SHARED) */ { - } - } - - void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial) - { - size_t bytes = bytes_in; - assert(align <= maxAlignment); - bytes = (bytes+(align-1)) & ~(align-1); - if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr; - const size_t i = cur.fetch_add(bytes); - if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr; - if (unlikely(i > reserveEnd)) return nullptr; - bytes_in = bytes = min(bytes,reserveEnd-i); - - if (i+bytes > allocEnd) { - if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true); - } - return &data[i]; - } - - void* ptr() { - return &data[cur]; - } - - void reset_block () - { - allocEnd = max(allocEnd,(size_t)cur); - cur = 0; - } - - size_t getBlockUsedBytes() const { - return min(size_t(cur),reserveEnd); - } - - size_t getBlockFreeBytes() const { - return getBlockAllocatedBytes() - getBlockUsedBytes(); - } - - size_t getBlockAllocatedBytes() const { - return min(max(allocEnd,size_t(cur)),reserveEnd); - } - - size_t getBlockWastedBytes() const { - const size_t sizeof_Header = offsetof(Block,data[0]); - return sizeof_Header + wasted; - } - - size_t getBlockReservedBytes() const { - return reserveEnd; - } - - bool hasType(AllocationType atype_i, bool huge_pages_i) const - { - if (atype_i == ANY_TYPE ) return true; - else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; - else return atype_i == atype; - } - - size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const { - size_t bytes = 0; - for (const Block* block = this; block; block = block->next) { - if (!block->hasType(atype,huge_pages)) continue; - bytes += block->getBlockUsedBytes(); - } - return bytes; - } - - size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const { - size_t bytes = 0; - for (const Block* block = this; block; block = block->next) { - if (!block->hasType(atype,huge_pages)) continue; - bytes += block->getBlockFreeBytes(); - } - return bytes; - } - - size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const { - size_t bytes = 0; - for (const Block* block = this; block; block = block->next) { - if (!block->hasType(atype,huge_pages)) continue; - bytes += block->getBlockWastedBytes(); - } - return bytes; - } - - size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const { - size_t bytes = 0; - for (const Block* block = this; block; block = block->next) { - if (!block->hasType(atype,huge_pages)) continue; - bytes += block->getBlockAllocatedBytes(); - } - return bytes; - } - - void print_list () - { - for (const Block* block = this; block; block = block->next) - block->print_block(); - } - - void print_block() const - { - if (atype == ALIGNED_MALLOC) std::cout << "A"; - else if (atype == EMBREE_OS_MALLOC) std::cout << "O"; - else if (atype == SHARED) std::cout << "S"; - if (huge_pages) std::cout << "H"; - size_t bytesUsed = getBlockUsedBytes(); - size_t bytesFree = getBlockFreeBytes(); - size_t bytesWasted = getBlockWastedBytes(); - std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] "; - } - - public: - std::atomic cur; //!< current location of the allocator - std::atomic allocEnd; //!< end of the allocated memory region - std::atomic reserveEnd; //!< end of the reserved memory region - Block* next; //!< pointer to next block in list - size_t wasted; //!< amount of memory wasted through block alignment - AllocationType atype; //!< allocation mode of the block - bool huge_pages; //!< whether the block uses huge pages - char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment - char data[1]; //!< here starts memory to use for allocations - }; - - private: - Device* device; - SpinLock mutex; - size_t slotMask; - std::atomic threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; - std::atomic usedBlocks; - std::atomic freeBlocks; - - std::atomic threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; -#if defined(__aarch64__) && defined(BUILD_IOS) - std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; -#else - SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; -#endif - - bool use_single_mode; - size_t defaultBlockSize; - size_t estimatedSize; - size_t growSize; - size_t maxGrowSize; - std::atomic log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove - std::atomic bytesUsed; - std::atomic bytesFree; - std::atomic bytesWasted; - static __thread ThreadLocal2* thread_local_allocator2; - static SpinLock s_thread_local_allocators_lock; - static std::vector> s_thread_local_allocators; -#if defined(__aarch64__) && defined(BUILD_IOS) - std::mutex thread_local_allocators_lock; -#else - SpinLock thread_local_allocators_lock; -#endif - std::vector thread_local_allocators; - AllocationType atype; - mvector primrefarray; //!< primrefarray used to allocate nodes - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/buffer.h b/thirdparty/embree-aarch64/kernels/common/buffer.h deleted file mode 100644 index 02d319c59d..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/buffer.h +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "device.h" - -namespace embree -{ - /*! Implements an API data buffer object. This class may or may not own the data. */ - class Buffer : public RefCount - { - public: - /*! Buffer construction */ - Buffer() - : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {} - - /*! Buffer construction */ - Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr) - : device(device), numBytes(numBytes_in) - { - device->refInc(); - - if (ptr_in) - { - shared = true; - ptr = (char*)ptr_in; - } - else - { - shared = false; - alloc(); - } - } - - /*! Buffer destruction */ - ~Buffer() { - free(); - device->refDec(); - } - - /*! this class is not copyable */ - private: - Buffer(const Buffer& other) DELETED; // do not implement - Buffer& operator =(const Buffer& other) DELETED; // do not implement - - public: - /* inits and allocates the buffer */ - void create(Device* device_in, size_t numBytes_in) - { - init(device_in, numBytes_in); - alloc(); - } - - /* inits the buffer */ - void init(Device* device_in, size_t numBytes_in) - { - free(); - device = device_in; - ptr = nullptr; - numBytes = numBytes_in; - shared = false; - } - - /*! sets shared buffer */ - void set(Device* device_in, void* ptr_in, size_t numBytes_in) - { - free(); - device = device_in; - ptr = (char*)ptr_in; - if (numBytes_in != (size_t)-1) - numBytes = numBytes_in; - shared = true; - } - - /*! allocated buffer */ - void alloc() - { - if (device) - device->memoryMonitor(this->bytes(), false); - size_t b = (this->bytes()+15) & ssize_t(-16); - ptr = (char*)alignedMalloc(b,16); - } - - /*! frees the buffer */ - void free() - { - if (shared) return; - alignedFree(ptr); - if (device) - device->memoryMonitor(-ssize_t(this->bytes()), true); - ptr = nullptr; - } - - /*! gets buffer pointer */ - void* data() - { - /* report error if buffer is not existing */ - if (!device) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified"); - - /* return buffer */ - return ptr; - } - - /*! returns pointer to first element */ - __forceinline char* getPtr() const { - return ptr; - } - - /*! returns the number of bytes of the buffer */ - __forceinline size_t bytes() const { - return numBytes; - } - - /*! returns true of the buffer is not empty */ - __forceinline operator bool() const { - return ptr; - } - - public: - Device* device; //!< device to report memory usage to - char* ptr; //!< pointer to buffer data - size_t numBytes; //!< number of bytes in the buffer - bool shared; //!< set if memory is shared with application - }; - - /*! An untyped contiguous range of a buffer. This class does not own the buffer content. */ - class RawBufferView - { - public: - /*! Buffer construction */ - RawBufferView() - : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {} - - public: - /*! sets the buffer view */ - void set(const Ref& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in) - { - if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes)) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds"); - - ptr_ofs = buffer_in->ptr + offset_in; - stride = stride_in; - num = num_in; - format = format_in; - modCounter++; - modified = true; - buffer = buffer_in; - } - - /*! returns pointer to the first element */ - __forceinline char* getPtr() const { - return ptr_ofs; - } - - /*! returns pointer to the i'th element */ - __forceinline char* getPtr(size_t i) const - { - assert(i otherModCounter; - } - - /*! mark buffer as modified or unmodified */ - __forceinline bool isLocalModified() const { - return modified; - } - - /*! clear local modified flag */ - __forceinline void clearLocalModified() { - modified = false; - } - - /*! returns true of the buffer is not empty */ - __forceinline operator bool() const { - return ptr_ofs; - } - - /*! checks padding to 16 byte check, fails hard */ - __forceinline void checkPadding16() const - { - if (ptr_ofs && num) - volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable? - } - - public: - char* ptr_ofs; //!< base pointer plus offset - size_t stride; //!< stride of the buffer in bytes - size_t num; //!< number of elements in the buffer - RTCFormat format; //!< format of the buffer - unsigned int modCounter; //!< version ID of this buffer - bool modified; //!< local modified data - int userData; //!< special data - Ref buffer; //!< reference to the parent buffer - }; - - /*! A typed contiguous range of a buffer. This class does not own the buffer content. */ - template - class BufferView : public RawBufferView - { - public: - typedef T value_type; - - /*! access to the ith element of the buffer */ - __forceinline T& operator [](size_t i) { assert(i - class BufferView : public RawBufferView - { - public: - typedef Vec3fa value_type; - - /*! access to the ith element of the buffer */ - __forceinline const Vec3fa operator [](size_t i) const - { - assert(i - struct ProgressMonitorClosure : BuildProgressMonitor - { - public: - ProgressMonitorClosure (const Closure& closure) : closure(closure) {} - void operator() (size_t dn) const { closure(dn); } - private: - const Closure closure; - }; - template __forceinline const ProgressMonitorClosure BuildProgressMonitorFromClosure(const Closure& closure) { - return ProgressMonitorClosure(closure); - } - - struct LineSegments; - struct TriangleMesh; - struct QuadMesh; - struct UserGeometry; - - class Scene; - - typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder); - typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); - typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); - typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); - -} diff --git a/thirdparty/embree-aarch64/kernels/common/context.h b/thirdparty/embree-aarch64/kernels/common/context.h deleted file mode 100644 index d0185a74f2..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/context.h +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "rtcore.h" -#include "point_query.h" - -namespace embree -{ - class Scene; - - struct IntersectContext - { - public: - __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context) - : scene(scene), user(user_context) {} - - __forceinline bool hasContextFilter() const { - return user->filter != nullptr; - } - - __forceinline bool isCoherent() const { - return embree::isCoherent(user->flags); - } - - __forceinline bool isIncoherent() const { - return embree::isIncoherent(user->flags); - } - - public: - Scene* scene; - RTCIntersectContext* user; - }; - - template - __forceinline Vec4vf enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf& ray_org, const Vec4vf& v) - { -#if RTC_MIN_WIDTH - const vfloat d = length(Vec3vf(v) - ray_org); - const vfloat r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); - return Vec4vf(v.x,v.y,v.z,r); -#else - return v; -#endif - } - - template - __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v) - { -#if RTC_MIN_WIDTH - const float d = length(Vec3fa(v) - ray_org); - const float r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); - return Vec3ff(v.x,v.y,v.z,r); -#else - return v; -#endif - } - - enum PointQueryType - { - POINT_QUERY_TYPE_UNDEFINED = 0, - POINT_QUERY_TYPE_SPHERE = 1, - POINT_QUERY_TYPE_AABB = 2, - }; - - typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args); - - struct PointQueryContext - { - public: - __forceinline PointQueryContext(Scene* scene, - PointQuery* query_ws, - PointQueryType query_type, - PointQueryFunction func, - RTCPointQueryContext* userContext, - float similarityScale, - void* userPtr) - : scene(scene) - , query_ws(query_ws) - , query_type(query_type) - , func(func) - , userContext(userContext) - , similarityScale(similarityScale) - , userPtr(userPtr) - , primID(RTC_INVALID_GEOMETRY_ID) - , geomID(RTC_INVALID_GEOMETRY_ID) - , query_radius(query_ws->radius) - { - if (query_type == POINT_QUERY_TYPE_AABB) { - assert(similarityScale == 0.f); - updateAABB(); - } - if (userContext->instStackSize == 0) { - assert(similarityScale == 1.f); - } - } - - public: - __forceinline void updateAABB() - { - if (likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) { - query_radius = Vec3fa(query_ws->radius); - return; - } - - const AffineSpace3fa m = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); - BBox3fa bbox(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius)); - bbox = xfmBounds(m, bbox); - query_radius = 0.5f * (bbox.upper - bbox.lower); - } - -public: - Scene* scene; - - PointQuery* query_ws; // the original world space point query - PointQueryType query_type; - PointQueryFunction func; - RTCPointQueryContext* userContext; - const float similarityScale; - - void* userPtr; - - unsigned int primID; - unsigned int geomID; - - Vec3fa query_radius; // used if the query is converted to an AABB internally - }; -} - diff --git a/thirdparty/embree-aarch64/kernels/common/default.h b/thirdparty/embree-aarch64/kernels/common/default.h deleted file mode 100644 index 709119163b..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/default.h +++ /dev/null @@ -1,273 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../../common/sys/platform.h" -#include "../../common/sys/sysinfo.h" -#include "../../common/sys/thread.h" -#include "../../common/sys/alloc.h" -#include "../../common/sys/ref.h" -#include "../../common/sys/intrinsics.h" -#include "../../common/sys/atomic.h" -#include "../../common/sys/mutex.h" -#include "../../common/sys/vector.h" -#include "../../common/sys/array.h" -#include "../../common/sys/string.h" -#include "../../common/sys/regression.h" -#include "../../common/sys/vector.h" - -#include "../../common/math/math.h" -#include "../../common/math/transcendental.h" -#include "../../common/simd/simd.h" -#include "../../common/math/vec2.h" -#include "../../common/math/vec3.h" -#include "../../common/math/vec4.h" -#include "../../common/math/vec2fa.h" -#include "../../common/math/vec3fa.h" -#include "../../common/math/interval.h" -#include "../../common/math/bbox.h" -#include "../../common/math/obbox.h" -#include "../../common/math/lbbox.h" -#include "../../common/math/linearspace2.h" -#include "../../common/math/linearspace3.h" -#include "../../common/math/affinespace.h" -#include "../../common/math/range.h" -#include "../../common/lexers/tokenstream.h" - -#include "../../common/tasking/taskscheduler.h" - -#define COMMA , - -#include "../config.h" -#include "isa.h" -#include "stat.h" -#include "profile.h" -#include "rtcore.h" -#include "vector.h" -#include "state.h" -#include "instance_stack.h" - -#include -#include -#include -#include -#include -#include - -#if !defined(_DEBUG) && defined(BUILD_IOS) -#undef assert -#define assert(_EXPR) -#endif - -namespace embree -{ - //////////////////////////////////////////////////////////////////////////////// - /// Vec2 shortcuts - //////////////////////////////////////////////////////////////////////////////// - - template using Vec2vf = Vec2>; - template using Vec2vd = Vec2>; - template using Vec2vr = Vec2>; - template using Vec2vi = Vec2>; - template using Vec2vl = Vec2>; - template using Vec2vb = Vec2>; - template using Vec2vbf = Vec2>; - template using Vec2vbd = Vec2>; - - typedef Vec2 Vec2vf4; - typedef Vec2 Vec2vd4; - typedef Vec2 Vec2vr4; - typedef Vec2 Vec2vi4; - typedef Vec2 Vec2vl4; - typedef Vec2 Vec2vb4; - typedef Vec2 Vec2vbf4; - typedef Vec2 Vec2vbd4; - - typedef Vec2 Vec2vf8; - typedef Vec2 Vec2vd8; - typedef Vec2 Vec2vr8; - typedef Vec2 Vec2vi8; - typedef Vec2 Vec2vl8; - typedef Vec2 Vec2vb8; - typedef Vec2 Vec2vbf8; - typedef Vec2 Vec2vbd8; - - typedef Vec2 Vec2vf16; - typedef Vec2 Vec2vd16; - typedef Vec2 Vec2vr16; - typedef Vec2 Vec2vi16; - typedef Vec2 Vec2vl16; - typedef Vec2 Vec2vb16; - typedef Vec2 Vec2vbf16; - typedef Vec2 Vec2vbd16; - - typedef Vec2 Vec2vfx; - typedef Vec2 Vec2vdx; - typedef Vec2 Vec2vrx; - typedef Vec2 Vec2vix; - typedef Vec2 Vec2vlx; - typedef Vec2 Vec2vbx; - typedef Vec2 Vec2vbfx; - typedef Vec2 Vec2vbdx; - - //////////////////////////////////////////////////////////////////////////////// - /// Vec3 shortcuts - //////////////////////////////////////////////////////////////////////////////// - - template using Vec3vf = Vec3>; - template using Vec3vd = Vec3>; - template using Vec3vr = Vec3>; - template using Vec3vi = Vec3>; - template using Vec3vl = Vec3>; - template using Vec3vb = Vec3>; - template using Vec3vbf = Vec3>; - template using Vec3vbd = Vec3>; - - typedef Vec3 Vec3vf4; - typedef Vec3 Vec3vd4; - typedef Vec3 Vec3vr4; - typedef Vec3 Vec3vi4; - typedef Vec3 Vec3vl4; - typedef Vec3 Vec3vb4; - typedef Vec3 Vec3vbf4; - typedef Vec3 Vec3vbd4; - - typedef Vec3 Vec3vf8; - typedef Vec3 Vec3vd8; - typedef Vec3 Vec3vr8; - typedef Vec3 Vec3vi8; - typedef Vec3 Vec3vl8; - typedef Vec3 Vec3vb8; - typedef Vec3 Vec3vbf8; - typedef Vec3 Vec3vbd8; - - typedef Vec3 Vec3vf16; - typedef Vec3 Vec3vd16; - typedef Vec3 Vec3vr16; - typedef Vec3 Vec3vi16; - typedef Vec3 Vec3vl16; - typedef Vec3 Vec3vb16; - typedef Vec3 Vec3vbf16; - typedef Vec3 Vec3vbd16; - - typedef Vec3 Vec3vfx; - typedef Vec3 Vec3vdx; - typedef Vec3 Vec3vrx; - typedef Vec3 Vec3vix; - typedef Vec3 Vec3vlx; - typedef Vec3 Vec3vbx; - typedef Vec3 Vec3vbfx; - typedef Vec3 Vec3vbdx; - - //////////////////////////////////////////////////////////////////////////////// - /// Vec4 shortcuts - //////////////////////////////////////////////////////////////////////////////// - - template using Vec4vf = Vec4>; - template using Vec4vd = Vec4>; - template using Vec4vr = Vec4>; - template using Vec4vi = Vec4>; - template using Vec4vl = Vec4>; - template using Vec4vb = Vec4>; - template using Vec4vbf = Vec4>; - template using Vec4vbd = Vec4>; - - typedef Vec4 Vec4vf4; - typedef Vec4 Vec4vd4; - typedef Vec4 Vec4vr4; - typedef Vec4 Vec4vi4; - typedef Vec4 Vec4vl4; - typedef Vec4 Vec4vb4; - typedef Vec4 Vec4vbf4; - typedef Vec4 Vec4vbd4; - - typedef Vec4 Vec4vf8; - typedef Vec4 Vec4vd8; - typedef Vec4 Vec4vr8; - typedef Vec4 Vec4vi8; - typedef Vec4 Vec4vl8; - typedef Vec4 Vec4vb8; - typedef Vec4 Vec4vbf8; - typedef Vec4 Vec4vbd8; - - typedef Vec4 Vec4vf16; - typedef Vec4 Vec4vd16; - typedef Vec4 Vec4vr16; - typedef Vec4 Vec4vi16; - typedef Vec4 Vec4vl16; - typedef Vec4 Vec4vb16; - typedef Vec4 Vec4vbf16; - typedef Vec4 Vec4vbd16; - - typedef Vec4 Vec4vfx; - typedef Vec4 Vec4vdx; - typedef Vec4 Vec4vrx; - typedef Vec4 Vec4vix; - typedef Vec4 Vec4vlx; - typedef Vec4 Vec4vbx; - typedef Vec4 Vec4vbfx; - typedef Vec4 Vec4vbdx; - - //////////////////////////////////////////////////////////////////////////////// - /// Other shortcuts - //////////////////////////////////////////////////////////////////////////////// - - template using BBox3vf = BBox>; - typedef BBox BBox3vf4; - typedef BBox BBox3vf8; - typedef BBox BBox3vf16; - - /* calculate time segment itime and fractional time ftime */ - __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime) - { - const float timeScaled = time * numTimeSegments; - const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); - ftime = timeScaled - itimef; - return int(itimef); - } - - __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime) - { - const float timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; - const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); - ftime = timeScaled - itimef; - return int(itimef); - } - - template - __forceinline vint getTimeSegment(const vfloat& time, const vfloat& numTimeSegments, vfloat& ftime) - { - const vfloat timeScaled = time * numTimeSegments; - const vfloat itimef = clamp(floor(timeScaled), vfloat(zero), numTimeSegments-1.0f); - ftime = timeScaled - itimef; - return vint(itimef); - } - - template - __forceinline vint getTimeSegment(const vfloat& time, const vfloat& start_time, const vfloat& end_time, const vfloat& numTimeSegments, vfloat& ftime) - { - const vfloat timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; - const vfloat itimef = clamp(floor(timeScaled), vfloat(zero), numTimeSegments-1.0f); - ftime = timeScaled - itimef; - return vint(itimef); - } - - /* calculate overlapping time segment range */ - __forceinline range getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments) - { - const float round_up = 1.0f+2.0f*float(ulp); // corrects inaccuracies to precisely match time step - const float round_down = 1.0f-2.0f*float(ulp); - const int itime_lower = (int)max(floor(round_up *time_range.lower*numTimeSegments), 0.0f); - const int itime_upper = (int)min(ceil (round_down*time_range.upper*numTimeSegments), numTimeSegments); - return make_range(itime_lower, itime_upper); - } - - /* calculate overlapping time segment range */ - __forceinline range getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments) - { - const float lower = (range.lower-time_range.lower)/time_range.size(); - const float upper = (range.upper-time_range.lower)/time_range.size(); - return getTimeSegmentRange(BBox1f(lower,upper),numTimeSegments); - } -} diff --git a/thirdparty/embree-aarch64/kernels/common/device.cpp b/thirdparty/embree-aarch64/kernels/common/device.cpp deleted file mode 100644 index 16ec11b892..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/device.cpp +++ /dev/null @@ -1,567 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "device.h" -#include "../hash.h" -#include "scene_triangle_mesh.h" -#include "scene_user_geometry.h" -#include "scene_instance.h" -#include "scene_curves.h" -#include "scene_subdiv_mesh.h" - -#include "../subdiv/tessellation_cache.h" - -#include "acceln.h" -#include "geometry.h" - -#include "../geometry/cylinder.h" - -#include "../bvh/bvh4_factory.h" -#include "../bvh/bvh8_factory.h" - -#include "../../common/tasking/taskscheduler.h" -#include "../../common/sys/alloc.h" - -namespace embree -{ - /*! some global variables that can be set via rtcSetParameter1i for debugging purposes */ - ssize_t Device::debug_int0 = 0; - ssize_t Device::debug_int1 = 0; - ssize_t Device::debug_int2 = 0; - ssize_t Device::debug_int3 = 0; - - DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs); - - static MutexSys g_mutex; - static std::map g_cache_size_map; - static std::map g_num_threads_map; - - Device::Device (const char* cfg) - { - /* check that CPU supports lowest ISA */ - if (!hasISA(ISA)) { - throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR); - } - - /* set default frequency level for detected CPU */ - switch (getCPUModel()) { - case CPU::UNKNOWN: frequency_level = FREQUENCY_SIMD256; break; - case CPU::XEON_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; - case CPU::CORE_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; - case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break; - case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break; - case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break; - case CPU::CORE_KABY_LAKE: frequency_level = FREQUENCY_SIMD128; break; - case CPU::XEON_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; - case CPU::CORE_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; - case CPU::XEON_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; - case CPU::CORE_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; - case CPU::XEON_HASWELL: frequency_level = FREQUENCY_SIMD256; break; - case CPU::CORE_HASWELL: frequency_level = FREQUENCY_SIMD256; break; - case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; - case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; - case CPU::SANDY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; - case CPU::NEHALEM: frequency_level = FREQUENCY_SIMD128; break; - case CPU::CORE2: frequency_level = FREQUENCY_SIMD128; break; - case CPU::CORE1: frequency_level = FREQUENCY_SIMD128; break; - } - - /* initialize global state */ -#if defined(EMBREE_CONFIG) - State::parseString(EMBREE_CONFIG); -#endif - State::parseString(cfg); - if (!ignore_config_files && FileName::executableFolder() != FileName("")) - State::parseFile(FileName::executableFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR))); - if (!ignore_config_files && FileName::homeFolder() != FileName("")) - State::parseFile(FileName::homeFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR))); - State::verify(); - - /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */ - if (!checkISASupport()) { - throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA"); - } - - /*! do some internal tests */ - assert(isa::Cylinder::verify()); - - /*! enable huge page support if desired */ -#if defined(__WIN32__) - if (State::enable_selockmemoryprivilege) - State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3)); -#endif - State::hugepages_success &= os_init(State::hugepages,State::verbosity(3)); - - /*! set tessellation cache size */ - setCacheSize( State::tessellation_cache_size ); - - /*! enable some floating point exceptions to catch bugs */ - if (State::float_exceptions) - { - int exceptions = _MM_MASK_MASK; - //exceptions &= ~_MM_MASK_INVALID; - exceptions &= ~_MM_MASK_DENORM; - exceptions &= ~_MM_MASK_DIV_ZERO; - //exceptions &= ~_MM_MASK_OVERFLOW; - //exceptions &= ~_MM_MASK_UNDERFLOW; - //exceptions &= ~_MM_MASK_INEXACT; - _MM_SET_EXCEPTION_MASK(exceptions); - } - - /* print info header */ - if (State::verbosity(1)) - print(); - if (State::verbosity(2)) - State::print(); - - /* register all algorithms */ - bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features)); - -#if defined(EMBREE_TARGET_SIMD8) - bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features)); -#endif - - /* setup tasking system */ - initTaskingSystem(numThreads); - - /* ray stream SOA to AOS conversion */ -#if defined(EMBREE_RAY_PACKETS) - RayStreamFilterFuncsType rayStreamFilterFuncs; - SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(enabled_cpu_features,rayStreamFilterFuncs); - rayStreamFilters = rayStreamFilterFuncs(); -#endif - } - - Device::~Device () - { - setCacheSize(0); - exitTaskingSystem(); - } - - std::string getEnabledTargets() - { - std::string v; -#if defined(EMBREE_TARGET_SSE2) - v += "SSE2 "; -#endif -#if defined(EMBREE_TARGET_SSE42) - v += "SSE4.2 "; -#endif -#if defined(EMBREE_TARGET_AVX) - v += "AVX "; -#endif -#if defined(EMBREE_TARGET_AVX2) - v += "AVX2 "; -#endif -#if defined(EMBREE_TARGET_AVX512KNL) - v += "AVX512KNL "; -#endif -#if defined(EMBREE_TARGET_AVX512SKX) - v += "AVX512SKX "; -#endif - return v; - } - - std::string getEmbreeFeatures() - { - std::string v; -#if defined(EMBREE_RAY_MASK) - v += "raymasks "; -#endif -#if defined (EMBREE_BACKFACE_CULLING) - v += "backfaceculling "; -#endif -#if defined (EMBREE_BACKFACE_CULLING_CURVES) - v += "backfacecullingcurves "; -#endif -#if defined(EMBREE_FILTER_FUNCTION) - v += "intersection_filter "; -#endif -#if defined (EMBREE_COMPACT_POLYS) - v += "compact_polys "; -#endif - return v; - } - - void Device::print() - { - const int cpu_features = getCPUFeatures(); - std::cout << std::endl; - std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl; - std::cout << " Compiler : " << getCompilerName() << std::endl; - std::cout << " Build : "; -#if defined(DEBUG) - std::cout << "Debug " << std::endl; -#else - std::cout << "Release " << std::endl; -#endif - std::cout << " Platform : " << getPlatformName() << std::endl; - std::cout << " CPU : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl; - std::cout << " Threads : " << getNumberOfLogicalThreads() << std::endl; - std::cout << " ISA : " << stringOfCPUFeatures(cpu_features) << std::endl; - std::cout << " Targets : " << supportedTargetList(cpu_features) << std::endl; - const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON; - const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON; - std::cout << " MXCSR : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl; - std::cout << " Config" << std::endl; - std::cout << " Threads : " << (numThreads ? toString(numThreads) : std::string("default")) << std::endl; - std::cout << " ISA : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl; - std::cout << " Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl; - std::cout << " " << getEnabledTargets() << " (compile time enabled)" << std::endl; - std::cout << " Features: " << getEmbreeFeatures() << std::endl; - std::cout << " Tasking : "; -#if defined(TASKING_TBB) - std::cout << "TBB" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " "; - #if TBB_INTERFACE_VERSION >= 12002 - std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " "; - #else - std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " "; - #endif -#endif -#if defined(TASKING_INTERNAL) - std::cout << "internal_tasking_system "; -#endif -#if defined(TASKING_GCD) && defined(BUILD_IOS) - std::cout << "GCD tasking system "; -#endif -#if defined(TASKING_PPL) - std::cout << "PPL "; -#endif - std::cout << std::endl; - - /* check of FTZ and DAZ flags are set in CSR */ - if (!hasFTZ || !hasDAZ) - { -#if !defined(_DEBUG) - if (State::verbosity(1)) -#endif - { - std::cout << std::endl; - std::cout << "================================================================================" << std::endl; - std::cout << " WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled " << std::endl - << " in the MXCSR control and status register. This can have a severe " << std::endl - << " performance impact. Please enable these modes for each application " << std::endl - << " thread the following way:" << std::endl - << std::endl - << " #include \"xmmintrin.h\"" << std::endl - << " #include \"pmmintrin.h\"" << std::endl - << std::endl - << " _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl - << " _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl; - std::cout << "================================================================================" << std::endl; - std::cout << std::endl; - } - } - std::cout << std::endl; - } - - void Device::setDeviceErrorCode(RTCError error) - { - RTCError* stored_error = errorHandler.error(); - if (*stored_error == RTC_ERROR_NONE) - *stored_error = error; - } - - RTCError Device::getDeviceErrorCode() - { - RTCError* stored_error = errorHandler.error(); - RTCError error = *stored_error; - *stored_error = RTC_ERROR_NONE; - return error; - } - - void Device::setThreadErrorCode(RTCError error) - { - RTCError* stored_error = g_errorHandler.error(); - if (*stored_error == RTC_ERROR_NONE) - *stored_error = error; - } - - RTCError Device::getThreadErrorCode() - { - RTCError* stored_error = g_errorHandler.error(); - RTCError error = *stored_error; - *stored_error = RTC_ERROR_NONE; - return error; - } - - void Device::process_error(Device* device, RTCError error, const char* str) - { - /* store global error code when device construction failed */ - if (!device) - return setThreadErrorCode(error); - - /* print error when in verbose mode */ - if (device->verbosity(1)) - { - switch (error) { - case RTC_ERROR_NONE : std::cerr << "Embree: No error"; break; - case RTC_ERROR_UNKNOWN : std::cerr << "Embree: Unknown error"; break; - case RTC_ERROR_INVALID_ARGUMENT : std::cerr << "Embree: Invalid argument"; break; - case RTC_ERROR_INVALID_OPERATION: std::cerr << "Embree: Invalid operation"; break; - case RTC_ERROR_OUT_OF_MEMORY : std::cerr << "Embree: Out of memory"; break; - case RTC_ERROR_UNSUPPORTED_CPU : std::cerr << "Embree: Unsupported CPU"; break; - default : std::cerr << "Embree: Invalid error code"; break; - }; - if (str) std::cerr << ", (" << str << ")"; - std::cerr << std::endl; - } - - /* call user specified error callback */ - if (device->error_function) - device->error_function(device->error_function_userptr,error,str); - - /* record error code */ - device->setDeviceErrorCode(error); - } - - void Device::memoryMonitor(ssize_t bytes, bool post) - { - if (State::memory_monitor_function && bytes != 0) { - if (!State::memory_monitor_function(State::memory_monitor_userptr,bytes,post)) { - if (bytes > 0) { // only throw exception when we allocate memory to never throw inside a destructor - throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination"); - } - } - } - } - - size_t getMaxNumThreads() - { - size_t maxNumThreads = 0; - for (std::map::iterator i=g_num_threads_map.begin(); i != g_num_threads_map.end(); i++) - maxNumThreads = max(maxNumThreads, (*i).second); - if (maxNumThreads == 0) - maxNumThreads = std::numeric_limits::max(); - return maxNumThreads; - } - - size_t getMaxCacheSize() - { - size_t maxCacheSize = 0; - for (std::map::iterator i=g_cache_size_map.begin(); i!= g_cache_size_map.end(); i++) - maxCacheSize = max(maxCacheSize, (*i).second); - return maxCacheSize; - } - - void Device::setCacheSize(size_t bytes) - { -#if defined(EMBREE_GEOMETRY_SUBDIVISION) - Lock lock(g_mutex); - if (bytes == 0) g_cache_size_map.erase(this); - else g_cache_size_map[this] = bytes; - - size_t maxCacheSize = getMaxCacheSize(); - resizeTessellationCache(maxCacheSize); -#endif - } - - void Device::initTaskingSystem(size_t numThreads) - { - Lock lock(g_mutex); - if (numThreads == 0) - g_num_threads_map[this] = std::numeric_limits::max(); - else - g_num_threads_map[this] = numThreads; - - /* create task scheduler */ - size_t maxNumThreads = getMaxNumThreads(); - TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); -#if USE_TASK_ARENA - const size_t nThreads = min(maxNumThreads,TaskScheduler::threadCount()); - const size_t uThreads = min(max(numUserThreads,(size_t)1),nThreads); - arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads)); -#endif - } - - void Device::exitTaskingSystem() - { - Lock lock(g_mutex); - g_num_threads_map.erase(this); - - /* terminate tasking system */ - if (g_num_threads_map.size() == 0) { - TaskScheduler::destroy(); - } - /* or configure new number of threads */ - else { - size_t maxNumThreads = getMaxNumThreads(); - TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); - } -#if USE_TASK_ARENA - arena.reset(); -#endif - } - - void Device::setProperty(const RTCDeviceProperty prop, ssize_t val) - { - /* hidden internal properties */ - switch ((size_t)prop) - { - case 1000000: debug_int0 = val; return; - case 1000001: debug_int1 = val; return; - case 1000002: debug_int2 = val; return; - case 1000003: debug_int3 = val; return; - } - - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property"); - } - - ssize_t Device::getProperty(const RTCDeviceProperty prop) - { - size_t iprop = (size_t)prop; - - /* get name of internal regression test */ - if (iprop >= 2000000 && iprop < 3000000) - { - RegressionTest* test = getRegressionTest(iprop-2000000); - if (test) return (ssize_t) test->name.c_str(); - else return 0; - } - - /* run internal regression test */ - if (iprop >= 3000000 && iprop < 4000000) - { - RegressionTest* test = getRegressionTest(iprop-3000000); - if (test) return test->run(); - else return 0; - } - - /* documented properties */ - switch (prop) - { - case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR; - case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR; - case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH; - case RTC_DEVICE_PROPERTY_VERSION : return RTC_VERSION; - -#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS) - case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return hasISA(SSE2); -#else - case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS) - case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return hasISA(AVX); -#else - case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS) - case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512KNL) | hasISA(AVX512SKX); -#else - case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_RAY_PACKETS) - case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_RAY_MASK) - case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_BACKFACE_CULLING) - case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1; -#else - case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0; -#endif - -#if defined(EMBREE_BACKFACE_CULLING_CURVES) - case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1; -#else - case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0; -#endif - -#if defined(EMBREE_COMPACT_POLYS) - case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1; -#else - case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0; -#endif - -#if defined(EMBREE_FILTER_FUNCTION) - case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_IGNORE_INVALID_RAYS) - case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1; -#else - case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0; -#endif - -#if defined(TASKING_INTERNAL) - case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0; -#endif - -#if defined(TASKING_TBB) - case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1; -#endif - -#if defined(TASKING_PPL) - case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2; -#endif - -#if defined(TASKING_GCD) && defined(BUILD_IOS) - case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3; -#endif - -#if defined(EMBREE_GEOMETRY_TRIANGLE) - case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_GEOMETRY_QUAD) - case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_GEOMETRY_CURVE) - case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_GEOMETRY_SUBDIVISION) - case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_GEOMETRY_USER) - case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0; -#endif - -#if defined(EMBREE_GEOMETRY_POINT) - case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0; -#endif - -#if defined(TASKING_PPL) - case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; -#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) - case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; -#else - case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1; -#endif - -#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION - case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1; -#else - case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0; -#endif - - default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break; - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/common/device.h b/thirdparty/embree-aarch64/kernels/common/device.h deleted file mode 100644 index e9a81bb109..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/device.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "state.h" -#include "accel.h" - -namespace embree -{ - class BVH4Factory; - class BVH8Factory; - - class Device : public State, public MemoryMonitorInterface - { - ALIGNED_CLASS_(16); - - public: - - /*! Device construction */ - Device (const char* cfg); - - /*! Device destruction */ - virtual ~Device (); - - /*! prints info about the device */ - void print(); - - /*! sets the error code */ - void setDeviceErrorCode(RTCError error); - - /*! returns and clears the error code */ - RTCError getDeviceErrorCode(); - - /*! sets the error code */ - static void setThreadErrorCode(RTCError error); - - /*! returns and clears the error code */ - static RTCError getThreadErrorCode(); - - /*! processes error codes, do not call directly */ - static void process_error(Device* device, RTCError error, const char* str); - - /*! invokes the memory monitor callback */ - void memoryMonitor(ssize_t bytes, bool post); - - /*! sets the size of the software cache. */ - void setCacheSize(size_t bytes); - - /*! sets a property */ - void setProperty(const RTCDeviceProperty prop, ssize_t val); - - /*! gets a property */ - ssize_t getProperty(const RTCDeviceProperty prop); - - private: - - /*! initializes the tasking system */ - void initTaskingSystem(size_t numThreads); - - /*! shuts down the tasking system */ - void exitTaskingSystem(); - - /*! some variables that can be set via rtcSetParameter1i for debugging purposes */ - public: - static ssize_t debug_int0; - static ssize_t debug_int1; - static ssize_t debug_int2; - static ssize_t debug_int3; - - public: - std::unique_ptr bvh4_factory; -#if defined(EMBREE_TARGET_SIMD8) - std::unique_ptr bvh8_factory; -#endif - -#if USE_TASK_ARENA - std::unique_ptr arena; -#endif - - /* ray streams filter */ - RayStreamFilterFuncs rayStreamFilters; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.cpp b/thirdparty/embree-aarch64/kernels/common/geometry.cpp deleted file mode 100644 index b3aa8e3396..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/geometry.cpp +++ /dev/null @@ -1,259 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "geometry.h" -#include "scene.h" - -namespace embree -{ - const char* Geometry::gtype_names[Geometry::GTY_END] = - { - "flat_linear_curve", - "round_linear_curve", - "oriented_linear_curve", - "", - "flat_bezier_curve", - "round_bezier_curve", - "oriented_bezier_curve", - "", - "flat_bspline_curve", - "round_bspline_curve", - "oriented_bspline_curve", - "", - "flat_hermite_curve", - "round_hermite_curve", - "oriented_hermite_curve", - "", - "flat_catmull_rom_curve", - "round_catmull_rom_curve", - "oriented_catmull_rom_curve", - "", - "triangles", - "quads", - "grid", - "subdivs", - "", - "sphere", - "disc", - "oriented_disc", - "", - "usergeom", - "instance_cheap", - "instance_expensive", - }; - - Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) - : device(device), userPtr(nullptr), - numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f), - mask(-1), - gtype(gtype), - gsubtype(GTY_SUBTYPE_DEFAULT), - quality(RTC_BUILD_QUALITY_MEDIUM), - state((unsigned)State::MODIFIED), - enabled(true), - intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr) - { - device->refInc(); - } - - Geometry::~Geometry() - { - device->refDec(); - } - - void Geometry::setNumPrimitives(unsigned int numPrimitives_in) - { - if (numPrimitives_in == numPrimitives) return; - - numPrimitives = numPrimitives_in; - - Geometry::update(); - } - - void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in) - { - if (numTimeSteps_in == numTimeSteps) { - return; - } - - numTimeSteps = numTimeSteps_in; - fnumTimeSegments = float(numTimeSteps_in-1); - - Geometry::update(); - } - - void Geometry::setTimeRange (const BBox1f range) - { - time_range = range; - Geometry::update(); - } - - void Geometry::update() - { - ++modCounter_; // FIXME: required? - state = (unsigned)State::MODIFIED; - } - - void Geometry::commit() - { - ++modCounter_; - state = (unsigned)State::COMMITTED; - } - - void Geometry::preCommit() - { - if (State::MODIFIED == (State)state) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed"); - } - - void Geometry::postCommit() - { - } - - void Geometry::enable () - { - if (isEnabled()) - return; - - enabled = true; - ++modCounter_; - } - - void Geometry::disable () - { - if (isDisabled()) - return; - - enabled = false; - ++modCounter_; - } - - void Geometry::setUserData (void* ptr) - { - userPtr = ptr; - } - - void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter) - { - if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); - - intersectionFilterN = filter; - } - - void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter) - { - if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); - - occlusionFilterN = filter; - } - - void Geometry::setPointQueryFunction (RTCPointQueryFunction func) - { - pointQueryFunc = func; - } - - void Geometry::interpolateN(const RTCInterpolateNArguments* const args) - { - const void* valid_i = args->valid; - const unsigned* primIDs = args->primIDs; - const float* u = args->u; - const float* v = args->v; - unsigned int N = args->N; - RTCBufferType bufferType = args->bufferType; - unsigned int bufferSlot = args->bufferSlot; - float* P = args->P; - float* dPdu = args->dPdu; - float* dPdv = args->dPdv; - float* ddPdudu = args->ddPdudu; - float* ddPdvdv = args->ddPdvdv; - float* ddPdudv = args->ddPdudv; - unsigned int valueCount = args->valueCount; - - if (valueCount > 256) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex"); - const int* valid = (const int*) valid_i; - - __aligned(64) float P_tmp[256]; - __aligned(64) float dPdu_tmp[256]; - __aligned(64) float dPdv_tmp[256]; - __aligned(64) float ddPdudu_tmp[256]; - __aligned(64) float ddPdvdv_tmp[256]; - __aligned(64) float ddPdudv_tmp[256]; - - float* Pt = P ? P_tmp : nullptr; - float* dPdut = nullptr, *dPdvt = nullptr; - if (dPdu) { dPdut = dPdu_tmp; dPdvt = dPdv_tmp; } - float* ddPdudut = nullptr, *ddPdvdvt = nullptr, *ddPdudvt = nullptr; - if (ddPdudu) { ddPdudut = ddPdudu_tmp; ddPdvdvt = ddPdvdv_tmp; ddPdudvt = ddPdudv_tmp; } - - for (unsigned int i=0; iprimID < size()); - - RTCPointQueryFunctionArguments args; - args.query = (RTCPointQuery*)context->query_ws; - args.userPtr = context->userPtr; - args.primID = context->primID; - args.geomID = context->geomID; - args.context = context->userContext; - args.similarityScale = context->similarityScale; - - bool update = false; - if(context->func) update |= context->func(&args); - if(pointQueryFunc) update |= pointQueryFunc(&args); - - if (update && context->userContext->instStackSize > 0) - { - // update point query - if (context->query_type == POINT_QUERY_TYPE_AABB) { - context->updateAABB(); - } else { - assert(context->similarityScale > 0.f); - query->radius = context->query_ws->radius * context->similarityScale; - } - } - return update; - } -} diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.h b/thirdparty/embree-aarch64/kernels/common/geometry.h deleted file mode 100644 index 953974bfd2..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/geometry.h +++ /dev/null @@ -1,582 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "device.h" -#include "buffer.h" -#include "../common/point_query.h" -#include "../builders/priminfo.h" - -namespace embree -{ - class Scene; - class Geometry; - - struct GeometryCounts - { - __forceinline GeometryCounts() - : numFilterFunctions(0), - numTriangles(0), numMBTriangles(0), - numQuads(0), numMBQuads(0), - numBezierCurves(0), numMBBezierCurves(0), - numLineSegments(0), numMBLineSegments(0), - numSubdivPatches(0), numMBSubdivPatches(0), - numUserGeometries(0), numMBUserGeometries(0), - numInstancesCheap(0), numMBInstancesCheap(0), - numInstancesExpensive(0), numMBInstancesExpensive(0), - numGrids(0), numMBGrids(0), - numPoints(0), numMBPoints(0) {} - - __forceinline size_t size() const { - return numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints - + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints; - } - - __forceinline unsigned int enabledGeometryTypesMask() const - { - unsigned int mask = 0; - if (numTriangles) mask |= 1 << 0; - if (numQuads) mask |= 1 << 1; - if (numBezierCurves+numLineSegments) mask |= 1 << 2; - if (numSubdivPatches) mask |= 1 << 3; - if (numUserGeometries) mask |= 1 << 4; - if (numInstancesCheap) mask |= 1 << 5; - if (numInstancesExpensive) mask |= 1 << 6; - if (numGrids) mask |= 1 << 7; - if (numPoints) mask |= 1 << 8; - - unsigned int maskMB = 0; - if (numMBTriangles) maskMB |= 1 << 0; - if (numMBQuads) maskMB |= 1 << 1; - if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2; - if (numMBSubdivPatches) maskMB |= 1 << 3; - if (numMBUserGeometries) maskMB |= 1 << 4; - if (numMBInstancesCheap) maskMB |= 1 << 5; - if (numMBInstancesExpensive) maskMB |= 1 << 6; - if (numMBGrids) maskMB |= 1 << 7; - if (numMBPoints) maskMB |= 1 << 8; - - return (mask<<8) + maskMB; - } - - __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const - { - GeometryCounts ret; - ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions; - ret.numTriangles = numTriangles + rhs.numTriangles; - ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles; - ret.numQuads = numQuads + rhs.numQuads; - ret.numMBQuads = numMBQuads + rhs.numMBQuads; - ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves; - ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves; - ret.numLineSegments = numLineSegments + rhs.numLineSegments; - ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments; - ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches; - ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches; - ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries; - ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries; - ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap; - ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap; - ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive; - ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive; - ret.numGrids = numGrids + rhs.numGrids; - ret.numMBGrids = numMBGrids + rhs.numMBGrids; - ret.numPoints = numPoints + rhs.numPoints; - ret.numMBPoints = numMBPoints + rhs.numMBPoints; - - return ret; - } - - size_t numFilterFunctions; //!< number of geometries with filter functions enabled - size_t numTriangles; //!< number of enabled triangles - size_t numMBTriangles; //!< number of enabled motion blured triangles - size_t numQuads; //!< number of enabled quads - size_t numMBQuads; //!< number of enabled motion blurred quads - size_t numBezierCurves; //!< number of enabled curves - size_t numMBBezierCurves; //!< number of enabled motion blurred curves - size_t numLineSegments; //!< number of enabled line segments - size_t numMBLineSegments; //!< number of enabled line motion blurred segments - size_t numSubdivPatches; //!< number of enabled subdivision patches - size_t numMBSubdivPatches; //!< number of enabled motion blured subdivision patches - size_t numUserGeometries; //!< number of enabled user geometries - size_t numMBUserGeometries; //!< number of enabled motion blurred user geometries - size_t numInstancesCheap; //!< number of enabled cheap instances - size_t numMBInstancesCheap; //!< number of enabled motion blurred cheap instances - size_t numInstancesExpensive; //!< number of enabled expensive instances - size_t numMBInstancesExpensive; //!< number of enabled motion blurred expensive instances - size_t numGrids; //!< number of enabled grid geometries - size_t numMBGrids; //!< number of enabled motion blurred grid geometries - size_t numPoints; //!< number of enabled points - size_t numMBPoints; //!< number of enabled motion blurred points - }; - - /*! Base class all geometries are derived from */ - class Geometry : public RefCount - { - friend class Scene; - public: - - /*! type of geometry */ - enum GType - { - GTY_FLAT_LINEAR_CURVE = 0, - GTY_ROUND_LINEAR_CURVE = 1, - GTY_ORIENTED_LINEAR_CURVE = 2, - GTY_CONE_LINEAR_CURVE = 3, - - GTY_FLAT_BEZIER_CURVE = 4, - GTY_ROUND_BEZIER_CURVE = 5, - GTY_ORIENTED_BEZIER_CURVE = 6, - - GTY_FLAT_BSPLINE_CURVE = 8, - GTY_ROUND_BSPLINE_CURVE = 9, - GTY_ORIENTED_BSPLINE_CURVE = 10, - - GTY_FLAT_HERMITE_CURVE = 12, - GTY_ROUND_HERMITE_CURVE = 13, - GTY_ORIENTED_HERMITE_CURVE = 14, - - GTY_FLAT_CATMULL_ROM_CURVE = 16, - GTY_ROUND_CATMULL_ROM_CURVE = 17, - GTY_ORIENTED_CATMULL_ROM_CURVE = 18, - - GTY_TRIANGLE_MESH = 20, - GTY_QUAD_MESH = 21, - GTY_GRID_MESH = 22, - GTY_SUBDIV_MESH = 23, - - GTY_SPHERE_POINT = 25, - GTY_DISC_POINT = 26, - GTY_ORIENTED_DISC_POINT = 27, - - GTY_USER_GEOMETRY = 29, - GTY_INSTANCE_CHEAP = 30, - GTY_INSTANCE_EXPENSIVE = 31, - GTY_END = 32, - - GTY_BASIS_LINEAR = 0, - GTY_BASIS_BEZIER = 4, - GTY_BASIS_BSPLINE = 8, - GTY_BASIS_HERMITE = 12, - GTY_BASIS_CATMULL_ROM = 16, - GTY_BASIS_MASK = 28, - - GTY_SUBTYPE_FLAT_CURVE = 0, - GTY_SUBTYPE_ROUND_CURVE = 1, - GTY_SUBTYPE_ORIENTED_CURVE = 2, - GTY_SUBTYPE_MASK = 3, - }; - - enum GSubType - { - GTY_SUBTYPE_DEFAULT= 0, - GTY_SUBTYPE_INSTANCE_LINEAR = 0, - GTY_SUBTYPE_INSTANCE_QUATERNION = 1 - }; - - enum GTypeMask - { - MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE, - MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE, - MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE, - MTY_ORIENTED_LINEAR_CURVE = 1ul << GTY_ORIENTED_LINEAR_CURVE, - - MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE, - MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE, - MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE, - - MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE, - MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE, - MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE, - - MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE, - MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE, - MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE, - - MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE, - MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE, - MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE, - - MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE, - - MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE | - MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE | - MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE | - MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE, - - MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT, - MTY_DISC_POINT = 1ul << GTY_DISC_POINT, - MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT, - - MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT, - - MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS, - - MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH, - MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH, - MTY_GRID_MESH = 1ul << GTY_GRID_MESH, - MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH, - MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY, - - MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP, - MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE, - MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE - }; - - static const char* gtype_names[GTY_END]; - - enum class State : unsigned { - MODIFIED = 0, - COMMITTED = 1, - }; - - public: - - /*! Geometry constructor */ - Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps); - - /*! Geometry destructor */ - virtual ~Geometry(); - - public: - - /*! tests if geometry is enabled */ - __forceinline bool isEnabled() const { return enabled; } - - /*! tests if geometry is disabled */ - __forceinline bool isDisabled() const { return !isEnabled(); } - - /*! tests if that geometry has some filter function set */ - __forceinline bool hasFilterFunctions () const { - return (intersectionFilterN != nullptr) || (occlusionFilterN != nullptr); - } - - /*! returns geometry type */ - __forceinline GType getType() const { return gtype; } - - /*! returns curve type */ - __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); } - - /*! returns curve basis */ - __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); } - - /*! returns geometry type mask */ - __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1 << gtype); } - - /*! returns number of primitives */ - __forceinline size_t size() const { return numPrimitives; } - - /*! sets the number of primitives */ - virtual void setNumPrimitives(unsigned int numPrimitives_in); - - /*! sets number of time steps */ - virtual void setNumTimeSteps (unsigned int numTimeSteps_in); - - /*! sets motion blur time range */ - void setTimeRange (const BBox1f range); - - /*! sets number of vertex attributes */ - virtual void setVertexAttributeCount (unsigned int N) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! sets number of topologies */ - virtual void setTopologyCount (unsigned int N) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! sets the build quality */ - void setBuildQuality(RTCBuildQuality quality_in) - { - this->quality = quality_in; - Geometry::update(); - } - - /* calculate time segment itime and fractional time ftime */ - __forceinline int timeSegment(float time, float& ftime) const { - return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime); - } - - template - __forceinline vint timeSegment(const vfloat& time, vfloat& ftime) const { - return getTimeSegment(time,vfloat(time_range.lower),vfloat(time_range.upper),vfloat(fnumTimeSegments),ftime); - } - - /* calculate overlapping time segment range */ - __forceinline range timeSegmentRange(const BBox1f& range) const { - return getTimeSegmentRange(range,time_range,fnumTimeSegments); - } - - /* returns time that corresponds to time step */ - __forceinline float timeStep(const int i) const { - assert(i>=0 && i<(int)numTimeSteps); - return time_range.lower + time_range.size()*float(i)/fnumTimeSegments; - } - - /*! for all geometries */ - public: - - /*! Enable geometry. */ - virtual void enable(); - - /*! Update geometry. */ - void update(); - - /*! commit of geometry */ - virtual void commit(); - - /*! Update geometry buffer. */ - virtual void updateBuffer(RTCBufferType type, unsigned int slot) { - update(); // update everything for geometries not supporting this call - } - - /*! Disable geometry. */ - virtual void disable(); - - /*! Verify the geometry */ - virtual bool verify() { return true; } - - /*! called before every build */ - virtual void preCommit(); - - /*! called after every build */ - virtual void postCommit(); - - virtual void addElementsToCount (GeometryCounts & counts) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - }; - - /*! sets constant tessellation rate for the geometry */ - virtual void setTessellationRate(float N) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Sets the maximal curve radius scale allowed by min-width feature. */ - virtual void setMaxRadiusScale(float s) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Set user data pointer. */ - virtual void setUserData(void* ptr); - - /*! Get user data pointer. */ - __forceinline void* getUserData() const { - return userPtr; - } - - /*! interpolates user data to the specified u/v location */ - virtual void interpolate(const RTCInterpolateArguments* const args) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! interpolates user data to the specified u/v locations */ - virtual void interpolateN(const RTCInterpolateNArguments* const args); - - /* point query api */ - bool pointQuery(PointQuery* query, PointQueryContext* context); - - /*! for subdivision surfaces only */ - public: - virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Set displacement function. */ - virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - virtual unsigned int getFirstHalfEdge(unsigned int faceID) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - virtual unsigned int getFace(unsigned int edgeID) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - virtual unsigned int getNextHalfEdge(unsigned int edgeID) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! get fast access to first vertex buffer if applicable */ - virtual float * getCompactVertexArray () const { - return nullptr; - } - - /*! Returns the modified counter - how many times the geo has been modified */ - __forceinline unsigned int getModCounter () const { - return modCounter_; - } - - /*! for triangle meshes and bezier curves only */ - public: - - - /*! Sets ray mask. */ - virtual void setMask(unsigned mask) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Sets specified buffer. */ - virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Gets specified buffer. */ - virtual void* getBuffer(RTCBufferType type, unsigned int slot) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Set intersection filter function for ray packets of size N. */ - virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN); - - /*! Set occlusion filter function for ray packets of size N. */ - virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN); - - /*! for instances only */ - public: - - /*! Sets the instanced scene */ - virtual void setInstancedScene(const Ref& scene) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Sets transformation of the instance */ - virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Sets transformation of the instance */ - virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Returns the transformation of the instance */ - virtual AffineSpace3fa getTransform(float time) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! for user geometries only */ - public: - - /*! Set bounds function. */ - virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Set intersect function for ray packets of size N. */ - virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Set occlusion function for ray packets of size N. */ - virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); - } - - /*! Set point query function. */ - void setPointQueryFunction(RTCPointQueryFunction func); - - /*! returns number of time segments */ - __forceinline unsigned numTimeSegments () const { - return numTimeSteps-1; - } - - public: - - virtual PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); - } - - virtual PrimInfo createPrimRefArrayMB(mvector& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); - } - - virtual PrimInfoMB createPrimRefMBArray(mvector& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); - } - - virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); - } - - virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); - } - - virtual Vec3fa computeDirection(unsigned int primID) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); - } - - virtual Vec3fa computeDirection(unsigned int primID, size_t time) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); - } - - virtual BBox3fa vbounds(size_t primID) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); - } - - virtual BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); - } - - virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); - } - - virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); - } - - virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); - } - - virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); - } - - public: - __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; } - __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; } - - public: - Device* device; //!< device this geometry belongs to - - void* userPtr; //!< user pointer - unsigned int numPrimitives; //!< number of primitives of this geometry - - unsigned int numTimeSteps; //!< number of time steps - float fnumTimeSegments; //!< number of time segments (precalculation) - BBox1f time_range; //!< motion blur time range - - unsigned int mask; //!< for masking out geometry - unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when geo is modified - - struct { - GType gtype : 8; //!< geometry type - GSubType gsubtype : 8; //!< geometry subtype - RTCBuildQuality quality : 3; //!< build quality for geometry - unsigned state : 2; - bool enabled : 1; //!< true if geometry is enabled - }; - - RTCFilterFunctionN intersectionFilterN; - RTCFilterFunctionN occlusionFilterN; - RTCPointQueryFunction pointQueryFunc; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/hit.h b/thirdparty/embree-aarch64/kernels/common/hit.h deleted file mode 100644 index 32a198cdfe..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/hit.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "ray.h" -#include "instance_stack.h" - -namespace embree -{ - /* Hit structure for K hits */ - template - struct HitK - { - /* Default construction does nothing */ - __forceinline HitK() {} - - /* Constructs a hit */ - __forceinline HitK(const RTCIntersectContext* context, const vuint& geomID, const vuint& primID, const vfloat& u, const vfloat& v, const Vec3vf& Ng) - : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID) - { - for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) - instID[l] = RTC_INVALID_GEOMETRY_ID; - instance_id_stack::copy(context->instID, instID); - } - - /* Returns the size of the hit */ - static __forceinline size_t size() { return K; } - - public: - Vec3vf Ng; // geometry normal - vfloat u; // barycentric u coordinate of hit - vfloat v; // barycentric v coordinate of hit - vuint primID; // primitive ID - vuint geomID; // geometry ID - vuint instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID - }; - - /* Specialization for a single hit */ - template<> - struct __aligned(16) HitK<1> - { - /* Default construction does nothing */ - __forceinline HitK() {} - - /* Constructs a hit */ - __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng) - : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID) - { - instance_id_stack::copy(context->instID, instID); - } - - /* Returns the size of the hit */ - static __forceinline size_t size() { return 1; } - - public: - Vec3 Ng; // geometry normal - float u; // barycentric u coordinate of hit - float v; // barycentric v coordinate of hit - unsigned int primID; // primitive ID - unsigned int geomID; // geometry ID - unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID - }; - - /* Shortcuts */ - typedef HitK<1> Hit; - typedef HitK<4> Hit4; - typedef HitK<8> Hit8; - typedef HitK<16> Hit16; - - /* Outputs hit to stream */ - template - __forceinline embree_ostream operator<<(embree_ostream cout, const HitK& ray) - { - cout << "{ " << embree_endl - << " Ng = " << ray.Ng << embree_endl - << " u = " << ray.u << embree_endl - << " v = " << ray.v << embree_endl - << " primID = " << ray.primID << embree_endl - << " geomID = " << ray.geomID << embree_endl - << " instID ="; - for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) - { - cout << " " << ray.instID[l]; - } - cout << embree_endl; - return cout << "}"; - } - - template - __forceinline void copyHitToRay(RayHit& ray, const Hit& hit) - { - ray.Ng = hit.Ng; - ray.u = hit.u; - ray.v = hit.v; - ray.primID = hit.primID; - ray.geomID = hit.geomID; - instance_id_stack::copy(hit.instID, ray.instID); - } - - template - __forceinline void copyHitToRay(const vbool &mask, RayHitK &ray, const HitK &hit) - { - vfloat::storeu(mask,&ray.Ng.x, hit.Ng.x); - vfloat::storeu(mask,&ray.Ng.y, hit.Ng.y); - vfloat::storeu(mask,&ray.Ng.z, hit.Ng.z); - vfloat::storeu(mask,&ray.u, hit.u); - vfloat::storeu(mask,&ray.v, hit.v); - vuint::storeu(mask,&ray.primID, hit.primID); - vuint::storeu(mask,&ray.geomID, hit.geomID); - instance_id_stack::copy(hit.instID, ray.instID, mask); - } -} diff --git a/thirdparty/embree-aarch64/kernels/common/instance_stack.h b/thirdparty/embree-aarch64/kernels/common/instance_stack.h deleted file mode 100644 index d7e3637f7b..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/instance_stack.h +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "rtcore.h" - -namespace embree { -namespace instance_id_stack { - -static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0, - "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0."); - -/******************************************************************************* - * Instance ID stack manipulation. - * This is used from the instance intersector. - ******************************************************************************/ - -/* - * Push an instance to the stack. - */ -RTC_FORCEINLINE bool push(RTCIntersectContext* context, - unsigned instanceId) -{ -#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 - const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT; - /* We assert here because instances are silently dropped when the stack is full. - This might be quite hard to find in production. */ - assert(spaceAvailable); - if (likely(spaceAvailable)) - context->instID[context->instStackSize++] = instanceId; - return spaceAvailable; -#else - const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID); - assert(spaceAvailable); - if (likely(spaceAvailable)) - context->instID[0] = instanceId; - return spaceAvailable; -#endif -} - - -/* - * Pop the last instance pushed to the stack. - * Do not call on an empty stack. - */ -RTC_FORCEINLINE void pop(RTCIntersectContext* context) -{ - assert(context); -#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 - assert(context->instStackSize > 0); - context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID; -#else - assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID); - context->instID[0] = RTC_INVALID_GEOMETRY_ID; -#endif -} - -/******************************************************************************* - * Optimized instance id stack copy. - * The copy() function at the bottom of this block will either copy full - * stacks or copy only until the last valid element has been copied, depending - * on RTC_MAX_INSTANCE_LEVEL_COUNT. - ******************************************************************************/ - -/* - * Plain array assignment. This works for scalar->scalar, - * scalar->vector, and vector->vector. - */ -template -RTC_FORCEINLINE void level_copy(unsigned level, Src* src, Tgt* tgt) -{ - tgt[level] = src[level]; -} - -/* - * Masked SIMD vector->vector store. - */ -template -RTC_FORCEINLINE void level_copy(unsigned level, const vuint* src, vuint* tgt, const vbool& mask) -{ - vuint::storeu(mask, tgt + level, src[level]); -} - -/* - * Masked scalar->SIMD vector store. - */ -template -RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint* tgt, const vbool& mask) -{ - vuint::store(mask, tgt + level, src[level]); -} - -/* - * Indexed assign from vector to scalar. - */ -template -RTC_FORCEINLINE void level_copy(unsigned level, const vuint* src, unsigned* tgt, const size_t& idx) -{ - tgt[level] = src[level][idx]; -} - -/* - * Indexed assign from scalar to vector. - */ -template -RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint* tgt, const size_t& idx) -{ - tgt[level][idx] = src[level]; -} - -/* - * Indexed assign from vector to vector. - */ -template -RTC_FORCEINLINE void level_copy(unsigned level, const vuint* src, vuint* tgt, const size_t& i, const size_t& j) -{ - tgt[level][j] = src[level][i]; -} - -/* - * Check if the given stack level is valid. - * These are only used for large max stack sizes. - */ -RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack) -{ - return stack[level] != RTC_INVALID_GEOMETRY_ID; -} -RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const size_t& /*i*/) -{ - return stack[level] != RTC_INVALID_GEOMETRY_ID; -} -template -RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const vbool& /*mask*/) -{ - return stack[level] != RTC_INVALID_GEOMETRY_ID; -} - -template -RTC_FORCEINLINE bool level_valid(unsigned level, const vuint* stack) -{ - return any(stack[level] != RTC_INVALID_GEOMETRY_ID); -} -template -RTC_FORCEINLINE bool level_valid(unsigned level, const vuint* stack, const vbool& mask) -{ - return any(mask & (stack[level] != RTC_INVALID_GEOMETRY_ID)); -} - -template -RTC_FORCEINLINE bool level_valid(unsigned level, const vuint* stack, const size_t& i) -{ - return stack[level][i] != RTC_INVALID_GEOMETRY_ID; -} -template -RTC_FORCEINLINE bool level_valid(unsigned level, const vuint* stack, const size_t& i, const size_t& /*j*/) -{ - return stack[level][i] != RTC_INVALID_GEOMETRY_ID; -} - -/* - * Copy an instance ID stack. - * - * This function automatically selects a LevelFunctor from the above Assign - * structs. - */ -template -RTC_FORCEINLINE void copy(Src src, Tgt tgt, Args&&... args) -{ -#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) - /* - * Avoid all loops for only one level. - */ - level_copy(0, src, tgt, std::forward(args)...); - -#elif (RTC_MAX_INSTANCE_LEVEL_COUNT <= 4) - /* - * It is faster to avoid the valid test for low level counts. - * Just copy the whole stack. - */ - for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) - level_copy(l, src, tgt, std::forward(args)...); - -#else - /* - * For general stack sizes, it pays off to test for validity. - */ - bool valid = true; - for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT && valid; ++l) - { - level_copy(l, src, tgt, std::forward(args)...); - valid = level_valid(l, src, std::forward(args)...); - } -#endif -} - -} // namespace instance_id_stack -} // namespace embree - diff --git a/thirdparty/embree-aarch64/kernels/common/isa.h b/thirdparty/embree-aarch64/kernels/common/isa.h deleted file mode 100644 index 63fb8d3351..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/isa.h +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../../common/sys/platform.h" -#include "../../common/sys/sysinfo.h" - -namespace embree -{ -#define DEFINE_SYMBOL2(type,name) \ - typedef type (*name##Func)(); \ - name##Func name; - -#define DECLARE_SYMBOL2(type,name) \ - namespace sse2 { extern type name(); } \ - namespace sse42 { extern type name(); } \ - namespace avx { extern type name(); } \ - namespace avx2 { extern type name(); } \ - namespace avx512knl { extern type name(); } \ - namespace avx512skx { extern type name(); } \ - void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal error in ISA selection for " TOSTRING(name)); } \ - type name##_error() { return type(name##_error2); } \ - type name##_zero() { return type(nullptr); } - -#define DECLARE_ISA_FUNCTION(type,symbol,args) \ - namespace sse2 { extern type symbol(args); } \ - namespace sse42 { extern type symbol(args); } \ - namespace avx { extern type symbol(args); } \ - namespace avx2 { extern type symbol(args); } \ - namespace avx512knl { extern type symbol(args); } \ - namespace avx512skx { extern type symbol(args); } \ - inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \ - typedef type (*symbol##Ty)(args); \ - -#define DEFINE_ISA_FUNCTION(type,symbol,args) \ - typedef type (*symbol##Func)(args); \ - symbol##Func symbol; - -#define ZERO_SYMBOL(features,intersector) \ - intersector = intersector##_zero; - -#define INIT_SYMBOL(features,intersector) \ - intersector = decltype(intersector)(intersector##_error); - -#define SELECT_SYMBOL_DEFAULT(features,intersector) \ - intersector = isa::intersector; - -#if defined(__SSE__) || defined(__ARM_NEON) -#if !defined(EMBREE_TARGET_SIMD4) -#define EMBREE_TARGET_SIMD4 -#endif -#endif - -#if defined(EMBREE_TARGET_SSE42) -#define SELECT_SYMBOL_SSE42(features,intersector) \ - if ((features & SSE42) == SSE42) intersector = sse42::intersector; -#else -#define SELECT_SYMBOL_SSE42(features,intersector) -#endif - -#if defined(EMBREE_TARGET_AVX) || defined(__AVX__) -#if !defined(EMBREE_TARGET_SIMD8) -#define EMBREE_TARGET_SIMD8 -#endif -#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target -#define SELECT_SYMBOL_AVX(features,intersector) \ - if ((features & ISA) == ISA) intersector = isa::intersector; -#else -#define SELECT_SYMBOL_AVX(features,intersector) \ - if ((features & AVX) == AVX) intersector = avx::intersector; -#endif -#else -#define SELECT_SYMBOL_AVX(features,intersector) -#endif - -#if defined(EMBREE_TARGET_AVX2) -#if !defined(EMBREE_TARGET_SIMD8) -#define EMBREE_TARGET_SIMD8 -#endif -#define SELECT_SYMBOL_AVX2(features,intersector) \ - if ((features & AVX2) == AVX2) intersector = avx2::intersector; -#else -#define SELECT_SYMBOL_AVX2(features,intersector) -#endif - -#if defined(EMBREE_TARGET_AVX512KNL) -#if !defined(EMBREE_TARGET_SIMD16) -#define EMBREE_TARGET_SIMD16 -#endif -#define SELECT_SYMBOL_AVX512KNL(features,intersector) \ - if ((features & AVX512KNL) == AVX512KNL) intersector = avx512knl::intersector; -#else -#define SELECT_SYMBOL_AVX512KNL(features,intersector) -#endif - -#if defined(EMBREE_TARGET_AVX512SKX) -#if !defined(EMBREE_TARGET_SIMD16) -#define EMBREE_TARGET_SIMD16 -#endif -#define SELECT_SYMBOL_AVX512SKX(features,intersector) \ - if ((features & AVX512SKX) == AVX512SKX) intersector = avx512skx::intersector; -#else -#define SELECT_SYMBOL_AVX512SKX(features,intersector) -#endif - -#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_SSE42(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_SSE42(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_SSE42(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_SSE42(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_SSE42(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_SSE42(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL_AVX512SKX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_AVX_AVX512SKX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_INIT_AVX(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); - -#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); - -#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_SSE42(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); - -#define SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); - -#define SELECT_SYMBOL_INIT_AVX_AVX512KNL_AVX512SKX(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); - -#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_SSE42(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ - ZERO_SYMBOL(features,intersector); \ - SELECT_SYMBOL_SSE42(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ - SELECT_SYMBOL_DEFAULT(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,intersector) \ - INIT_SYMBOL(features,intersector); \ - SELECT_SYMBOL_AVX512KNL(features,intersector); \ - SELECT_SYMBOL_AVX512SKX(features,intersector); - -#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \ - SELECT_SYMBOL_SSE42(features,intersector); \ - SELECT_SYMBOL_AVX(features,intersector); \ - SELECT_SYMBOL_AVX2(features,intersector); - - struct VerifyMultiTargetLinking { - static __noinline int getISA(int depth = 5) { - if (depth == 0) return ISA; - else return getISA(depth-1); - } - }; - namespace sse2 { int getISA(); }; - namespace sse42 { int getISA(); }; - namespace avx { int getISA(); }; - namespace avx2 { int getISA(); }; - namespace avx512knl { int getISA(); }; - namespace avx512skx { int getISA(); }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/motion_derivative.h b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h deleted file mode 100644 index 82953f0e89..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/motion_derivative.h +++ /dev/null @@ -1,325 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../../common/math/affinespace.h" -#include "../../common/math/interval.h" - -#include - -namespace embree { - -#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f - -static void motion_derivative_coefficients(const float *p, float *coeff); - -struct MotionDerivativeCoefficients -{ - float theta; - float coeffs[3*8*7]; - - MotionDerivativeCoefficients() {} - - // xfm0 and xfm1 are interpret as quaternion decomposition - MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, AffineSpace3ff const& xfm1) - { - // cosTheta of the two quaternions - const float cosTheta = min(1.f, max(-1.f, - xfm0.l.vx.w * xfm1.l.vx.w - + xfm0.l.vy.w * xfm1.l.vy.w - + xfm0.l.vz.w * xfm1.l.vz.w - + xfm0.p.w * xfm1.p.w)); - - theta = std::acos(cosTheta); - Vec4f qperp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w); - if (cosTheta < 0.995f) { - // compute perpendicular quaternion - qperp.x = xfm1.p.w - cosTheta * xfm0.p.w; - qperp.y = xfm1.l.vx.w - cosTheta * xfm0.l.vx.w; - qperp.z = xfm1.l.vy.w - cosTheta * xfm0.l.vy.w; - qperp.w = xfm1.l.vz.w - cosTheta * xfm0.l.vz.w; - qperp = normalize(qperp); - } - const float p[33] = { - theta, - xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0 - xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1 - xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0 - qperp.x, qperp.y, qperp.z, qperp.w, - xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0 - xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y, - xfm0.l.vz.z, xfm0.p.z, - xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1 - xfm1.l.vy.y, xfm1.l.vz.y, xfm1.p.y, - xfm1.l.vz.z, xfm1.p.z - }; - motion_derivative_coefficients(p, coeffs); - } -}; - -struct MotionDerivative -{ - float twoTheta; - float c[8]; - - MotionDerivative(MotionDerivativeCoefficients const& mdc, - int dim, Vec3fa const& p0, Vec3fa const& p1) - : twoTheta(2.f*mdc.theta) - { - const float p[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z }; - for (int i = 0; i < 8; ++i) { - c[i] = 0; - for (int j = 0; j < 7; ++j) { - c[i] += mdc.coeffs[8*7*dim + i*7 + j] * p[j]; - } - } - } - - template - struct EvalMotionDerivative - { - MotionDerivative const& md; - float offset; - - EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {} - - T operator()(T const& time) const { - return md.c[0] + md.c[1] * time - + (md.c[2] + md.c[3] * time + md.c[4] * time * time) * cos(md.twoTheta * time) - + (md.c[5] + md.c[6] * time + md.c[7] * time * time) * sin(md.twoTheta * time) - + offset; - } - }; - - unsigned int findRoots( - Interval1f const& interval, - float offset, - float* roots, - unsigned int maxNumRoots) - { - unsigned int numRoots = 0; - EvalMotionDerivative eval(*this, offset); - findRoots(eval, interval, numRoots, roots, maxNumRoots); - return numRoots; - } - - template - static void findRoots( - - Eval const& eval, - Interval1f const& interval, - unsigned int& numRoots, - float* roots, - unsigned int maxNumRoots) - { - Interval1f range = eval(interval); - if (range.lower > 0 || range.upper < 0 || range.lower >= range.upper) return; - - const float split = 0.5f * (interval.upper + interval.lower); - if (interval.upper-interval.lower < 1e-7f || abs(split-interval.lower) < 1e-7f || abs(split-interval.upper) < 1e-7f) - { - // check if the root already exists - for (unsigned int k = 0; k < numRoots && k < maxNumRoots; ++k) { - if (abs(roots[k]-split) < MOTION_DERIVATIVE_ROOT_EPSILON) - return; - } - if (numRoots < maxNumRoots) { - roots[numRoots++] = split; - } - if (numRoots > maxNumRoots) { - printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS - return; - } - return; - } - - findRoots(eval, Interval1f(interval.lower, split), numRoots, roots, maxNumRoots); - findRoots(eval, Interval1f(split, interval.upper), numRoots, roots, maxNumRoots); - } -}; - -/****************************************************************************** - * Code generated with sympy 1.4 * - * See http://www.sympy.org/ for more information. * - * * - * see * - * * - * scripts/generate_motion_derivative_coefficients.py * - * * - * for how this code is generated * - * * - ******************************************************************************/ -static void motion_derivative_coefficients(const float *p, float *coeff) -{ - coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27]; - coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24]; - coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25]; - coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26]; - coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15]; - coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16]; - coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17]; - coeff[7] = 0; - coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24]; - coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25]; - coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26]; - coeff[11] = 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24]; - coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25]; - coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26]; - coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27]; - coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24]; - coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25]; - coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26]; - coeff[18] = (-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15]; - coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16]; - coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17]; - coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0]; - coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24]; - coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25]; - coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26]; - coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24]; - coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25]; - coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26]; - coeff[28] = 0; - coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0]; - coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0]; - coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0]; - coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0]; - coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0]; - coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0]; - coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27]; - coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24]; - coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25]; - coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26]; - coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15]; - coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16]; - coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17]; - coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0]; - coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24]; - coeff[44] = -4*p[0]*p[7]*p[10]*p[19] + 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25]; - coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26]; - coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24]; - coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25]; - coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26]; - coeff[49] = 0; - coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0]; - coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0]; - coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0]; - coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0]; - coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0]; - coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0]; - coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30]; - coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24]; - coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28]; - coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29]; - coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15]; - coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19]; - coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20]; - coeff[63] = 0; - coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; - coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28]; - coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29]; - coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; - coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28]; - coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29]; - coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30]; - coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24]; - coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28]; - coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29]; - coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15]; - coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19]; - coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20]; - coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0]; - coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; - coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28]; - coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29]; - coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; - coeff[82] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28]; - coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29]; - coeff[84] = 0; - coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0]; - coeff[86] = 2*(p[7]*p[11]*p[19] - p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0]; - coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0]; - coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0]; - coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0]; - coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0]; - coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30]; - coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24]; - coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28]; - coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29]; - coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15]; - coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19]; - coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20]; - coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0]; - coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24]; - coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28]; - coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29]; - coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24]; - coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 2*p[10]*p[14]*p[28]; - coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29]; - coeff[105] = 0; - coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0]; - coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0]; - coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0]; - coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0]; - coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0]; - coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0]; - coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32]; - coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24]; - coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28]; - coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31]; - coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15]; - coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19]; - coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22]; - coeff[119] = 0; - coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; - coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; - coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] + 2*p[22] - 2*p[31]; - coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; - coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; - coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31]; - coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30]; - coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24]; - coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28]; - coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29]; - coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15]; - coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19]; - coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + p[13]*p[13]*p[22] - p[13]*p[14]*p[20]; - coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0]; - coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; - coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; - coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29]; - coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; - coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; - coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29]; - coeff[140] = 0; - coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0]; - coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0]; - coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0]; - coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0]; - coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0]; - coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0]; - coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32]; - coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24]; - coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28]; - coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31]; - coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15]; - coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19]; - coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22]; - coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0]; - coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24]; - coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28]; - coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31]; - coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24]; - coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28]; - coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 2*p[10]*p[14]*p[31]; - coeff[161] = 0; - coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0]; - coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0]; - coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0]; - coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0]; - coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0]; - coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + p[13]*p[14]*p[29])*p[0]; -} - -} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/common/point_query.h b/thirdparty/embree-aarch64/kernels/common/point_query.h deleted file mode 100644 index 27d158ca3a..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/point_query.h +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" - -namespace embree -{ - /* Point query structure for closest point query */ - template - struct RTC_ALIGN(16) PointQueryK - { - /* Default construction does nothing */ - __forceinline PointQueryK() {} - - /* Constructs a ray from origin, direction, and ray segment. Near - * has to be smaller than far */ - __forceinline PointQueryK(const Vec3vf& p, const vfloat& radius = inf, const vfloat& time = zero) - : p(p), time(time), radius(radius) {} - - /* Returns the size of the ray */ - static __forceinline size_t size() { return K; } - - /* Calculates if this is a valid ray that does not cause issues during traversal */ - __forceinline vbool valid() const - { - const vbool vx = (abs(p.x) <= vfloat(FLT_LARGE)); - const vbool vy = (abs(p.y) <= vfloat(FLT_LARGE)); - const vbool vz = (abs(p.z) <= vfloat(FLT_LARGE)); - const vbool vn = radius >= vfloat(0); - const vbool vf = abs(time) < vfloat(inf); - return vx & vy & vz & vn & vf; - } - - __forceinline void get(PointQueryK<1>* ray) const; - __forceinline void get(size_t i, PointQueryK<1>& ray) const; - __forceinline void set(const PointQueryK<1>* ray); - __forceinline void set(size_t i, const PointQueryK<1>& ray); - - Vec3vf p; // location of the query point - vfloat time; // time for motion blur - vfloat radius; // radius for the point query - }; - - /* Specialization for a single point query */ - template<> - struct RTC_ALIGN(16) PointQueryK<1> - { - /* Default construction does nothing */ - __forceinline PointQueryK() {} - - /* Constructs a ray from origin, direction, and ray segment. Near - * has to be smaller than far */ - __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero) - : p(p), time(time), radius(radius) {} - - /* Calculates if this is a valid ray that does not cause issues during traversal */ - __forceinline bool valid() const { - return all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius))) && abs(time) < float(inf); - } - - Vec3f p; - float time; - float radius; - }; - - /* Converts point query packet to single point query */ - template - __forceinline void PointQueryK::get(PointQueryK<1>* query) const - { - for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose - { - query[i].p.x = p.x[i]; - query[i].p.y = p.y[i]; - query[i].p.z = p.z[i]; - query[i].time = time[i]; - query[i].radius = radius[i]; - } - } - - /* Extracts a single point query out of a point query packet*/ - template - __forceinline void PointQueryK::get(size_t i, PointQueryK<1>& query) const - { - query.p.x = p.x[i]; - query.p.y = p.y[i]; - query.p.z = p.z[i]; - query.radius = radius[i]; - query.time = time[i]; - } - - /* Converts single point query to point query packet */ - template - __forceinline void PointQueryK::set(const PointQueryK<1>* query) - { - for (size_t i = 0; i < K; i++) - { - p.x[i] = query[i].p.x; - p.y[i] = query[i].p.y; - p.z[i] = query[i].p.z; - radius[i] = query[i].radius; - time[i] = query[i].time; - } - } - - /* inserts a single point query into a point query packet element */ - template - __forceinline void PointQueryK::set(size_t i, const PointQueryK<1>& query) - { - p.x[i] = query.p.x; - p.y[i] = query.p.y; - p.z[i] = query.p.z; - radius[i] = query.radius; - time[i] = query.time; - } - - /* Shortcuts */ - typedef PointQueryK<1> PointQuery; - typedef PointQueryK<4> PointQuery4; - typedef PointQueryK<8> PointQuery8; - typedef PointQueryK<16> PointQuery16; - struct PointQueryN; - - /* Outputs point query to stream */ - template - __forceinline embree_ostream operator <<(embree_ostream cout, const PointQueryK& query) - { - cout << "{ " << embree_endl - << " p = " << query.p << embree_endl - << " r = " << query.radius << embree_endl - << " time = " << query.time << embree_endl - << "}"; - return cout; - } -} diff --git a/thirdparty/embree-aarch64/kernels/common/primref.h b/thirdparty/embree-aarch64/kernels/common/primref.h deleted file mode 100644 index ce75c982bb..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/primref.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" - -namespace embree -{ - /*! A primitive reference stores the bounds of the primitive and its ID. */ - struct __aligned(32) PrimRef - { - __forceinline PrimRef () {} - -#if defined(__AVX__) - __forceinline PrimRef(const PrimRef& v) { - vfloat8::store((float*)this,vfloat8::load((float*)&v)); - } - __forceinline PrimRef& operator=(const PrimRef& v) { - vfloat8::store((float*)this,vfloat8::load((float*)&v)); return *this; - } -#endif - - __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID) - { - lower = Vec3fx(bounds.lower, geomID); - upper = Vec3fx(bounds.upper, primID); - } - - __forceinline PrimRef (const BBox3fa& bounds, size_t id) - { -#if defined(__X86_64__) || defined(__aarch64__) - lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF)); - upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF)); -#else - lower = Vec3fx(bounds.lower, (unsigned)id); - upper = Vec3fx(bounds.upper, (unsigned)0); -#endif - } - - /*! calculates twice the center of the primitive */ - __forceinline const Vec3fa center2() const { - return lower+upper; - } - - /*! return the bounding box of the primitive */ - __forceinline const BBox3fa bounds() const { - return BBox3fa(lower,upper); - } - - /*! size for bin heuristic is 1 */ - __forceinline unsigned size() const { - return 1; - } - - /*! returns bounds and centroid used for binning */ - __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const - { - bounds_o = bounds(); - center_o = embree::center2(bounds_o); - } - - __forceinline unsigned& geomIDref() { // FIXME: remove !!!!!!! - return lower.u; - } - __forceinline unsigned& primIDref() { // FIXME: remove !!!!!!! - return upper.u; - } - - /*! returns the geometry ID */ - __forceinline unsigned geomID() const { - return lower.a; - } - - /*! returns the primitive ID */ - __forceinline unsigned primID() const { - return upper.a; - } - - /*! returns an size_t sized ID */ - __forceinline size_t ID() const { -#if defined(__X86_64__) || defined(__aarch64__) - return size_t(lower.u) + (size_t(upper.u) << 32); -#else - return size_t(lower.u); -#endif - } - - /*! special function for operator< */ - __forceinline uint64_t ID64() const { - return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); - } - - /*! allows sorting the primrefs by ID */ - friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) { - return p0.ID64() < p1.ID64(); - } - - /*! Outputs primitive reference to a stream. */ - friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) { - return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }"; - } - - public: - Vec3fx lower; //!< lower bounds and geomID - Vec3fx upper; //!< upper bounds and primID - }; - - /*! fast exchange for PrimRefs */ - __forceinline void xchg(PrimRef& a, PrimRef& b) - { -#if defined(__AVX__) - const vfloat8 aa = vfloat8::load((float*)&a); - const vfloat8 bb = vfloat8::load((float*)&b); - vfloat8::store((float*)&a,bb); - vfloat8::store((float*)&b,aa); -#else - std::swap(a,b); -#endif - } - - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - /************************************************************************************/ - - struct SubGridBuildData { - unsigned short sx,sy; - unsigned int primID; - - __forceinline SubGridBuildData() {}; - __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID) : sx(sx), sy(sy), primID(primID) {}; - - __forceinline size_t x() const { return (size_t)sx & 0x7fff; } - __forceinline size_t y() const { return (size_t)sy & 0x7fff; } - - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/primref_mb.h b/thirdparty/embree-aarch64/kernels/common/primref_mb.h deleted file mode 100644 index b6c1ad5712..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/primref_mb.h +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" - -#define MBLUR_BIN_LBBOX 1 - -namespace embree -{ -#if MBLUR_BIN_LBBOX - - /*! A primitive reference stores the bounds of the primitive and its ID. */ - struct PrimRefMB - { - typedef LBBox3fa BBox; - - __forceinline PrimRefMB () {} - - __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) - : lbounds((LBBox3fx)lbounds_i), time_range(time_range) - { - assert(activeTimeSegments > 0); - lbounds.bounds0.lower.a = geomID; - lbounds.bounds0.upper.a = primID; - lbounds.bounds1.lower.a = activeTimeSegments; - lbounds.bounds1.upper.a = totalTimeSegments; - } - - __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) - : lbounds((LBBox3fx)lbounds_i), time_range(time_range) - { - assert(activeTimeSegments > 0); -#if defined(__X86_64__) || defined(__aarch64__) - lbounds.bounds0.lower.a = id & 0xFFFFFFFF; - lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF; -#else - lbounds.bounds0.lower.a = id; - lbounds.bounds0.upper.a = 0; -#endif - lbounds.bounds1.lower.a = activeTimeSegments; - lbounds.bounds1.upper.a = totalTimeSegments; - } - - __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) - : lbounds((LBBox3fx)lbounds_i), time_range(time_range) - { - assert(activeTimeSegments > 0); -#if defined(__X86_64__) || defined(__aarch64__) - lbounds.bounds0.lower.u = id & 0xFFFFFFFF; - lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF; -#else - lbounds.bounds0.lower.u = id; - lbounds.bounds0.upper.u = 0; -#endif - lbounds.bounds1.lower.a = activeTimeSegments; - lbounds.bounds1.upper.a = totalTimeSegments; - } - - /*! returns bounds for binning */ - __forceinline LBBox3fa bounds() const { - return lbounds; - } - - /*! returns the number of time segments of this primref */ - __forceinline unsigned size() const { - return lbounds.bounds1.lower.a; - } - - __forceinline unsigned totalTimeSegments() const { - return lbounds.bounds1.upper.a; - } - - /* calculate overlapping time segment range */ - __forceinline range timeSegmentRange(const BBox1f& range) const { - return getTimeSegmentRange(range,time_range,float(totalTimeSegments())); - } - - /* returns time that corresponds to time step */ - __forceinline float timeStep(const int i) const { - assert(i>=0 && i<=(int)totalTimeSegments()); - return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments()); - } - - /*! checks if time range overlaps */ - __forceinline bool time_range_overlap(const BBox1f& range) const - { - if (0.9999f*time_range.upper <= range.lower) return false; - if (1.0001f*time_range.lower >= range.upper) return false; - return true; - } - - /*! returns center for binning */ - __forceinline Vec3fa binCenter() const { - return center2(lbounds.interpolate(0.5f)); - } - - /*! returns bounds and centroid used for binning */ - __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const - { - bounds_o = bounds(); - center_o = binCenter(); - } - - /*! returns the geometry ID */ - __forceinline unsigned geomID() const { - return lbounds.bounds0.lower.a; - } - - /*! returns the primitive ID */ - __forceinline unsigned primID() const { - return lbounds.bounds0.upper.a; - } - - /*! returns an size_t sized ID */ - __forceinline size_t ID() const { -#if defined(__X86_64__) || defined(__aarch64__) - return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32); -#else - return size_t(lbounds.bounds0.lower.u); -#endif - } - - /*! special function for operator< */ - __forceinline uint64_t ID64() const { - return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); - } - - /*! allows sorting the primrefs by ID */ - friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { - return p0.ID64() < p1.ID64(); - } - - /*! Outputs primitive reference to a stream. */ - friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { - return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; - } - - public: - LBBox3fx lbounds; - BBox1f time_range; // entire geometry time range - }; - -#else - - /*! A primitive reference stores the bounds of the primitive and its ID. */ - struct __aligned(16) PrimRefMB - { - typedef BBox3fa BBox; - - __forceinline PrimRefMB () {} - - __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) - : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) - { - assert(activeTimeSegments > 0); - bbox.lower.a = geomID; - bbox.upper.a = primID; - } - - __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) - : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) - { - assert(activeTimeSegments > 0); -#if defined(__X86_64__) || defined(__aarch64__) - bbox.lower.u = id & 0xFFFFFFFF; - bbox.upper.u = (id >> 32) & 0xFFFFFFFF; -#else - bbox.lower.u = id; - bbox.upper.u = 0; -#endif - } - - /*! returns bounds for binning */ - __forceinline BBox3fa bounds() const { - return bbox; - } - - /*! returns the number of time segments of this primref */ - __forceinline unsigned int size() const { - return _activeTimeSegments; - } - - __forceinline unsigned int totalTimeSegments() const { - return _totalTimeSegments; - } - - /* calculate overlapping time segment range */ - __forceinline range timeSegmentRange(const BBox1f& range) const { - return getTimeSegmentRange(range,time_range,float(_totalTimeSegments)); - } - - /* returns time that corresponds to time step */ - __forceinline float timeStep(const int i) const { - assert(i>=0 && i<=(int)_totalTimeSegments); - return time_range.lower + time_range.size()*float(i)/float(_totalTimeSegments); - } - - /*! checks if time range overlaps */ - __forceinline bool time_range_overlap(const BBox1f& range) const - { - if (0.9999f*time_range.upper <= range.lower) return false; - if (1.0001f*time_range.lower >= range.upper) return false; - return true; - } - - /*! returns center for binning */ - __forceinline Vec3fa binCenter() const { - return center2(bounds()); - } - - /*! returns bounds and centroid used for binning */ - __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const - { - bounds_o = bounds(); - center_o = center2(bounds()); - } - - /*! returns the geometry ID */ - __forceinline unsigned int geomID() const { - return bbox.lower.a; - } - - /*! returns the primitive ID */ - __forceinline unsigned int primID() const { - return bbox.upper.a; - } - - /*! returns an size_t sized ID */ - __forceinline size_t ID() const { -#if defined(__X86_64__) || defined(__aarch64__) - return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32); -#else - return size_t(bbox.lower.u); -#endif - } - - /*! special function for operator< */ - __forceinline uint64_t ID64() const { - return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); - } - - /*! allows sorting the primrefs by ID */ - friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { - return p0.ID64() < p1.ID64(); - } - - /*! Outputs primitive reference to a stream. */ - friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { - return cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; - } - - public: - BBox3fa bbox; // bounds, geomID, primID - unsigned int _activeTimeSegments; - unsigned int _totalTimeSegments; - BBox1f time_range; // entire geometry time range - }; - -#endif -} diff --git a/thirdparty/embree-aarch64/kernels/common/profile.h b/thirdparty/embree-aarch64/kernels/common/profile.h deleted file mode 100644 index a7de36414d..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/profile.h +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" - -namespace embree -{ - /*! helper structure for the implementation of the profile functions below */ - struct ProfileTimer - { - static const size_t N = 20; - - ProfileTimer () {} - - ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0) - { - for (size_t i=0; i=numSkip) { - dt_min[j] = min(dt_min[j],dt); - dt_avg[j] = dt_avg[j] + dt; - dt_max[j] = max(dt_max[j],dt); - } - j++; - maxJ = max(maxJ,j); - } - - __forceinline void relative (const char* name) - { - const double t1 = getSeconds(); - const double dt = t1-tj; - tj = t1; - assert(names[j] == nullptr || names[j] == name); - names[j] = name; - if (i == 0) dt_fst[j] = dt; - if (i>=numSkip) { - dt_min[j] = min(dt_min[j],dt); - dt_avg[j] = dt_avg[j] + dt; - dt_max[j] = max(dt_max[j],dt); - } - j++; - maxJ = max(maxJ,j); - } - - void print(size_t numElements) - { - for (size_t k=0; k - void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) - { - ProfileTimer timer(numSkip); - - for (size_t i=0; i - void profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) - { - timer = ProfileTimer(numSkip); - - for (size_t i=0; i - struct RayK - { - /* Default construction does nothing */ - __forceinline RayK() {} - - /* Constructs a ray from origin, direction, and ray segment. Near - * has to be smaller than far */ - __forceinline RayK(const Vec3vf& org, const Vec3vf& dir, - const vfloat& tnear = zero, const vfloat& tfar = inf, - const vfloat& time = zero, const vint& mask = -1, const vint& id = 0, const vint& flags = 0) - : org(org), dir(dir), _tnear(tnear), tfar(tfar), _time(time), mask(mask), id(id), flags(flags) {} - - /* Returns the size of the ray */ - static __forceinline size_t size() { return K; } - - /* Calculates if this is a valid ray that does not cause issues during traversal */ - __forceinline vbool valid() const - { - const vbool vx = (abs(org.x) <= vfloat(FLT_LARGE)) & (abs(dir.x) <= vfloat(FLT_LARGE)); - const vbool vy = (abs(org.y) <= vfloat(FLT_LARGE)) & (abs(dir.y) <= vfloat(FLT_LARGE)); - const vbool vz = (abs(org.z) <= vfloat(FLT_LARGE)) & (abs(dir.z) <= vfloat(FLT_LARGE)); - const vbool vn = abs(tnear()) <= vfloat(inf); - const vbool vf = abs(tfar) <= vfloat(inf); - return vx & vy & vz & vn & vf; - } - - __forceinline void get(RayK<1>* ray) const; - __forceinline void get(size_t i, RayK<1>& ray) const; - __forceinline void set(const RayK<1>* ray); - __forceinline void set(size_t i, const RayK<1>& ray); - - __forceinline void copy(size_t dest, size_t source); - - __forceinline vint octant() const - { - return select(dir.x < 0.0f, vint(1), vint(zero)) | - select(dir.y < 0.0f, vint(2), vint(zero)) | - select(dir.z < 0.0f, vint(4), vint(zero)); - } - - /* Ray data */ - Vec3vf org; // ray origin - vfloat _tnear; // start of ray segment - Vec3vf dir; // ray direction - vfloat _time; // time of this ray for motion blur - vfloat tfar; // end of ray segment - vint mask; // used to mask out objects during traversal - vint id; - vint flags; - - __forceinline vfloat& tnear() { return _tnear; } - __forceinline vfloat& time() { return _time; } - __forceinline const vfloat& tnear() const { return _tnear; } - __forceinline const vfloat& time() const { return _time; } - }; - - /* Ray+hit structure for K rays */ - template - struct RayHitK : RayK - { - using RayK::org; - using RayK::_tnear; - using RayK::dir; - using RayK::_time; - using RayK::tfar; - using RayK::mask; - using RayK::id; - using RayK::flags; - - using RayK::tnear; - using RayK::time; - - /* Default construction does nothing */ - __forceinline RayHitK() {} - - /* Constructs a ray from origin, direction, and ray segment. Near - * has to be smaller than far */ - __forceinline RayHitK(const Vec3vf& org, const Vec3vf& dir, - const vfloat& tnear = zero, const vfloat& tfar = inf, - const vfloat& time = zero, const vint& mask = -1, const vint& id = 0, const vint& flags = 0) - : RayK(org, dir, tnear, tfar, time, mask, id, flags), - geomID(RTC_INVALID_GEOMETRY_ID) - { - for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) - instID[l] = RTC_INVALID_GEOMETRY_ID; - } - - __forceinline RayHitK(const RayK& ray) - : RayK(ray), - geomID(RTC_INVALID_GEOMETRY_ID) - { - for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) - instID[l] = RTC_INVALID_GEOMETRY_ID; - } - - __forceinline RayHitK& operator =(const RayK& ray) - { - org = ray.org; - _tnear = ray._tnear; - dir = ray.dir; - _time = ray._time; - tfar = ray.tfar; - mask = ray.mask; - id = ray.id; - flags = ray.flags; - - geomID = RTC_INVALID_GEOMETRY_ID; - for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) - instID[l] = RTC_INVALID_GEOMETRY_ID; - - return *this; - } - - /* Calculates if the hit is valid */ - __forceinline void verifyHit(const vbool& valid0) const - { - vbool valid = valid0 & geomID != vuint(RTC_INVALID_GEOMETRY_ID); - const vbool vt = (abs(tfar) <= vfloat(FLT_LARGE)) | (tfar == vfloat(neg_inf)); - const vbool vu = (abs(u) <= vfloat(FLT_LARGE)); - const vbool vv = (abs(u) <= vfloat(FLT_LARGE)); - const vbool vnx = abs(Ng.x) <= vfloat(FLT_LARGE); - const vbool vny = abs(Ng.y) <= vfloat(FLT_LARGE); - const vbool vnz = abs(Ng.z) <= vfloat(FLT_LARGE); - if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t"); - if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u"); - if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v"); - if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x"); - if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y"); - if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z"); - } - - __forceinline void get(RayHitK<1>* ray) const; - __forceinline void get(size_t i, RayHitK<1>& ray) const; - __forceinline void set(const RayHitK<1>* ray); - __forceinline void set(size_t i, const RayHitK<1>& ray); - - __forceinline void copy(size_t dest, size_t source); - - /* Hit data */ - Vec3vf Ng; // geometry normal - vfloat u; // barycentric u coordinate of hit - vfloat v; // barycentric v coordinate of hit - vuint primID; // primitive ID - vuint geomID; // geometry ID - vuint instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID - }; - - /* Specialization for a single ray */ - template<> - struct RayK<1> - { - /* Default construction does nothing */ - __forceinline RayK() {} - - /* Constructs a ray from origin, direction, and ray segment. Near - * has to be smaller than far */ - __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) - : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {} - - /* Calculates if this is a valid ray that does not cause issues during traversal */ - __forceinline bool valid() const { - return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf); - } - - /* Ray data */ - Vec3ff org; // 3 floats for ray origin, 1 float for tnear - //float tnear; // start of ray segment - Vec3ff dir; // 3 floats for ray direction, 1 float for time - // float time; - float tfar; // end of ray segment - int mask; // used to mask out objects during traversal - int id; // ray ID - int flags; // ray flags - - __forceinline float& tnear() { return org.w; }; - __forceinline const float& tnear() const { return org.w; }; - - __forceinline float& time() { return dir.w; }; - __forceinline const float& time() const { return dir.w; }; - - }; - - template<> - struct RayHitK<1> : RayK<1> - { - /* Default construction does nothing */ - __forceinline RayHitK() {} - - /* Constructs a ray from origin, direction, and ray segment. Near - * has to be smaller than far */ - __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) - : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags), - geomID(RTC_INVALID_GEOMETRY_ID) {} - - __forceinline RayHitK(const RayK<1>& ray) - : RayK<1>(ray), - geomID(RTC_INVALID_GEOMETRY_ID) {} - - __forceinline RayHitK<1>& operator =(const RayK<1>& ray) - { - org = ray.org; - dir = ray.dir; - tfar = ray.tfar; - mask = ray.mask; - id = ray.id; - flags = ray.flags; - - geomID = RTC_INVALID_GEOMETRY_ID; - - return *this; - } - - /* Calculates if the hit is valid */ - __forceinline void verifyHit() const - { - if (geomID == RTC_INVALID_GEOMETRY_ID) return; - const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf)); - const bool vu = (abs(u) <= FLT_LARGE); - const bool vv = (abs(u) <= FLT_LARGE); - const bool vnx = abs(Ng.x) <= FLT_LARGE; - const bool vny = abs(Ng.y) <= FLT_LARGE; - const bool vnz = abs(Ng.z) <= FLT_LARGE; - if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t"); - if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u"); - if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v"); - if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x"); - if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y"); - if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z"); - } - - /* Hit data */ - Vec3f Ng; // not normalized geometry normal - float u; // barycentric u coordinate of hit - float v; // barycentric v coordinate of hit - unsigned int primID; // primitive ID - unsigned int geomID; // geometry ID - unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID - }; - - /* Converts ray packet to single rays */ - template - __forceinline void RayK::get(RayK<1>* ray) const - { - for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose - { - ray[i].org.x = org.x[i]; ray[i].org.y = org.y[i]; ray[i].org.z = org.z[i]; ray[i].tnear() = tnear()[i]; - ray[i].dir.x = dir.x[i]; ray[i].dir.y = dir.y[i]; ray[i].dir.z = dir.z[i]; ray[i].time() = time()[i]; - ray[i].tfar = tfar[i]; ray[i].mask = mask[i]; ray[i].id = id[i]; ray[i].flags = flags[i]; - } - } - - template - __forceinline void RayHitK::get(RayHitK<1>* ray) const - { - // FIXME: use SIMD transpose - for (size_t i = 0; i < K; i++) - get(i, ray[i]); - } - - /* Extracts a single ray out of a ray packet*/ - template - __forceinline void RayK::get(size_t i, RayK<1>& ray) const - { - ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; - ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.time() = time()[i]; - ray.tfar = tfar[i]; ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; - } - - template - __forceinline void RayHitK::get(size_t i, RayHitK<1>& ray) const - { - ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; - ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.tfar = tfar[i]; ray.time() = time()[i]; - ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; - ray.Ng.x = Ng.x[i]; ray.Ng.y = Ng.y[i]; ray.Ng.z = Ng.z[i]; - ray.u = u[i]; ray.v = v[i]; - ray.primID = primID[i]; ray.geomID = geomID[i]; - - instance_id_stack::copy(instID, ray.instID, i); - } - - /* Converts single rays to ray packet */ - template - __forceinline void RayK::set(const RayK<1>* ray) - { - // FIXME: use SIMD transpose - for (size_t i = 0; i < K; i++) - set(i, ray[i]); - } - - template - __forceinline void RayHitK::set(const RayHitK<1>* ray) - { - // FIXME: use SIMD transpose - for (size_t i = 0; i < K; i++) - set(i, ray[i]); - } - - /* inserts a single ray into a ray packet element */ - template - __forceinline void RayK::set(size_t i, const RayK<1>& ray) - { - org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); - dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); - tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; - } - - template - __forceinline void RayHitK::set(size_t i, const RayHitK<1>& ray) - { - org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); - dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); - tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; - Ng.x[i] = ray.Ng.x; Ng.y[i] = ray.Ng.y; Ng.z[i] = ray.Ng.z; - u[i] = ray.u; v[i] = ray.v; - primID[i] = ray.primID; geomID[i] = ray.geomID; - - instance_id_stack::copy(ray.instID, instID, i); - } - - /* copies a ray packet element into another element*/ - template - __forceinline void RayK::copy(size_t dest, size_t source) - { - org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; - dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; - tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; - } - - template - __forceinline void RayHitK::copy(size_t dest, size_t source) - { - org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; - dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; - tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; - Ng.x[dest] = Ng.x[source]; Ng.y[dest] = Ng.y[source]; Ng.z[dest] = Ng.z[source]; - u[dest] = u[source]; v[dest] = v[source]; - primID[dest] = primID[source]; geomID[dest] = geomID[source]; - - instance_id_stack::copy(instID, instID, source, dest); - } - - /* Shortcuts */ - typedef RayK<1> Ray; - typedef RayK<4> Ray4; - typedef RayK<8> Ray8; - typedef RayK<16> Ray16; - struct RayN; - - typedef RayHitK<1> RayHit; - typedef RayHitK<4> RayHit4; - typedef RayHitK<8> RayHit8; - typedef RayHitK<16> RayHit16; - struct RayHitN; - - template - struct RayTypeHelper; - - template - struct RayTypeHelper - { - typedef RayHitK Ty; - }; - - template - struct RayTypeHelper - { - typedef RayK Ty; - }; - - template - using RayType = typename RayTypeHelper<1, intersect>::Ty; - - template - using RayTypeK = typename RayTypeHelper::Ty; - - /* Outputs ray to stream */ - template - __forceinline embree_ostream operator <<(embree_ostream cout, const RayK& ray) - { - return cout << "{ " << embree_endl - << " org = " << ray.org << embree_endl - << " dir = " << ray.dir << embree_endl - << " near = " << ray.tnear() << embree_endl - << " far = " << ray.tfar << embree_endl - << " time = " << ray.time() << embree_endl - << " mask = " << ray.mask << embree_endl - << " id = " << ray.id << embree_endl - << " flags = " << ray.flags << embree_endl - << "}"; - } - - template - __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK& ray) - { - cout << "{ " << embree_endl - << " org = " << ray.org << embree_endl - << " dir = " << ray.dir << embree_endl - << " near = " << ray.tnear() << embree_endl - << " far = " << ray.tfar << embree_endl - << " time = " << ray.time() << embree_endl - << " mask = " << ray.mask << embree_endl - << " id = " << ray.id << embree_endl - << " flags = " << ray.flags << embree_endl - << " Ng = " << ray.Ng - << " u = " << ray.u << embree_endl - << " v = " << ray.v << embree_endl - << " primID = " << ray.primID << embree_endl - << " geomID = " << ray.geomID << embree_endl - << " instID ="; - for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) - { - cout << " " << ray.instID[l]; - } - cout << embree_endl; - return cout << "}"; - } - - struct RayStreamSOA - { - __forceinline RayStreamSOA(void* rays, size_t N) - : ptr((char*)rays), N(N) {} - - /* ray data access functions */ - __forceinline float* org_x(size_t offset = 0) { return (float*)&ptr[0*4*N+offset]; } // x coordinate of ray origin - __forceinline float* org_y(size_t offset = 0) { return (float*)&ptr[1*4*N+offset]; } // y coordinate of ray origin - __forceinline float* org_z(size_t offset = 0) { return (float*)&ptr[2*4*N+offset]; }; // z coordinate of ray origin - __forceinline float* tnear(size_t offset = 0) { return (float*)&ptr[3*4*N+offset]; }; // start of ray segment - - __forceinline float* dir_x(size_t offset = 0) { return (float*)&ptr[4*4*N+offset]; }; // x coordinate of ray direction - __forceinline float* dir_y(size_t offset = 0) { return (float*)&ptr[5*4*N+offset]; }; // y coordinate of ray direction - __forceinline float* dir_z(size_t offset = 0) { return (float*)&ptr[6*4*N+offset]; }; // z coordinate of ray direction - __forceinline float* time (size_t offset = 0) { return (float*)&ptr[7*4*N+offset]; }; // time of this ray for motion blur - - __forceinline float* tfar (size_t offset = 0) { return (float*)&ptr[8*4*N+offset]; }; // end of ray segment (set to hit distance) - __forceinline int* mask (size_t offset = 0) { return (int*)&ptr[9*4*N+offset]; }; // used to mask out objects during traversal (optional) - __forceinline int* id (size_t offset = 0) { return (int*)&ptr[10*4*N+offset]; }; // id - __forceinline int* flags(size_t offset = 0) { return (int*)&ptr[11*4*N+offset]; }; // flags - - /* hit data access functions */ - __forceinline float* Ng_x(size_t offset = 0) { return (float*)&ptr[12*4*N+offset]; }; // x coordinate of geometry normal - __forceinline float* Ng_y(size_t offset = 0) { return (float*)&ptr[13*4*N+offset]; }; // y coordinate of geometry normal - __forceinline float* Ng_z(size_t offset = 0) { return (float*)&ptr[14*4*N+offset]; }; // z coordinate of geometry normal - - __forceinline float* u(size_t offset = 0) { return (float*)&ptr[15*4*N+offset]; }; // barycentric u coordinate of hit - __forceinline float* v(size_t offset = 0) { return (float*)&ptr[16*4*N+offset]; }; // barycentric v coordinate of hit - - __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)&ptr[17*4*N+offset]; }; // primitive ID - __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; }; // geometry ID - __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; }; // instance ID - - __forceinline Ray getRayByOffset(size_t offset) - { - Ray ray; - ray.org.x = org_x(offset)[0]; - ray.org.y = org_y(offset)[0]; - ray.org.z = org_z(offset)[0]; - ray.tnear() = tnear(offset)[0]; - ray.dir.x = dir_x(offset)[0]; - ray.dir.y = dir_y(offset)[0]; - ray.dir.z = dir_z(offset)[0]; - ray.time() = time(offset)[0]; - ray.tfar = tfar(offset)[0]; - ray.mask = mask(offset)[0]; - ray.id = id(offset)[0]; - ray.flags = flags(offset)[0]; - return ray; - } - - template - __forceinline RayK getRayByOffset(size_t offset) - { - RayK ray; - ray.org.x = vfloat::loadu(org_x(offset)); - ray.org.y = vfloat::loadu(org_y(offset)); - ray.org.z = vfloat::loadu(org_z(offset)); - ray.tnear = vfloat::loadu(tnear(offset)); - ray.dir.x = vfloat::loadu(dir_x(offset)); - ray.dir.y = vfloat::loadu(dir_y(offset)); - ray.dir.z = vfloat::loadu(dir_z(offset)); - ray.time = vfloat::loadu(time(offset)); - ray.tfar = vfloat::loadu(tfar(offset)); - ray.mask = vint::loadu(mask(offset)); - ray.id = vint::loadu(id(offset)); - ray.flags = vint::loadu(flags(offset)); - return ray; - } - - template - __forceinline RayK getRayByOffset(const vbool& valid, size_t offset) - { - RayK ray; - ray.org.x = vfloat::loadu(valid, org_x(offset)); - ray.org.y = vfloat::loadu(valid, org_y(offset)); - ray.org.z = vfloat::loadu(valid, org_z(offset)); - ray.tnear() = vfloat::loadu(valid, tnear(offset)); - ray.dir.x = vfloat::loadu(valid, dir_x(offset)); - ray.dir.y = vfloat::loadu(valid, dir_y(offset)); - ray.dir.z = vfloat::loadu(valid, dir_z(offset)); - ray.time() = vfloat::loadu(valid, time(offset)); - ray.tfar = vfloat::loadu(valid, tfar(offset)); - -#if !defined(__AVX__) - /* SSE: some ray members must be loaded with scalar instructions to ensure that we don't cause memory faults, - because the SSE masked loads always access the entire vector */ - if (unlikely(!all(valid))) - { - ray.mask = zero; - ray.id = zero; - ray.flags = zero; - - for (size_t k = 0; k < K; k++) - { - if (likely(valid[k])) - { - ray.mask[k] = mask(offset)[k]; - ray.id[k] = id(offset)[k]; - ray.flags[k] = flags(offset)[k]; - } - } - } - else -#endif - { - ray.mask = vint::loadu(valid, mask(offset)); - ray.id = vint::loadu(valid, id(offset)); - ray.flags = vint::loadu(valid, flags(offset)); - } - - return ray; - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, size_t offset, const RayHitK& ray) - { - /* - * valid_i: stores which of the input rays exist (do not access nonexistent rays!) - * valid: stores which of the rays actually hit something. - */ - vbool valid = valid_i; - valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); - - if (likely(any(valid))) - { - vfloat::storeu(valid, tfar(offset), ray.tfar); - vfloat::storeu(valid, Ng_x(offset), ray.Ng.x); - vfloat::storeu(valid, Ng_y(offset), ray.Ng.y); - vfloat::storeu(valid, Ng_z(offset), ray.Ng.z); - vfloat::storeu(valid, u(offset), ray.u); - vfloat::storeu(valid, v(offset), ray.v); - -#if !defined(__AVX__) - /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults, - because the SSE masked stores always access the entire vector */ - if (unlikely(!all(valid_i))) - { - for (size_t k = 0; k < K; k++) - { - if (likely(valid[k])) - { - primID(offset)[k] = ray.primID[k]; - geomID(offset)[k] = ray.geomID[k]; - - instID(0, offset)[k] = ray.instID[0][k]; -#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) - for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) - instID(l, offset)[k] = ray.instID[l][k]; -#endif - } - } - } - else -#endif - { - vuint::storeu(valid, primID(offset), ray.primID); - vuint::storeu(valid, geomID(offset), ray.geomID); - - vuint::storeu(valid, instID(0, offset), ray.instID[0]); -#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) - for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) - vuint::storeu(valid, instID(l, offset), ray.instID[l]); -#endif - } - } - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, size_t offset, const RayK& ray) - { - vbool valid = valid_i; - valid &= (ray.tfar < 0.0f); - - if (likely(any(valid))) - vfloat::storeu(valid, tfar(offset), ray.tfar); - } - - __forceinline size_t getOctantByOffset(size_t offset) - { - const float dx = dir_x(offset)[0]; - const float dy = dir_y(offset)[0]; - const float dz = dir_z(offset)[0]; - const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0); - return octantID; - } - - __forceinline bool isValidByOffset(size_t offset) - { - const float nnear = tnear(offset)[0]; - const float ffar = tfar(offset)[0]; - return nnear <= ffar; - } - - template - __forceinline RayK getRayByOffset(const vbool& valid, const vint& offset) - { - RayK ray; - -#if defined(__AVX2__) - ray.org.x = vfloat::template gather<1>(valid, org_x(), offset); - ray.org.y = vfloat::template gather<1>(valid, org_y(), offset); - ray.org.z = vfloat::template gather<1>(valid, org_z(), offset); - ray.tnear() = vfloat::template gather<1>(valid, tnear(), offset); - ray.dir.x = vfloat::template gather<1>(valid, dir_x(), offset); - ray.dir.y = vfloat::template gather<1>(valid, dir_y(), offset); - ray.dir.z = vfloat::template gather<1>(valid, dir_z(), offset); - ray.time() = vfloat::template gather<1>(valid, time(), offset); - ray.tfar = vfloat::template gather<1>(valid, tfar(), offset); - ray.mask = vint::template gather<1>(valid, mask(), offset); - ray.id = vint::template gather<1>(valid, id(), offset); - ray.flags = vint::template gather<1>(valid, flags(), offset); -#else - ray.org = zero; - ray.tnear() = zero; - ray.dir = zero; - ray.time() = zero; - ray.tfar = zero; - ray.mask = zero; - ray.id = zero; - ray.flags = zero; - - for (size_t k = 0; k < K; k++) - { - if (likely(valid[k])) - { - const size_t ofs = offset[k]; - - ray.org.x[k] = *org_x(ofs); - ray.org.y[k] = *org_y(ofs); - ray.org.z[k] = *org_z(ofs); - ray.tnear()[k] = *tnear(ofs); - ray.dir.x[k] = *dir_x(ofs); - ray.dir.y[k] = *dir_y(ofs); - ray.dir.z[k] = *dir_z(ofs); - ray.time()[k] = *time(ofs); - ray.tfar[k] = *tfar(ofs); - ray.mask[k] = *mask(ofs); - ray.id[k] = *id(ofs); - ray.flags[k] = *flags(ofs); - } - } -#endif - - return ray; - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayHitK& ray) - { - vbool valid = valid_i; - valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); - - if (likely(any(valid))) - { -#if defined(__AVX512F__) - vfloat::template scatter<1>(valid, tfar(), offset, ray.tfar); - vfloat::template scatter<1>(valid, Ng_x(), offset, ray.Ng.x); - vfloat::template scatter<1>(valid, Ng_y(), offset, ray.Ng.y); - vfloat::template scatter<1>(valid, Ng_z(), offset, ray.Ng.z); - vfloat::template scatter<1>(valid, u(), offset, ray.u); - vfloat::template scatter<1>(valid, v(), offset, ray.v); - vuint::template scatter<1>(valid, primID(), offset, ray.primID); - vuint::template scatter<1>(valid, geomID(), offset, ray.geomID); - - vuint::template scatter<1>(valid, instID(0), offset, ray.instID[0]); -#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) - for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) - vuint::template scatter<1>(valid, instID(l), offset, ray.instID[l]); -#endif -#else - size_t valid_bits = movemask(valid); - while (valid_bits != 0) - { - const size_t k = bscf(valid_bits); - const size_t ofs = offset[k]; - - *tfar(ofs) = ray.tfar[k]; - - *Ng_x(ofs) = ray.Ng.x[k]; - *Ng_y(ofs) = ray.Ng.y[k]; - *Ng_z(ofs) = ray.Ng.z[k]; - *u(ofs) = ray.u[k]; - *v(ofs) = ray.v[k]; - *primID(ofs) = ray.primID[k]; - *geomID(ofs) = ray.geomID[k]; - - *instID(0, ofs) = ray.instID[0][k]; -#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) - for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) - *instID(l, ofs) = ray.instID[l][k]; -#endif - } -#endif - } - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayK& ray) - { - vbool valid = valid_i; - valid &= (ray.tfar < 0.0f); - - if (likely(any(valid))) - { -#if defined(__AVX512F__) - vfloat::template scatter<1>(valid, tfar(), offset, ray.tfar); -#else - size_t valid_bits = movemask(valid); - while (valid_bits != 0) - { - const size_t k = bscf(valid_bits); - const size_t ofs = offset[k]; - - *tfar(ofs) = ray.tfar[k]; - } -#endif - } - } - - char* __restrict__ ptr; - size_t N; - }; - - template - struct StackRayStreamSOA : public RayStreamSOA - { - __forceinline StackRayStreamSOA(size_t K) - : RayStreamSOA(data, K) { assert(K <= MAX_K); } - - char data[MAX_K / 4 * sizeof(RayHit4)]; - }; - - - struct RayStreamSOP - { - template - __forceinline void init(T& t) - { - org_x = (float*)&t.org.x; - org_y = (float*)&t.org.y; - org_z = (float*)&t.org.z; - tnear = (float*)&t.tnear; - dir_x = (float*)&t.dir.x; - dir_y = (float*)&t.dir.y; - dir_z = (float*)&t.dir.z; - time = (float*)&t.time; - tfar = (float*)&t.tfar; - mask = (unsigned int*)&t.mask; - id = (unsigned int*)&t.id; - flags = (unsigned int*)&t.flags; - - Ng_x = (float*)&t.Ng.x; - Ng_y = (float*)&t.Ng.y; - Ng_z = (float*)&t.Ng.z; - u = (float*)&t.u; - v = (float*)&t.v; - primID = (unsigned int*)&t.primID; - geomID = (unsigned int*)&t.geomID; - - for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) - instID[l] = (unsigned int*)&t.instID[l]; - } - - __forceinline Ray getRayByOffset(size_t offset) - { - Ray ray; - ray.org.x = *(float* __restrict__)((char*)org_x + offset); - ray.org.y = *(float* __restrict__)((char*)org_y + offset); - ray.org.z = *(float* __restrict__)((char*)org_z + offset); - ray.dir.x = *(float* __restrict__)((char*)dir_x + offset); - ray.dir.y = *(float* __restrict__)((char*)dir_y + offset); - ray.dir.z = *(float* __restrict__)((char*)dir_z + offset); - ray.tfar = *(float* __restrict__)((char*)tfar + offset); - ray.tnear() = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; - ray.time() = time ? *(float* __restrict__)((char*)time + offset) : 0.0f; - ray.mask = mask ? *(unsigned int* __restrict__)((char*)mask + offset) : -1; - ray.id = id ? *(unsigned int* __restrict__)((char*)id + offset) : -1; - ray.flags = flags ? *(unsigned int* __restrict__)((char*)flags + offset) : -1; - return ray; - } - - template - __forceinline RayK getRayByOffset(const vbool& valid, size_t offset) - { - RayK ray; - ray.org.x = vfloat::loadu(valid, (float* __restrict__)((char*)org_x + offset)); - ray.org.y = vfloat::loadu(valid, (float* __restrict__)((char*)org_y + offset)); - ray.org.z = vfloat::loadu(valid, (float* __restrict__)((char*)org_z + offset)); - ray.dir.x = vfloat::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); - ray.dir.y = vfloat::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); - ray.dir.z = vfloat::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); - ray.tfar = vfloat::loadu(valid, (float* __restrict__)((char*)tfar + offset)); - ray.tnear() = tnear ? vfloat::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; - ray.time() = time ? vfloat::loadu(valid, (float* __restrict__)((char*)time + offset)) : 0.0f; - ray.mask = mask ? vint::loadu(valid, (const void* __restrict__)((char*)mask + offset)) : -1; - ray.id = id ? vint::loadu(valid, (const void* __restrict__)((char*)id + offset)) : -1; - ray.flags = flags ? vint::loadu(valid, (const void* __restrict__)((char*)flags + offset)) : -1; - return ray; - } - - template - __forceinline Vec3vf getDirByOffset(const vbool& valid, size_t offset) - { - Vec3vf dir; - dir.x = vfloat::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); - dir.y = vfloat::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); - dir.z = vfloat::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); - return dir; - } - - __forceinline void setHitByOffset(size_t offset, const RayHit& ray) - { - if (ray.geomID != RTC_INVALID_GEOMETRY_ID) - { - *(float* __restrict__)((char*)tfar + offset) = ray.tfar; - - if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x; - if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y; - if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z; - *(float* __restrict__)((char*)u + offset) = ray.u; - *(float* __restrict__)((char*)v + offset) = ray.v; - *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID; - *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID; - - if (likely(instID[0])) { - *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0]; -#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) - for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l) - *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l]; -#endif - } - } - } - - __forceinline void setHitByOffset(size_t offset, const Ray& ray) - { - *(float* __restrict__)((char*)tfar + offset) = ray.tfar; - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, size_t offset, const RayHitK& ray) - { - vbool valid = valid_i; - valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); - - if (likely(any(valid))) - { - vfloat::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); - - if (likely(Ng_x)) vfloat::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x); - if (likely(Ng_y)) vfloat::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y); - if (likely(Ng_z)) vfloat::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z); - vfloat::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u); - vfloat::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v); - vuint::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID); - vuint::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID); - - if (likely(instID[0])) { - vuint::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]); -#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) - for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) - vuint::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]); -#endif - } - } - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, size_t offset, const RayK& ray) - { - vbool valid = valid_i; - valid &= (ray.tfar < 0.0f); - - if (likely(any(valid))) - vfloat::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); - } - - __forceinline size_t getOctantByOffset(size_t offset) - { - const float dx = *(float* __restrict__)((char*)dir_x + offset); - const float dy = *(float* __restrict__)((char*)dir_y + offset); - const float dz = *(float* __restrict__)((char*)dir_z + offset); - const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0); - return octantID; - } - - __forceinline bool isValidByOffset(size_t offset) - { - const float nnear = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; - const float ffar = *(float* __restrict__)((char*)tfar + offset); - return nnear <= ffar; - } - - template - __forceinline vbool isValidByOffset(const vbool& valid, size_t offset) - { - const vfloat nnear = tnear ? vfloat::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; - const vfloat ffar = vfloat::loadu(valid, (float* __restrict__)((char*)tfar + offset)); - return nnear <= ffar; - } - - template - __forceinline RayK getRayByOffset(const vbool& valid, const vint& offset) - { - RayK ray; - -#if defined(__AVX2__) - ray.org.x = vfloat::template gather<1>(valid, org_x, offset); - ray.org.y = vfloat::template gather<1>(valid, org_y, offset); - ray.org.z = vfloat::template gather<1>(valid, org_z, offset); - ray.dir.x = vfloat::template gather<1>(valid, dir_x, offset); - ray.dir.y = vfloat::template gather<1>(valid, dir_y, offset); - ray.dir.z = vfloat::template gather<1>(valid, dir_z, offset); - ray.tfar = vfloat::template gather<1>(valid, tfar, offset); - ray.tnear() = tnear ? vfloat::template gather<1>(valid, tnear, offset) : vfloat(zero); - ray.time() = time ? vfloat::template gather<1>(valid, time, offset) : vfloat(zero); - ray.mask = mask ? vint::template gather<1>(valid, (int*)mask, offset) : vint(-1); - ray.id = id ? vint::template gather<1>(valid, (int*)id, offset) : vint(-1); - ray.flags = flags ? vint::template gather<1>(valid, (int*)flags, offset) : vint(-1); -#else - ray.org = zero; - ray.tnear() = zero; - ray.dir = zero; - ray.tfar = zero; - ray.time() = zero; - ray.mask = zero; - ray.id = zero; - ray.flags = zero; - - for (size_t k = 0; k < K; k++) - { - if (likely(valid[k])) - { - const size_t ofs = offset[k]; - - ray.org.x[k] = *(float* __restrict__)((char*)org_x + ofs); - ray.org.y[k] = *(float* __restrict__)((char*)org_y + ofs); - ray.org.z[k] = *(float* __restrict__)((char*)org_z + ofs); - ray.dir.x[k] = *(float* __restrict__)((char*)dir_x + ofs); - ray.dir.y[k] = *(float* __restrict__)((char*)dir_y + ofs); - ray.dir.z[k] = *(float* __restrict__)((char*)dir_z + ofs); - ray.tfar[k] = *(float* __restrict__)((char*)tfar + ofs); - ray.tnear()[k] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f; - ray.time()[k] = time ? *(float* __restrict__)((char*)time + ofs) : 0.0f; - ray.mask[k] = mask ? *(int* __restrict__)((char*)mask + ofs) : -1; - ray.id[k] = id ? *(int* __restrict__)((char*)id + ofs) : -1; - ray.flags[k] = flags ? *(int* __restrict__)((char*)flags + ofs) : -1; - } - } -#endif - - return ray; - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayHitK& ray) - { - vbool valid = valid_i; - valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); - - if (likely(any(valid))) - { -#if defined(__AVX512F__) - vfloat::template scatter<1>(valid, tfar, offset, ray.tfar); - - if (likely(Ng_x)) vfloat::template scatter<1>(valid, Ng_x, offset, ray.Ng.x); - if (likely(Ng_y)) vfloat::template scatter<1>(valid, Ng_y, offset, ray.Ng.y); - if (likely(Ng_z)) vfloat::template scatter<1>(valid, Ng_z, offset, ray.Ng.z); - vfloat::template scatter<1>(valid, u, offset, ray.u); - vfloat::template scatter<1>(valid, v, offset, ray.v); - vuint::template scatter<1>(valid, (unsigned int*)geomID, offset, ray.geomID); - vuint::template scatter<1>(valid, (unsigned int*)primID, offset, ray.primID); - - if (likely(instID[0])) { - vuint::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]); -#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) - for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) - vuint::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]); -#endif - } -#else - size_t valid_bits = movemask(valid); - while (valid_bits != 0) - { - const size_t k = bscf(valid_bits); - const size_t ofs = offset[k]; - - *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; - - if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + ofs) = ray.Ng.x[k]; - if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + ofs) = ray.Ng.y[k]; - if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + ofs) = ray.Ng.z[k]; - *(float* __restrict__)((char*)u + ofs) = ray.u[k]; - *(float* __restrict__)((char*)v + ofs) = ray.v[k]; - *(unsigned int* __restrict__)((char*)primID + ofs) = ray.primID[k]; - *(unsigned int* __restrict__)((char*)geomID + ofs) = ray.geomID[k]; - - if (likely(instID[0])) { - *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k]; -#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) - for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) - *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k]; -#endif - } - } -#endif - } - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayK& ray) - { - vbool valid = valid_i; - valid &= (ray.tfar < 0.0f); - - if (likely(any(valid))) - { -#if defined(__AVX512F__) - vfloat::template scatter<1>(valid, tfar, offset, ray.tfar); -#else - size_t valid_bits = movemask(valid); - while (valid_bits != 0) - { - const size_t k = bscf(valid_bits); - const size_t ofs = offset[k]; - - *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; - } -#endif - } - } - - /* ray data */ - float* __restrict__ org_x; // x coordinate of ray origin - float* __restrict__ org_y; // y coordinate of ray origin - float* __restrict__ org_z; // z coordinate of ray origin - float* __restrict__ tnear; // start of ray segment (optional) - - float* __restrict__ dir_x; // x coordinate of ray direction - float* __restrict__ dir_y; // y coordinate of ray direction - float* __restrict__ dir_z; // z coordinate of ray direction - float* __restrict__ time; // time of this ray for motion blur (optional) - - float* __restrict__ tfar; // end of ray segment (set to hit distance) - unsigned int* __restrict__ mask; // used to mask out objects during traversal (optional) - unsigned int* __restrict__ id; // ray ID - unsigned int* __restrict__ flags; // ray flags - - /* hit data */ - float* __restrict__ Ng_x; // x coordinate of geometry normal (optional) - float* __restrict__ Ng_y; // y coordinate of geometry normal (optional) - float* __restrict__ Ng_z; // z coordinate of geometry normal (optional) - - float* __restrict__ u; // barycentric u coordinate of hit - float* __restrict__ v; // barycentric v coordinate of hit - - unsigned int* __restrict__ primID; // primitive ID - unsigned int* __restrict__ geomID; // geometry ID - unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional) - }; - - - struct RayStreamAOS - { - __forceinline RayStreamAOS(void* rays) - : ptr((Ray*)rays) {} - - __forceinline Ray& getRayByOffset(size_t offset) - { - return *(Ray*)((char*)ptr + offset); - } - - template - __forceinline RayK getRayByOffset(const vint& offset); - - template - __forceinline RayK getRayByOffset(const vbool& valid, const vint& offset) - { - const vint valid_offset = select(valid, offset, vintx(zero)); - return getRayByOffset(valid_offset); - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayHitK& ray) - { - vbool valid = valid_i; - valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); - - if (likely(any(valid))) - { -#if defined(__AVX512F__) - vfloat::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); - vfloat::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x); - vfloat::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y); - vfloat::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z); - vfloat::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u); - vfloat::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v); - vuint::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID); - vuint::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID); - - vuint::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]); -#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) - for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) - vuint::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]); -#endif -#else - size_t valid_bits = movemask(valid); - while (valid_bits != 0) - { - const size_t k = bscf(valid_bits); - RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]); - ray_k->tfar = ray.tfar[k]; - ray_k->Ng.x = ray.Ng.x[k]; - ray_k->Ng.y = ray.Ng.y[k]; - ray_k->Ng.z = ray.Ng.z[k]; - ray_k->u = ray.u[k]; - ray_k->v = ray.v[k]; - ray_k->primID = ray.primID[k]; - ray_k->geomID = ray.geomID[k]; - - instance_id_stack::copy(ray.instID, ray_k->instID, k); - } -#endif - } - } - - template - __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayK& ray) - { - vbool valid = valid_i; - valid &= (ray.tfar < 0.0f); - - if (likely(any(valid))) - { -#if defined(__AVX512F__) - vfloat::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); -#else - size_t valid_bits = movemask(valid); - while (valid_bits != 0) - { - const size_t k = bscf(valid_bits); - Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]); - ray_k->tfar = ray.tfar[k]; - } -#endif - } - } - - Ray* __restrict__ ptr; - }; - - template<> - __forceinline Ray4 RayStreamAOS::getRayByOffset(const vint4& offset) - { - Ray4 ray; - - /* load and transpose: org.x, org.y, org.z, tnear */ - const vfloat4 a0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->org); - const vfloat4 a1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->org); - const vfloat4 a2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->org); - const vfloat4 a3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->org); - - transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); - - /* load and transpose: dir.x, dir.y, dir.z, time */ - const vfloat4 b0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->dir); - const vfloat4 b1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->dir); - const vfloat4 b2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->dir); - const vfloat4 b3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->dir); - - transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); - - /* load and transpose: tfar, mask, id, flags */ - const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); - const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); - const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar); - const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); - - vfloat4 maskf, idf, flagsf; - transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); - ray.mask = asInt(maskf); - ray.id = asInt(idf); - ray.flags = asInt(flagsf); - - return ray; - } - -#if defined(__AVX__) - template<> - __forceinline Ray8 RayStreamAOS::getRayByOffset(const vint8& offset) - { - Ray8 ray; - - /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ - const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[0]))->org); - const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[1]))->org); - const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[2]))->org); - const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[3]))->org); - const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[4]))->org); - const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[5]))->org); - const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[6]))->org); - const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[7]))->org); - - transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); - - /* load and transpose: tfar, mask, id, flags */ - const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); - const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); - const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar); - const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); - const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[4]))->tfar); - const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[5]))->tfar); - const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[6]))->tfar); - const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[7]))->tfar); - - vfloat8 maskf, idf, flagsf; - transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); - ray.mask = asInt(maskf); - ray.id = asInt(idf); - ray.flags = asInt(flagsf); - - return ray; - } -#endif - -#if defined(__AVX512F__) - template<> - __forceinline Ray16 RayStreamAOS::getRayByOffset(const vint16& offset) - { - Ray16 ray; - - /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ - const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 0]))->org); - const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 1]))->org); - const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 2]))->org); - const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 3]))->org); - const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 4]))->org); - const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 5]))->org); - const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 6]))->org); - const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 7]))->org); - const vfloat8 ab8 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 8]))->org); - const vfloat8 ab9 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 9]))->org); - const vfloat8 ab10 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[10]))->org); - const vfloat8 ab11 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[11]))->org); - const vfloat8 ab12 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[12]))->org); - const vfloat8 ab13 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[13]))->org); - const vfloat8 ab14 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[14]))->org); - const vfloat8 ab15 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[15]))->org); - - transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, - ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); - - /* load and transpose: tfar, mask, id, flags */ - const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 0]))->tfar); - const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 1]))->tfar); - const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 2]))->tfar); - const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 3]))->tfar); - const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 4]))->tfar); - const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 5]))->tfar); - const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 6]))->tfar); - const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 7]))->tfar); - const vfloat4 c8 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 8]))->tfar); - const vfloat4 c9 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 9]))->tfar); - const vfloat4 c10 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[10]))->tfar); - const vfloat4 c11 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[11]))->tfar); - const vfloat4 c12 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[12]))->tfar); - const vfloat4 c13 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[13]))->tfar); - const vfloat4 c14 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[14]))->tfar); - const vfloat4 c15 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[15]))->tfar); - - vfloat16 maskf, idf, flagsf; - transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, - ray.tfar, maskf, idf, flagsf); - ray.mask = asInt(maskf); - ray.id = asInt(idf); - ray.flags = asInt(flagsf); - - return ray; - } -#endif - - - struct RayStreamAOP - { - __forceinline RayStreamAOP(void* rays) - : ptr((Ray**)rays) {} - - __forceinline Ray& getRayByIndex(size_t index) - { - return *ptr[index]; - } - - template - __forceinline RayK getRayByIndex(const vint& index); - - template - __forceinline RayK getRayByIndex(const vbool& valid, const vint& index) - { - const vint valid_index = select(valid, index, vintx(zero)); - return getRayByIndex(valid_index); - } - - template - __forceinline void setHitByIndex(const vbool& valid_i, const vint& index, const RayHitK& ray) - { - vbool valid = valid_i; - valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); - - if (likely(any(valid))) - { - size_t valid_bits = movemask(valid); - while (valid_bits != 0) - { - const size_t k = bscf(valid_bits); - RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]]; - - ray_k->tfar = ray.tfar[k]; - ray_k->Ng.x = ray.Ng.x[k]; - ray_k->Ng.y = ray.Ng.y[k]; - ray_k->Ng.z = ray.Ng.z[k]; - ray_k->u = ray.u[k]; - ray_k->v = ray.v[k]; - ray_k->primID = ray.primID[k]; - ray_k->geomID = ray.geomID[k]; - instance_id_stack::copy(ray.instID, ray_k->instID, k); - } - } - } - - template - __forceinline void setHitByIndex(const vbool& valid_i, const vint& index, const RayK& ray) - { - vbool valid = valid_i; - valid &= (ray.tfar < 0.0f); - - if (likely(any(valid))) - { - size_t valid_bits = movemask(valid); - while (valid_bits != 0) - { - const size_t k = bscf(valid_bits); - Ray* __restrict__ ray_k = ptr[index[k]]; - - ray_k->tfar = ray.tfar[k]; - } - } - } - - Ray** __restrict__ ptr; - }; - - template<> - __forceinline Ray4 RayStreamAOP::getRayByIndex(const vint4& index) - { - Ray4 ray; - - /* load and transpose: org.x, org.y, org.z, tnear */ - const vfloat4 a0 = vfloat4::loadu(&ptr[index[0]]->org); - const vfloat4 a1 = vfloat4::loadu(&ptr[index[1]]->org); - const vfloat4 a2 = vfloat4::loadu(&ptr[index[2]]->org); - const vfloat4 a3 = vfloat4::loadu(&ptr[index[3]]->org); - - transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); - - /* load and transpose: dir.x, dir.y, dir.z, time */ - const vfloat4 b0 = vfloat4::loadu(&ptr[index[0]]->dir); - const vfloat4 b1 = vfloat4::loadu(&ptr[index[1]]->dir); - const vfloat4 b2 = vfloat4::loadu(&ptr[index[2]]->dir); - const vfloat4 b3 = vfloat4::loadu(&ptr[index[3]]->dir); - - transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); - - /* load and transpose: tfar, mask, id, flags */ - const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); - const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); - const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); - const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); - - vfloat4 maskf, idf, flagsf; - transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); - ray.mask = asInt(maskf); - ray.id = asInt(idf); - ray.flags = asInt(flagsf); - - return ray; - } - -#if defined(__AVX__) - template<> - __forceinline Ray8 RayStreamAOP::getRayByIndex(const vint8& index) - { - Ray8 ray; - - /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ - const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); - const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); - const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); - const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); - const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); - const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); - const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); - const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); - - transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); - - /* load and transpose: tfar, mask, id, flags */ - const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); - const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); - const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); - const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); - const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); - const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar); - const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); - const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar); - - vfloat8 maskf, idf, flagsf; - transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); - ray.mask = asInt(maskf); - ray.id = asInt(idf); - ray.flags = asInt(flagsf); - - return ray; - } -#endif - -#if defined(__AVX512F__) - template<> - __forceinline Ray16 RayStreamAOP::getRayByIndex(const vint16& index) - { - Ray16 ray; - - /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ - const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); - const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); - const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); - const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); - const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); - const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); - const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); - const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); - const vfloat8 ab8 = vfloat8::loadu(&ptr[index[8]]->org); - const vfloat8 ab9 = vfloat8::loadu(&ptr[index[9]]->org); - const vfloat8 ab10 = vfloat8::loadu(&ptr[index[10]]->org); - const vfloat8 ab11 = vfloat8::loadu(&ptr[index[11]]->org); - const vfloat8 ab12 = vfloat8::loadu(&ptr[index[12]]->org); - const vfloat8 ab13 = vfloat8::loadu(&ptr[index[13]]->org); - const vfloat8 ab14 = vfloat8::loadu(&ptr[index[14]]->org); - const vfloat8 ab15 = vfloat8::loadu(&ptr[index[15]]->org); - - transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, - ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); - - /* load and transpose: tfar, mask, id, flags */ - const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); - const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); - const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); - const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); - const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); - const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar); - const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); - const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar); - const vfloat4 c8 = vfloat4::loadu(&ptr[index[8]]->tfar); - const vfloat4 c9 = vfloat4::loadu(&ptr[index[9]]->tfar); - const vfloat4 c10 = vfloat4::loadu(&ptr[index[10]]->tfar); - const vfloat4 c11 = vfloat4::loadu(&ptr[index[11]]->tfar); - const vfloat4 c12 = vfloat4::loadu(&ptr[index[12]]->tfar); - const vfloat4 c13 = vfloat4::loadu(&ptr[index[13]]->tfar); - const vfloat4 c14 = vfloat4::loadu(&ptr[index[14]]->tfar); - const vfloat4 c15 = vfloat4::loadu(&ptr[index[15]]->tfar); - - vfloat16 maskf, idf, flagsf; - transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, - ray.tfar, maskf, idf, flagsf); - - ray.mask = asInt(maskf); - ray.id = asInt(idf); - ray.flags = asInt(flagsf); - - return ray; - } -#endif -} diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp deleted file mode 100644 index 625fbf6d4f..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp +++ /dev/null @@ -1,1799 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#define RTC_EXPORT_API - -#include "default.h" -#include "device.h" -#include "scene.h" -#include "context.h" -#include "../../include/embree3/rtcore_ray.h" - -#if defined(__aarch64__) && defined(BUILD_IOS) -#include -#endif - -using namespace embree; - -RTC_NAMESPACE_BEGIN; - - /* mutex to make API thread safe */ -#if defined(__aarch64__) && defined(BUILD_IOS) - static std::mutex g_mutex; -#else - static MutexSys g_mutex; -#endif - - RTC_API RTCDevice rtcNewDevice(const char* config) - { - RTC_CATCH_BEGIN; - RTC_TRACE(rtcNewDevice); -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(g_mutex); -#else - Lock lock(g_mutex); -#endif - Device* device = new Device(config); - return (RTCDevice) device->refInc(); - RTC_CATCH_END(nullptr); - return (RTCDevice) nullptr; - } - - RTC_API void rtcRetainDevice(RTCDevice hdevice) - { - Device* device = (Device*) hdevice; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcRetainDevice); - RTC_VERIFY_HANDLE(hdevice); -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(g_mutex); -#else - Lock lock(g_mutex); -#endif - device->refInc(); - RTC_CATCH_END(nullptr); - } - - RTC_API void rtcReleaseDevice(RTCDevice hdevice) - { - Device* device = (Device*) hdevice; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcReleaseDevice); - RTC_VERIFY_HANDLE(hdevice); -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(g_mutex); -#else - Lock lock(g_mutex); -#endif - device->refDec(); - RTC_CATCH_END(nullptr); - } - - RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop) - { - Device* device = (Device*) hdevice; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetDeviceProperty); - RTC_VERIFY_HANDLE(hdevice); -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(g_mutex); -#else - Lock lock(g_mutex); -#endif - return device->getProperty(prop); - RTC_CATCH_END(device); - return 0; - } - - RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val) - { - Device* device = (Device*) hdevice; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetDeviceProperty); - const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004; - if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(g_mutex); -#else - Lock lock(g_mutex); -#endif - device->setProperty(prop,val); - RTC_CATCH_END(device); - } - - RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice) - { - Device* device = (Device*) hdevice; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetDeviceError); - if (device == nullptr) return Device::getThreadErrorCode(); - else return device->getDeviceErrorCode(); - RTC_CATCH_END(device); - return RTC_ERROR_UNKNOWN; - } - - RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr) - { - Device* device = (Device*) hdevice; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetDeviceErrorFunction); - RTC_VERIFY_HANDLE(hdevice); - device->setErrorFunction(error, userPtr); - RTC_CATCH_END(device); - } - - RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr) - { - Device* device = (Device*) hdevice; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetDeviceMemoryMonitorFunction); - device->setMemoryMonitorFunction(memoryMonitor, userPtr); - RTC_CATCH_END(device); - } - - RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize) - { - RTC_CATCH_BEGIN; - RTC_TRACE(rtcNewBuffer); - RTC_VERIFY_HANDLE(hdevice); - Buffer* buffer = new Buffer((Device*)hdevice, byteSize); - return (RTCBuffer)buffer->refInc(); - RTC_CATCH_END((Device*)hdevice); - return nullptr; - } - - RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize) - { - RTC_CATCH_BEGIN; - RTC_TRACE(rtcNewSharedBuffer); - RTC_VERIFY_HANDLE(hdevice); - Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr); - return (RTCBuffer)buffer->refInc(); - RTC_CATCH_END((Device*)hdevice); - return nullptr; - } - - RTC_API void* rtcGetBufferData(RTCBuffer hbuffer) - { - Buffer* buffer = (Buffer*)hbuffer; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetBufferData); - RTC_VERIFY_HANDLE(hbuffer); - return buffer->data(); - RTC_CATCH_END2(buffer); - return nullptr; - } - - RTC_API void rtcRetainBuffer(RTCBuffer hbuffer) - { - Buffer* buffer = (Buffer*)hbuffer; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcRetainBuffer); - RTC_VERIFY_HANDLE(hbuffer); - buffer->refInc(); - RTC_CATCH_END2(buffer); - } - - RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer) - { - Buffer* buffer = (Buffer*)hbuffer; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcReleaseBuffer); - RTC_VERIFY_HANDLE(hbuffer); - buffer->refDec(); - RTC_CATCH_END2(buffer); - } - - RTC_API RTCScene rtcNewScene (RTCDevice hdevice) - { - RTC_CATCH_BEGIN; - RTC_TRACE(rtcNewScene); - RTC_VERIFY_HANDLE(hdevice); - Scene* scene = new Scene((Device*)hdevice); - return (RTCScene) scene->refInc(); - RTC_CATCH_END((Device*)hdevice); - return nullptr; - } - - RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetSceneDevice); - RTC_VERIFY_HANDLE(hscene); - return (RTCDevice)scene->device->refInc(); // user will own one additional device reference - RTC_CATCH_END2(scene); - return (RTCDevice)nullptr; - } - - RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetSceneProgressMonitorFunction); - RTC_VERIFY_HANDLE(hscene); -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(g_mutex); -#else - Lock lock(g_mutex); -#endif - scene->setProgressMonitorFunction(progress,ptr); - RTC_CATCH_END2(scene); - } - - RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetSceneBuildQuality); - RTC_VERIFY_HANDLE(hscene); - if (quality != RTC_BUILD_QUALITY_LOW && - quality != RTC_BUILD_QUALITY_MEDIUM && - quality != RTC_BUILD_QUALITY_HIGH) - // -- GODOT start -- - // throw std::runtime_error("invalid build quality"); - abort(); - // -- GODOT end -- - scene->setBuildQuality(quality); - RTC_CATCH_END2(scene); - } - - RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetSceneFlags); - RTC_VERIFY_HANDLE(hscene); - scene->setSceneFlags(flags); - RTC_CATCH_END2(scene); - } - - RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetSceneFlags); - RTC_VERIFY_HANDLE(hscene); - return scene->getSceneFlags(); - RTC_CATCH_END2(scene); - return RTC_SCENE_FLAG_NONE; - } - - RTC_API void rtcCommitScene (RTCScene hscene) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcCommitScene); - RTC_VERIFY_HANDLE(hscene); - scene->commit(false); - RTC_CATCH_END2(scene); - } - - RTC_API void rtcJoinCommitScene (RTCScene hscene) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcJoinCommitScene); - RTC_VERIFY_HANDLE(hscene); - scene->commit(true); - RTC_CATCH_END2(scene); - } - - RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetSceneBounds); - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - BBox3fa bounds = scene->bounds.bounds(); - bounds_o->lower_x = bounds.lower.x; - bounds_o->lower_y = bounds.lower.y; - bounds_o->lower_z = bounds.lower.z; - bounds_o->align0 = 0; - bounds_o->upper_x = bounds.upper.x; - bounds_o->upper_y = bounds.upper.y; - bounds_o->upper_z = bounds.upper.z; - bounds_o->align1 = 0; - RTC_CATCH_END2(scene); - } - - RTC_API void rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetSceneBounds); - RTC_VERIFY_HANDLE(hscene); - if (bounds_o == nullptr) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer"); - if (scene->isModified()) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - - bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x; - bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y; - bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z; - bounds_o->bounds0.align0 = 0; - bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x; - bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y; - bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z; - bounds_o->bounds0.align1 = 0; - bounds_o->bounds1.lower_x = scene->bounds.bounds1.lower.x; - bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y; - bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z; - bounds_o->bounds1.align0 = 0; - bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x; - bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y; - bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z; - bounds_o->bounds1.align1 = 0; - RTC_CATCH_END2(scene); - } - - RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr) - { - Scene* scene0 = (Scene*) hscene0; - Scene* scene1 = (Scene*) hscene1; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcCollide); -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene0); - RTC_VERIFY_HANDLE(hscene1); - if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); - if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); - if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices"); - auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); - auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); - if (scene0->numPrimitives() != nUserPrims0 && scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep"); -#endif - scene0->intersectors.collide(scene0,scene1,callback,userPtr); - RTC_CATCH_END(scene0->device); - } - - inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) - { - bool changed = false; - if (userContext->instStackSize > 0) - { - const AffineSpace3fa transform = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); - - float similarityScale = 0.f; - const bool similtude = similarityTransform(transform, &similarityScale); - assert((similtude && similarityScale > 0) || (!similtude && similarityScale == 0.f)); - - PointQuery query_inst; - query_inst.p = xfmPoint(transform, Vec3fa(query->x, query->y, query->z)); - query_inst.radius = query->radius * similarityScale; - query_inst.time = query->time; - - PointQueryContext context_inst(scene, (PointQuery*)query, - similtude ? POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB, - queryFunc, userContext, similarityScale, userPtr); - changed = scene->intersectors.pointQuery((PointQuery*)&query_inst, &context_inst); - } - else - { - PointQueryContext context(scene, (PointQuery*)query, - POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr); - changed = scene->intersectors.pointQuery((PointQuery*)query, &context); - } - return changed; - } - - RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcPointQuery); -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - RTC_VERIFY_HANDLE(userContext); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); - if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); - if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes"); -#endif - - return pointQuery(scene, query, userContext, queryFunc, userPtr); - RTC_CATCH_END2_FALSE(scene); - } - - RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcPointQuery4); - -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); - if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); - if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); -#endif - STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); - STAT3(point_query.travs,cnt,cnt,cnt); - - bool changed = false; - PointQuery4* query4 = (PointQuery4*)query; - PointQuery query1; - for (size_t i=0; i<4; i++) { - if (!valid[i]) continue; - query4->get(i,query1); - changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); - query4->set(i,query1); - } - return changed; - RTC_CATCH_END2_FALSE(scene); - } - - RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcPointQuery8); - -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); - if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); - if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); -#endif - STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); - STAT3(point_query.travs,cnt,cnt,cnt); - - bool changed = false; - PointQuery8* query8 = (PointQuery8*)query; - PointQuery query1; - for (size_t i=0; i<8; i++) { - if (!valid[i]) continue; - query8->get(i,query1); - changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); - query8->set(i,query1); - } - return changed; - RTC_CATCH_END2_FALSE(scene); - } - - RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcPointQuery16); - -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); - if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); - if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); -#endif - STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); - STAT3(point_query.travs,cnt,cnt,cnt); - - bool changed = false; - PointQuery16* query16 = (PointQuery16*)query; - PointQuery query1; - for (size_t i=0; i<16; i++) { - if (!valid[i]) continue; - PointQuery query1; query16->get(i,query1); - changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); - query16->set(i,query1); - } - return changed; - RTC_CATCH_END2_FALSE(scene); - } - - RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcIntersect1); -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); -#endif - STAT3(normal.travs,1,1,1); - IntersectContext context(scene,user_context); - scene->intersectors.intersect(*rayhit,&context); -#if defined(DEBUG) - ((RayHit*)rayhit)->verifyHit(); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcIntersect4); - -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); - if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes"); -#endif - STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); - STAT3(normal.travs,cnt,cnt,cnt); - - IntersectContext context(scene,user_context); -#if !defined(EMBREE_RAY_PACKETS) - RayHit4* rayhit4 = (RayHit4*)rayhit; - for (size_t i=0; i<4; i++) { - if (!valid[i]) continue; - RayHit ray1; rayhit4->get(i,ray1); - scene->intersectors.intersect((RTCRayHit&)ray1,&context); - rayhit4->set(i,ray1); - } -#else - scene->intersectors.intersect4(valid,*rayhit,&context); -#endif - - RTC_CATCH_END2(scene); - } - - RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcIntersect8); - -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); - if (((size_t)rayhit) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes"); -#endif - STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); - STAT3(normal.travs,cnt,cnt,cnt); - - IntersectContext context(scene,user_context); -#if !defined(EMBREE_RAY_PACKETS) - RayHit8* rayhit8 = (RayHit8*) rayhit; - for (size_t i=0; i<8; i++) { - if (!valid[i]) continue; - RayHit ray1; rayhit8->get(i,ray1); - scene->intersectors.intersect((RTCRayHit&)ray1,&context); - rayhit8->set(i,ray1); - } -#else - if (likely(scene->intersectors.intersector8)) - scene->intersectors.intersect8(valid,*rayhit,&context); - else - scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcIntersect16); - -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); - if (((size_t)rayhit) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes"); -#endif - STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); - STAT3(normal.travs,cnt,cnt,cnt); - - IntersectContext context(scene,user_context); -#if !defined(EMBREE_RAY_PACKETS) - RayHit16* rayhit16 = (RayHit16*) rayhit; - for (size_t i=0; i<16; i++) { - if (!valid[i]) continue; - RayHit ray1; rayhit16->get(i,ray1); - scene->intersectors.intersect((RTCRayHit&)ray1,&context); - rayhit16->set(i,ray1); - } -#else - if (likely(scene->intersectors.intersector16)) - scene->intersectors.intersect16(valid,*rayhit,&context); - else - scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcIntersect1M); - -#if defined (EMBREE_RAY_PACKETS) -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); -#endif - STAT3(normal.travs,M,M,M); - IntersectContext context(scene,user_context); - - /* fast codepath for single rays */ - if (likely(M == 1)) { - if (likely(rayhit->ray.tnear <= rayhit->ray.tfar)) - scene->intersectors.intersect(*rayhit,&context); - } - - /* codepath for streams */ - else { - scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context); - } -#else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported"); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcIntersect1Mp); - -#if defined (EMBREE_RAY_PACKETS) -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); -#endif - STAT3(normal.travs,M,M,M); - IntersectContext context(scene,user_context); - - /* fast codepath for single rays */ - if (likely(M == 1)) { - if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar)) - scene->intersectors.intersect(*rn[0],&context); - } - - /* codepath for streams */ - else { - scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context); - } -#else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported"); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcIntersectNM); - -#if defined (EMBREE_RAY_PACKETS) -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); -#endif - STAT3(normal.travs,N*M,N*M,N*M); - IntersectContext context(scene,user_context); - - /* code path for single ray streams */ - if (likely(N == 1)) - { - /* fast code path for streams of size 1 */ - if (likely(M == 1)) { - if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar)) - scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context); - } - /* normal codepath for single ray streams */ - else { - scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context); - } - } - /* code path for ray packet streams */ - else { - scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context); - } -#else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported"); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcIntersectNp); - -#if defined (EMBREE_RAY_PACKETS) -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes"); - if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes"); - if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes"); - if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); - if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes"); - if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes"); - if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); - if (((size_t)rayhit->ray.tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes"); - if (((size_t)rayhit->ray.time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes"); - if (((size_t)rayhit->ray.mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes"); - if (((size_t)rayhit->hit.Ng_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes"); - if (((size_t)rayhit->hit.Ng_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes"); - if (((size_t)rayhit->hit.Ng_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes"); - if (((size_t)rayhit->hit.u ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes"); - if (((size_t)rayhit->hit.v ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes"); - if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes"); - if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes"); - if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes"); -#endif - STAT3(normal.travs,N,N,N); - IntersectContext context(scene,user_context); - scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context); -#else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported"); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcOccluded1); - STAT3(shadow.travs,1,1,1); -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); -#endif - IntersectContext context(scene,user_context); - scene->intersectors.occluded(*ray,&context); - RTC_CATCH_END2(scene); - } - - RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcOccluded4); - -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); - if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); -#endif - STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); - STAT3(shadow.travs,cnt,cnt,cnt); - - IntersectContext context(scene,user_context); -#if !defined(EMBREE_RAY_PACKETS) - Ray4* ray4 = (Ray4*) ray; - for (size_t i=0; i<4; i++) { - if (!valid[i]) continue; - Ray ray1; ray4->get(i,ray1); - scene->intersectors.occluded((RTCRay&)ray1,&context); - ray4->set(i,ray1); - } -#else - scene->intersectors.occluded4(valid,*ray,&context); -#endif - - RTC_CATCH_END2(scene); - } - - RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcOccluded8); - -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); - if (((size_t)ray) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes"); -#endif - STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); - STAT3(shadow.travs,cnt,cnt,cnt); - - IntersectContext context(scene,user_context); -#if !defined(EMBREE_RAY_PACKETS) - Ray8* ray8 = (Ray8*) ray; - for (size_t i=0; i<8; i++) { - if (!valid[i]) continue; - Ray ray1; ray8->get(i,ray1); - scene->intersectors.occluded((RTCRay&)ray1,&context); - ray8->set(i,ray1); - } -#else - if (likely(scene->intersectors.intersector8)) - scene->intersectors.occluded8(valid,*ray,&context); - else - scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context); -#endif - - RTC_CATCH_END2(scene); - } - - RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcOccluded16); - -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); - if (((size_t)ray) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes"); -#endif - STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); - STAT3(shadow.travs,cnt,cnt,cnt); - - IntersectContext context(scene,user_context); -#if !defined(EMBREE_RAY_PACKETS) - Ray16* ray16 = (Ray16*) ray; - for (size_t i=0; i<16; i++) { - if (!valid[i]) continue; - Ray ray1; ray16->get(i,ray1); - scene->intersectors.occluded((RTCRay&)ray1,&context); - ray16->set(i,ray1); - } -#else - if (likely(scene->intersectors.intersector16)) - scene->intersectors.occluded16(valid,*ray,&context); - else - scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context); -#endif - - RTC_CATCH_END2(scene); - } - - RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcOccluded1M); - -#if defined (EMBREE_RAY_PACKETS) -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); -#endif - STAT3(shadow.travs,M,M,M); - IntersectContext context(scene,user_context); - /* fast codepath for streams of size 1 */ - if (likely(M == 1)) { - if (likely(ray->tnear <= ray->tfar)) - scene->intersectors.occluded (*ray,&context); - } - /* codepath for normal streams */ - else { - scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context); - } -#else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported"); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcOccluded1Mp); - -#if defined (EMBREE_RAY_PACKETS) -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); -#endif - STAT3(shadow.travs,M,M,M); - IntersectContext context(scene,user_context); - - /* fast codepath for streams of size 1 */ - if (likely(M == 1)) { - if (likely(ray[0]->tnear <= ray[0]->tfar)) - scene->intersectors.occluded (*ray[0],&context); - } - /* codepath for normal streams */ - else { - scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context); - } -#else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported"); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcOccludedNM); - -#if defined (EMBREE_RAY_PACKETS) -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small"); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); -#endif - STAT3(shadow.travs,N*M,N*N,N*N); - IntersectContext context(scene,user_context); - - /* codepath for single rays */ - if (likely(N == 1)) - { - /* fast path for streams of size 1 */ - if (likely(M == 1)) { - if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar)) - scene->intersectors.occluded (*(RTCRay*)ray,&context); - } - /* codepath for normal ray streams */ - else { - scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context); - } - } - /* code path for ray packet streams */ - else { - scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context); - } -#else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported"); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcOccludedNp); - -#if defined (EMBREE_RAY_PACKETS) -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); - if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes"); - if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes"); - if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes"); - if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); - if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes"); - if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes"); - if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); - if (((size_t)ray->tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes"); - if (((size_t)ray->time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes"); - if (((size_t)ray->mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes"); -#endif - STAT3(shadow.travs,N,N,N); - IntersectContext context(scene,user_context); - scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context); -#else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported"); -#endif - RTC_CATCH_END2(scene); - } - - RTC_API void rtcRetainScene (RTCScene hscene) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcRetainScene); - RTC_VERIFY_HANDLE(hscene); - scene->refInc(); - RTC_CATCH_END2(scene); - } - - RTC_API void rtcReleaseScene (RTCScene hscene) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcReleaseScene); - RTC_VERIFY_HANDLE(hscene); - scene->refDec(); - RTC_CATCH_END2(scene); - } - - RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene) - { - Geometry* geometry = (Geometry*) hgeometry; - Ref scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryInstancedScene); - RTC_VERIFY_HANDLE(hgeometry); - RTC_VERIFY_HANDLE(hscene); - geometry->setInstancedScene(scene); - RTC_CATCH_END2(geometry); - } - - AffineSpace3fa loadTransform(RTCFormat format, const float* xfm) - { - AffineSpace3fa space = one; - switch (format) - { - case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: - space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 4], xfm[ 8]), - Vec3fa(xfm[ 1], xfm[ 5], xfm[ 9]), - Vec3fa(xfm[ 2], xfm[ 6], xfm[10]), - Vec3fa(xfm[ 3], xfm[ 7], xfm[11])); - break; - - case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: - space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), - Vec3fa(xfm[ 3], xfm[ 4], xfm[ 5]), - Vec3fa(xfm[ 6], xfm[ 7], xfm[ 8]), - Vec3fa(xfm[ 9], xfm[10], xfm[11])); - break; - - case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: - space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), - Vec3fa(xfm[ 4], xfm[ 5], xfm[ 6]), - Vec3fa(xfm[ 8], xfm[ 9], xfm[10]), - Vec3fa(xfm[12], xfm[13], xfm[14])); - break; - - default: - throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); - break; - } - return space; - } - - void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm) - { - switch (format) - { - case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: - xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vy.x; xfm[ 2] = space.l.vz.x; xfm[ 3] = space.p.x; - xfm[ 4] = space.l.vx.y; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vz.y; xfm[ 7] = space.p.y; - xfm[ 8] = space.l.vx.z; xfm[ 9] = space.l.vy.z; xfm[10] = space.l.vz.z; xfm[11] = space.p.z; - break; - - case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: - xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; - xfm[ 3] = space.l.vy.x; xfm[ 4] = space.l.vy.y; xfm[ 5] = space.l.vy.z; - xfm[ 6] = space.l.vz.x; xfm[ 7] = space.l.vz.y; xfm[ 8] = space.l.vz.z; - xfm[ 9] = space.p.x; xfm[10] = space.p.y; xfm[11] = space.p.z; - break; - - case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: - xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; xfm[ 3] = 0.f; - xfm[ 4] = space.l.vy.x; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vy.z; xfm[ 7] = 0.f; - xfm[ 8] = space.l.vz.x; xfm[ 9] = space.l.vz.y; xfm[10] = space.l.vz.z; xfm[11] = 0.f; - xfm[12] = space.p.x; xfm[13] = space.p.y; xfm[14] = space.p.z; xfm[15] = 1.f; - break; - - default: - throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); - break; - } - } - - RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryTransform); - RTC_VERIFY_HANDLE(hgeometry); - RTC_VERIFY_HANDLE(xfm); - const AffineSpace3fa transform = loadTransform(format, (const float*)xfm); - geometry->setTransform(transform, timeStep); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryTransformQuaternion); - RTC_VERIFY_HANDLE(hgeometry); - RTC_VERIFY_HANDLE(qd); - - AffineSpace3fx transform; - transform.l.vx.x = qd->scale_x; - transform.l.vy.y = qd->scale_y; - transform.l.vz.z = qd->scale_z; - transform.l.vy.x = qd->skew_xy; - transform.l.vz.x = qd->skew_xz; - transform.l.vz.y = qd->skew_yz; - transform.l.vx.y = qd->translation_x; - transform.l.vx.z = qd->translation_y; - transform.l.vy.z = qd->translation_z; - transform.p.x = qd->shift_x; - transform.p.y = qd->shift_y; - transform.p.z = qd->shift_z; - - // normalize quaternion - Quaternion3f q(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k); - q = normalize(q); - transform.l.vx.w = q.i; - transform.l.vy.w = q.j; - transform.l.vz.w = q.k; - transform.p.w = q.r; - - geometry->setQuaternionDecomposition(transform, timeStep); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetGeometryTransform); - const AffineSpace3fa transform = geometry->getTransform(time); - storeTransform(transform, format, (float*)xfm); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) - { - IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i; - args->report(args,filter_args); - } - - RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) - { - OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i; - args->report(args,filter_args); - } - - RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type) - { - Device* device = (Device*) hdevice; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcNewGeometry); - RTC_VERIFY_HANDLE(hdevice); - - switch (type) - { - case RTC_GEOMETRY_TYPE_TRIANGLE: - { -#if defined(EMBREE_GEOMETRY_TRIANGLE) - createTriangleMeshTy createTriangleMesh = nullptr; - SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createTriangleMesh); - Geometry* geom = createTriangleMesh(device); - return (RTCGeometry) geom->refInc(); -#else - throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported"); -#endif - } - - case RTC_GEOMETRY_TYPE_QUAD: - { -#if defined(EMBREE_GEOMETRY_QUAD) - createQuadMeshTy createQuadMesh = nullptr; - SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createQuadMesh); - Geometry* geom = createQuadMesh(device); - return (RTCGeometry) geom->refInc(); -#else - throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported"); -#endif - } - - case RTC_GEOMETRY_TYPE_SPHERE_POINT: - case RTC_GEOMETRY_TYPE_DISC_POINT: - case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: - { -#if defined(EMBREE_GEOMETRY_POINT) - createPointsTy createPoints = nullptr; - SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_builder_cpu_features, createPoints); - - Geometry *geom; - switch(type) { - case RTC_GEOMETRY_TYPE_SPHERE_POINT: - geom = createPoints(device, Geometry::GTY_SPHERE_POINT); - break; - case RTC_GEOMETRY_TYPE_DISC_POINT: - geom = createPoints(device, Geometry::GTY_DISC_POINT); - break; - case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: - geom = createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT); - break; - default: - geom = nullptr; - break; - } - return (RTCGeometry) geom->refInc(); -#else - throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported"); -#endif - } - - case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE: - case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE: - case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE: - - case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE: - case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE: - case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE: - - case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE: - case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE: - case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE: - - case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE: - case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE: - case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE: - - case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE: - case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE: - case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE: - { -#if defined(EMBREE_GEOMETRY_CURVE) - createLineSegmentsTy createLineSegments = nullptr; - SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createLineSegments); - createCurvesTy createCurves = nullptr; - SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createCurves); - - Geometry* geom; - switch (type) { - case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break; - case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break; - case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break; - //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break; - - case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break; - case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break; - case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break; - - case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break; - case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break; - case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break; - - case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break; - case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break; - case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break; - - case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break; - case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break; - case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break; - default: geom = nullptr; break; - } - return (RTCGeometry) geom->refInc(); -#else - throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported"); -#endif - } - - case RTC_GEOMETRY_TYPE_SUBDIVISION: - { -#if defined(EMBREE_GEOMETRY_SUBDIVISION) - createSubdivMeshTy createSubdivMesh = nullptr; - SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh); - //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason? - Geometry* geom = createSubdivMesh(device); - return (RTCGeometry) geom->refInc(); -#else - throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported"); -#endif - } - - case RTC_GEOMETRY_TYPE_USER: - { -#if defined(EMBREE_GEOMETRY_USER) - createUserGeometryTy createUserGeometry = nullptr; - SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createUserGeometry); - Geometry* geom = createUserGeometry(device); - return (RTCGeometry) geom->refInc(); -#else - throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported"); -#endif - } - - case RTC_GEOMETRY_TYPE_INSTANCE: - { -#if defined(EMBREE_GEOMETRY_INSTANCE) - createInstanceTy createInstance = nullptr; - SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createInstance); - Geometry* geom = createInstance(device); - return (RTCGeometry) geom->refInc(); -#else - throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported"); -#endif - } - - case RTC_GEOMETRY_TYPE_GRID: - { -#if defined(EMBREE_GEOMETRY_GRID) - createGridMeshTy createGridMesh = nullptr; - SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createGridMesh); - Geometry* geom = createGridMesh(device); - return (RTCGeometry) geom->refInc(); -#else - throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported"); -#endif - } - - default: - throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type"); - } - - RTC_CATCH_END(device); - return nullptr; - } - - RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryUserPrimitiveCount); - RTC_VERIFY_HANDLE(hgeometry); - - if (unlikely(geometry->getType() != Geometry::GTY_USER_GEOMETRY)) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries"); - - geometry->setNumPrimitives(userPrimitiveCount); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryTimeStepCount); - RTC_VERIFY_HANDLE(hgeometry); - - if (timeStepCount > RTC_MAX_TIME_STEP_COUNT) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range"); - - geometry->setNumTimeSteps(timeStepCount); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime) - { - Ref geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryTimeRange); - RTC_VERIFY_HANDLE(hgeometry); - - if (startTime > endTime) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime"); - - geometry->setTimeRange(BBox1f(startTime,endTime)); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryVertexAttributeCount); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setVertexAttributeCount(N); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryTopologyCount); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setTopologyCount(N); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryBuildQuality (RTCGeometry hgeometry, RTCBuildQuality quality) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryBuildQuality); - RTC_VERIFY_HANDLE(hgeometry); - if (quality != RTC_BUILD_QUALITY_LOW && - quality != RTC_BUILD_QUALITY_MEDIUM && - quality != RTC_BUILD_QUALITY_HIGH && - quality != RTC_BUILD_QUALITY_REFIT) - // -- GODOT start -- - // throw std::runtime_error("invalid build quality"); - abort(); - // -- GODOT end -- - geometry->setBuildQuality(quality); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryMaxRadiusScale); - RTC_VERIFY_HANDLE(hgeometry); -#if RTC_MIN_WIDTH - if (maxRadiusScale < 1.0f) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale has to be larger or equal to 1"); - geometry->setMaxRadiusScale(maxRadiusScale); -#else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled"); -#endif - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryMask); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setMask(mask); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometrySubdivisionMode); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setSubdivisionMode(topologyID,mode); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryVertexAttributeTopology); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setVertexAttributeTopology(vertexAttributeID, topologyID); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount) - { - Geometry* geometry = (Geometry*) hgeometry; - Ref buffer = (Buffer*)hbuffer; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryBuffer); - RTC_VERIFY_HANDLE(hgeometry); - RTC_VERIFY_HANDLE(hbuffer); - - if (geometry->device != buffer->device) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); - - if (itemCount > 0xFFFFFFFFu) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); - - geometry->setBuffer(type, slot, format, buffer, byteOffset, byteStride, (unsigned int)itemCount); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetSharedGeometryBuffer); - RTC_VERIFY_HANDLE(hgeometry); - - if (itemCount > 0xFFFFFFFFu) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); - - Ref buffer = new Buffer(geometry->device, itemCount*byteStride, (char*)ptr + byteOffset); - geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); - RTC_CATCH_END2(geometry); - } - - RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetNewGeometryBuffer); - RTC_VERIFY_HANDLE(hgeometry); - - if (itemCount > 0xFFFFFFFFu) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); - - /* vertex buffers need to get overallocated slightly as elements are accessed using SSE loads */ - size_t bytes = itemCount*byteStride; - if (type == RTC_BUFFER_TYPE_VERTEX || type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) - bytes += (16 - (byteStride%16))%16; - - Ref buffer = new Buffer(geometry->device, bytes); - geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); - return buffer->data(); - RTC_CATCH_END2(geometry); - return nullptr; - } - - RTC_API void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetGeometryBufferData); - RTC_VERIFY_HANDLE(hgeometry); - return geometry->getBuffer(type, slot); - RTC_CATCH_END2(geometry); - return nullptr; - } - - RTC_API void rtcEnableGeometry (RTCGeometry hgeometry) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcEnableGeometry); - RTC_VERIFY_HANDLE(hgeometry); - geometry->enable(); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcUpdateGeometryBuffer); - RTC_VERIFY_HANDLE(hgeometry); - geometry->updateBuffer(type, slot); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcDisableGeometry (RTCGeometry hgeometry) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcDisableGeometry); - RTC_VERIFY_HANDLE(hgeometry); - geometry->disable(); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryTessellationRate); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setTessellationRate(tessellationRate); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryUserData); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setUserData(ptr); - RTC_CATCH_END2(geometry); - } - - RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry) - { - Geometry* geometry = (Geometry*) hgeometry; // no ref counting here! - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetGeometryUserData); - RTC_VERIFY_HANDLE(hgeometry); - return geometry->getUserData(); - RTC_CATCH_END2(geometry); - return nullptr; - } - - RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryBoundsFunction); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setBoundsFunction(bounds,userPtr); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryDisplacementFunction); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setDisplacementFunction(displacement); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryIntersectFunction); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setIntersectFunctionN(intersect); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryPointQueryFunction); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setPointQueryFunction(pointQuery); - RTC_CATCH_END2(geometry); - } - - RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetGeometryFirstHalfEdge); - return geometry->getFirstHalfEdge(faceID); - RTC_CATCH_END2(geometry); - return -1; - } - - RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetGeometryFace); - return geometry->getFace(edgeID); - RTC_CATCH_END2(geometry); - return -1; - } - - RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetGeometryNextHalfEdge); - return geometry->getNextHalfEdge(edgeID); - RTC_CATCH_END2(geometry); - return -1; - } - - RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetGeometryPreviousHalfEdge); - return geometry->getPreviousHalfEdge(edgeID); - RTC_CATCH_END2(geometry); - return -1; - } - - RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetGeometryOppositeHalfEdge); - return geometry->getOppositeHalfEdge(topologyID,edgeID); - RTC_CATCH_END2(geometry); - return -1; - } - - RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetOccludedFunctionN); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setOccludedFunctionN(occluded); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryIntersectFilterFunction); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setIntersectionFilterFunctionN(filter); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcSetGeometryOccludedFilterFunction); - RTC_VERIFY_HANDLE(hgeometry); - geometry->setOcclusionFilterFunctionN(filter); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args) - { - Geometry* geometry = (Geometry*) args->geometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcInterpolate); -#if defined(DEBUG) - RTC_VERIFY_HANDLE(args->geometry); -#endif - geometry->interpolate(args); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args) - { - Geometry* geometry = (Geometry*) args->geometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcInterpolateN); -#if defined(DEBUG) - RTC_VERIFY_HANDLE(args->geometry); -#endif - geometry->interpolateN(args); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcCommitGeometry (RTCGeometry hgeometry) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcCommitGeometry); - RTC_VERIFY_HANDLE(hgeometry); - return geometry->commit(); - RTC_CATCH_END2(geometry); - } - - RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry) - { - Scene* scene = (Scene*) hscene; - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcAttachGeometry); - RTC_VERIFY_HANDLE(hscene); - RTC_VERIFY_HANDLE(hgeometry); - if (scene->device != geometry->device) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); - return scene->bind(RTC_INVALID_GEOMETRY_ID,geometry); - RTC_CATCH_END2(scene); - return -1; - } - - RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID) - { - Scene* scene = (Scene*) hscene; - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcAttachGeometryByID); - RTC_VERIFY_HANDLE(hscene); - RTC_VERIFY_HANDLE(hgeometry); - RTC_VERIFY_GEOMID(geomID); - if (scene->device != geometry->device) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); - scene->bind(geomID,geometry); - RTC_CATCH_END2(scene); - } - - RTC_API void rtcDetachGeometry (RTCScene hscene, unsigned int geomID) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcDetachGeometry); - RTC_VERIFY_HANDLE(hscene); - RTC_VERIFY_GEOMID(geomID); - scene->detachGeometry(geomID); - RTC_CATCH_END2(scene); - } - - RTC_API void rtcRetainGeometry (RTCGeometry hgeometry) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcRetainGeometry); - RTC_VERIFY_HANDLE(hgeometry); - geometry->refInc(); - RTC_CATCH_END2(geometry); - } - - RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry) - { - Geometry* geometry = (Geometry*) hgeometry; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcReleaseGeometry); - RTC_VERIFY_HANDLE(hgeometry); - geometry->refDec(); - RTC_CATCH_END2(geometry); - } - - RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID) - { - Scene* scene = (Scene*) hscene; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcGetGeometry); -#if defined(DEBUG) - RTC_VERIFY_HANDLE(hscene); - RTC_VERIFY_GEOMID(geomID); -#endif - return (RTCGeometry) scene->get(geomID); - RTC_CATCH_END2(scene); - return nullptr; - } - -RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h deleted file mode 100644 index 4b070e122b..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/rtcore.h +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../../include/embree3/rtcore.h" -RTC_NAMESPACE_USE - -namespace embree -{ - /*! decoding of intersection flags */ - __forceinline bool isCoherent (RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_COHERENT; } - __forceinline bool isIncoherent(RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; } - -#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8) -# define USE_TASK_ARENA 1 -#else -# define USE_TASK_ARENA 0 -#endif - -#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9 -# define TASKING_TBB_USE_TASK_ISOLATION 1 -#else -# define TASKING_TBB_USE_TASK_ISOLATION 0 -#endif - -/*! Macros used in the rtcore API implementation */ -// -- GODOT start -- -// #define RTC_CATCH_BEGIN try { -#define RTC_CATCH_BEGIN - -// #define RTC_CATCH_END(device) \ -// } catch (std::bad_alloc&) { \ -// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -// } catch (rtcore_error& e) { \ -// Device::process_error(device,e.error,e.what()); \ -// } catch (std::exception& e) { \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -// } catch (...) { \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -// } -#define RTC_CATCH_END(device) - -// #define RTC_CATCH_END2(scene) \ -// } catch (std::bad_alloc&) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -// } catch (rtcore_error& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,e.error,e.what()); \ -// } catch (std::exception& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -// } catch (...) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -// } -#define RTC_CATCH_END2(scene) - -// #define RTC_CATCH_END2_FALSE(scene) \ -// } catch (std::bad_alloc&) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -// return false; \ -// } catch (rtcore_error& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,e.error,e.what()); \ -// return false; \ -// } catch (std::exception& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -// return false; \ -// } catch (...) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -// return false; \ -// } -#define RTC_CATCH_END2_FALSE(scene) return false; -// -- GODOT end -- - -#define RTC_VERIFY_HANDLE(handle) \ - if (handle == nullptr) { \ - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ - } - -#define RTC_VERIFY_GEOMID(id) \ - if (id == RTC_INVALID_GEOMETRY_ID) { \ - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ - } - -#define RTC_VERIFY_UPPER(id,upper) \ - if (id > upper) { \ - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ - } - -#define RTC_VERIFY_RANGE(id,lower,upper) \ - if (id < lower || id > upper) \ - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds"); - -#if 0 // enable to debug print all API calls -#define RTC_TRACE(x) std::cout << #x << std::endl; -#else -#define RTC_TRACE(x) -#endif - -// -- GODOT begin -- -// /*! used to throw embree API errors */ -// struct rtcore_error : public std::exception -// { -// __forceinline rtcore_error(RTCError error, const std::string& str) -// : error(error), str(str) {} -// -// ~rtcore_error() throw() {} -// -// const char* what () const throw () { -// return str.c_str(); -// } -// -// RTCError error; -// std::string str; -// }; -// -- GODOT end -- - -#if defined(DEBUG) // only report file and line in debug mode - // -- GODOT begin -- - // #define throw_RTCError(error,str) \ - // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); - #define throw_RTCError(error,str) \ - printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); - // -- GODOT end -- -#else - // -- GODOT begin -- - // #define throw_RTCError(error,str) \ - // throw rtcore_error(error,str); - #define throw_RTCError(error,str) \ - abort(); - // -- GODOT end -- -#endif - -#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ - (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) -} diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp deleted file mode 100644 index 6bb96bba07..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp +++ /dev/null @@ -1,442 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#define RTC_EXPORT_API - -#include "default.h" -#include "device.h" -#include "scene.h" -#include "context.h" -#include "alloc.h" - -#include "../builders/bvh_builder_sah.h" -#include "../builders/bvh_builder_morton.h" - -namespace embree -{ - namespace isa // FIXME: support more ISAs for builders - { - struct BVH : public RefCount - { - BVH (Device* device) - : device(device), allocator(device,true), morton_src(device,0), morton_tmp(device,0) - { - device->refInc(); - } - - ~BVH() { - device->refDec(); - } - - public: - Device* device; - FastAllocator allocator; - mvector morton_src; - mvector morton_tmp; - }; - - void* rtcBuildBVHMorton(const RTCBuildArguments* arguments) - { - BVH* bvh = (BVH*) arguments->bvh; - RTCBuildPrimitive* prims_i = arguments->primitives; - size_t primitiveCount = arguments->primitiveCount; - RTCCreateNodeFunction createNode = arguments->createNode; - RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; - RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; - RTCCreateLeafFunction createLeaf = arguments->createLeaf; - RTCProgressMonitorFunction buildProgress = arguments->buildProgress; - void* userPtr = arguments->userPtr; - - std::atomic progress(0); - - /* initialize temporary arrays for morton builder */ - PrimRef* prims = (PrimRef*) prims_i; - mvector& morton_src = bvh->morton_src; - mvector& morton_tmp = bvh->morton_tmp; - morton_src.resize(primitiveCount); - morton_tmp.resize(primitiveCount); - - /* compute centroid bounds */ - const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range& r) -> BBox3fa { - - BBox3fa bounds(empty); - for (size_t i=r.begin(); i& r) { - BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]); - for (size_t i=r.begin(); i root = BVHBuilderMorton::build>( - - /* thread local allocator for fast allocations */ - [&] () -> FastAllocator::CachedAllocator { - return bvh->allocator.getCachedAllocator(); - }, - - /* lambda function that allocates BVH nodes */ - [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* { - return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); - }, - - /* lambda function that sets bounds */ - [&] (void* node, const std::pair* children, size_t N) -> std::pair - { - BBox3fa bounds = empty; - void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; - const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; - for (size_t i=0; i& current, const FastAllocator::CachedAllocator& alloc) -> std::pair - { - RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF]; - BBox3fa bounds = empty; - for (size_t i=0;i BBox3fa { - return prims[morton.index].bounds(); - }, - - /* progress monitor function */ - [&] (size_t dn) { - if (!buildProgress) return true; - const size_t n = progress.fetch_add(dn)+dn; - const double f = std::min(1.0,double(n)/double(primitiveCount)); - return buildProgress(userPtr,f); - }, - - morton_src.data(),morton_tmp.data(),primitiveCount, - *arguments); - - bvh->allocator.cleanup(); - return root.first; - } - - void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments) - { - BVH* bvh = (BVH*) arguments->bvh; - RTCBuildPrimitive* prims = arguments->primitives; - size_t primitiveCount = arguments->primitiveCount; - RTCCreateNodeFunction createNode = arguments->createNode; - RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; - RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; - RTCCreateLeafFunction createLeaf = arguments->createLeaf; - RTCProgressMonitorFunction buildProgress = arguments->buildProgress; - void* userPtr = arguments->userPtr; - - std::atomic progress(0); - - /* calculate priminfo */ - auto computeBounds = [&](const range& r) -> CentGeomBBox3fa - { - CentGeomBBox3fa bounds(empty); - for (size_t j=r.begin(); j( - - /* thread local allocator for fast allocations */ - [&] () -> FastAllocator::CachedAllocator { - return bvh->allocator.getCachedAllocator(); - }, - - /* lambda function that creates BVH nodes */ - [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* - { - void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); - const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; - for (size_t i=0; i void* { - setNodeChildren(node,children, (unsigned int)N,userPtr); - return node; - }, - - /* lambda function that creates BVH leaves */ - [&](const PrimRef* prims, const range& range, const FastAllocator::CachedAllocator& alloc) -> void* { - return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); - }, - - /* progress monitor function */ - [&] (size_t dn) { - if (!buildProgress) return true; - const size_t n = progress.fetch_add(dn)+dn; - const double f = std::min(1.0,double(n)/double(primitiveCount)); - return buildProgress(userPtr,f); - }, - - (PrimRef*)prims,pinfo,*arguments); - - bvh->allocator.cleanup(); - return root; - } - - static __forceinline const std::pair mergePair(const std::pair& a, const std::pair& b) { - CentGeomBBox3fa centBounds = CentGeomBBox3fa::merge2(a.first,b.first); - unsigned int maxGeomID = max(a.second,b.second); - return std::pair(centBounds,maxGeomID); - } - - void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments) - { - BVH* bvh = (BVH*) arguments->bvh; - RTCBuildPrimitive* prims = arguments->primitives; - size_t primitiveCount = arguments->primitiveCount; - RTCCreateNodeFunction createNode = arguments->createNode; - RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; - RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; - RTCCreateLeafFunction createLeaf = arguments->createLeaf; - RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive; - RTCProgressMonitorFunction buildProgress = arguments->buildProgress; - void* userPtr = arguments->userPtr; - - std::atomic progress(0); - - /* calculate priminfo */ - - auto computeBounds = [&](const range& r) -> std::pair - { - CentGeomBBox3fa bounds(empty); - unsigned maxGeomID = 0; - for (size_t j=r.begin(); j(bounds,maxGeomID); - }; - - - const std::pair pair = - parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair(CentGeomBBox3fa(empty),0), computeBounds, mergePair); - - CentGeomBBox3fa bounds = pair.first; - const unsigned int maxGeomID = pair.second; - - if (unlikely(maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)))) - { - /* fallback code for max geomID larger than threshold */ - return rtcBuildBVHBinnedSAH(arguments); - } - - const PrimInfo pinfo(0,primitiveCount,bounds); - - /* function that splits a build primitive */ - struct Splitter - { - Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr) - : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {} - - __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const - { - prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK; - splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); - left_o.geomIDref() = geomID; left_o.primIDref() = primID; - right_o.geomIDref() = geomID; right_o.primIDref() = primID; - } - - __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const - { - PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID); - splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); - } - - RTCSplitPrimitiveFunction splitPrimitive; - unsigned geomID; - unsigned primID; - void* userPtr; - }; - - /* build BVH */ - void* root = BVHBuilderBinnedFastSpatialSAH::build( - - /* thread local allocator for fast allocations */ - [&] () -> FastAllocator::CachedAllocator { - return bvh->allocator.getCachedAllocator(); - }, - - /* lambda function that creates BVH nodes */ - [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* - { - void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); - const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; - for (size_t i=0; i void* { - setNodeChildren(node,children, (unsigned int)N,userPtr); - return node; - }, - - /* lambda function that creates BVH leaves */ - [&] (const PrimRef* prims, const range& range, const FastAllocator::CachedAllocator& alloc) -> void* { - return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); - }, - - /* returns the splitter */ - [&] ( const PrimRef& prim ) -> Splitter { - return Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr); - }, - - /* progress monitor function */ - [&] (size_t dn) { - if (!buildProgress) return true; - const size_t n = progress.fetch_add(dn)+dn; - const double f = std::min(1.0,double(n)/double(primitiveCount)); - return buildProgress(userPtr,f); - }, - - (PrimRef*)prims, - arguments->primitiveArrayCapacity, - pinfo,*arguments); - - bvh->allocator.cleanup(); - return root; - } - } -} - -using namespace embree; -using namespace embree::isa; - -RTC_NAMESPACE_BEGIN - - RTC_API RTCBVH rtcNewBVH(RTCDevice device) - { - RTC_CATCH_BEGIN; - RTC_TRACE(rtcNewAllocator); - RTC_VERIFY_HANDLE(device); - BVH* bvh = new BVH((Device*)device); - return (RTCBVH) bvh->refInc(); - RTC_CATCH_END((Device*)device); - return nullptr; - } - - RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments) - { - BVH* bvh = (BVH*) arguments->bvh; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcBuildBVH); - RTC_VERIFY_HANDLE(bvh); - RTC_VERIFY_HANDLE(arguments); - RTC_VERIFY_HANDLE(arguments->createNode); - RTC_VERIFY_HANDLE(arguments->setNodeChildren); - RTC_VERIFY_HANDLE(arguments->setNodeBounds); - RTC_VERIFY_HANDLE(arguments->createLeaf); - - if (arguments->primitiveArrayCapacity < arguments->primitiveCount) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount") - - /* initialize the allocator */ - bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa)); - bvh->allocator.reset(); - - /* switch between differnet builders based on quality level */ - if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW) - return rtcBuildBVHMorton(arguments); - else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM) - return rtcBuildBVHBinnedSAH(arguments); - else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) { - if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount) - return rtcBuildBVHBinnedSAH(arguments); - else - return rtcBuildBVHSpatialSAH(arguments); - } - else - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality"); - - /* if we are in dynamic mode, then do not clear temporary data */ - if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC)) - { - bvh->morton_src.clear(); - bvh->morton_tmp.clear(); - } - - RTC_CATCH_END(bvh->device); - return nullptr; - } - - RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align) - { - FastAllocator::CachedAllocator* alloc = (FastAllocator::CachedAllocator*) localAllocator; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcThreadLocalAlloc); - return alloc->malloc0(bytes,align); - RTC_CATCH_END(alloc->alloc->getDevice()); - return nullptr; - } - - RTC_API void rtcMakeStaticBVH(RTCBVH hbvh) - { - BVH* bvh = (BVH*) hbvh; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcStaticBVH); - RTC_VERIFY_HANDLE(hbvh); - bvh->morton_src.clear(); - bvh->morton_tmp.clear(); - RTC_CATCH_END(bvh->device); - } - - RTC_API void rtcRetainBVH(RTCBVH hbvh) - { - BVH* bvh = (BVH*) hbvh; - Device* device = bvh ? bvh->device : nullptr; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcRetainBVH); - RTC_VERIFY_HANDLE(hbvh); - bvh->refInc(); - RTC_CATCH_END(device); - } - - RTC_API void rtcReleaseBVH(RTCBVH hbvh) - { - BVH* bvh = (BVH*) hbvh; - Device* device = bvh ? bvh->device : nullptr; - RTC_CATCH_BEGIN; - RTC_TRACE(rtcReleaseBVH); - RTC_VERIFY_HANDLE(hbvh); - bvh->refDec(); - RTC_CATCH_END(device); - } - -RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp deleted file mode 100644 index 1e23aeb415..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene.cpp +++ /dev/null @@ -1,976 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "scene.h" - -#include "../bvh/bvh4_factory.h" -#include "../bvh/bvh8_factory.h" -#include "../../common/algorithms/parallel_reduce.h" - -namespace embree -{ - /* error raising rtcIntersect and rtcOccluded functions */ - void missing_rtcCommit() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); } - void invalid_rtcIntersect1() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); } - void invalid_rtcIntersect4() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); } - void invalid_rtcIntersect8() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); } - void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); } - void invalid_rtcIntersectN() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); } - - Scene::Scene (Device* device) - : device(device), - flags_modified(true), enabled_geometry_types(0), - scene_flags(RTC_SCENE_FLAG_NONE), - quality_flags(RTC_BUILD_QUALITY_MEDIUM), - is_build(false), modified(true), - progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0) - { - device->refInc(); - - intersectors = Accel::Intersectors(missing_rtcCommit); - - /* one can overwrite flags through device for debugging */ - if (device->quality_flags != -1) - quality_flags = (RTCBuildQuality) device->quality_flags; - if (device->scene_flags != -1) - scene_flags = (RTCSceneFlags) device->scene_flags; - } - - Scene::~Scene() noexcept - { - device->refDec(); - } - - void Scene::printStatistics() - { - /* calculate maximum number of time segments */ - unsigned max_time_steps = 0; - for (size_t i=0; inumTimeSteps); - } - - /* initialize vectors*/ - std::vector statistics[Geometry::GTY_END]; - for (size_t i=0; igetType(); - assert(tynumTimeSegments(); - assert((unsigned int)timesegments < max_time_steps); - statistics[ty][timesegments] += get(i)->size(); - } - - /* print statistics */ - std::cout << std::setw(23) << "segments" << ": "; - for (size_t t=0; ttri_accel == "default") - { - if (quality_flags != RTC_BUILD_QUALITY_LOW) - { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); - switch (mode) { - case /*0b00*/ 0: -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX()) - { - if (quality_flags == RTC_BUILD_QUALITY_HIGH) - accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); - else - accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); - } - else -#endif - { - if (quality_flags == RTC_BUILD_QUALITY_HIGH) - accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); - else - accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); - } - break; - - case /*0b01*/ 1: -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX()) - accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); - else -#endif - accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); - - break; - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - else /* dynamic */ - { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX()) - { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); - switch (mode) { - case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - else -#endif - { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); - switch (mode) { - case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - } - } - else if (device->tri_accel == "bvh4.triangle4") accels_add(device->bvh4_factory->BVH4Triangle4 (this)); - else if (device->tri_accel == "bvh4.triangle4v") accels_add(device->bvh4_factory->BVH4Triangle4v(this)); - else if (device->tri_accel == "bvh4.triangle4i") accels_add(device->bvh4_factory->BVH4Triangle4i(this)); - else if (device->tri_accel == "qbvh4.triangle4i") accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this)); - -#if defined (EMBREE_TARGET_SIMD8) - else if (device->tri_accel == "bvh8.triangle4") accels_add(device->bvh8_factory->BVH8Triangle4 (this)); - else if (device->tri_accel == "bvh8.triangle4v") accels_add(device->bvh8_factory->BVH8Triangle4v(this)); - else if (device->tri_accel == "bvh8.triangle4i") accels_add(device->bvh8_factory->BVH8Triangle4i(this)); - else if (device->tri_accel == "qbvh8.triangle4i") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this)); - else if (device->tri_accel == "qbvh8.triangle4") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this)); -#endif - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel); -#endif - } - - void Scene::createTriangleMBAccel() - { -#if defined(EMBREE_GEOMETRY_TRIANGLE) - if (device->tri_accel_mb == "default") - { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); - -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines - { - switch (mode) { - case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - else -#endif - { - switch (mode) { - case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - } - else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this)); - else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this)); -#if defined (EMBREE_TARGET_SIMD8) - else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this)); - else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this)); -#endif - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb); -#endif - } - - void Scene::createQuadAccel() - { -#if defined(EMBREE_GEOMETRY_QUAD) - if (device->quad_accel == "default") - { - if (quality_flags != RTC_BUILD_QUALITY_LOW) - { - /* static */ - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); - switch (mode) { - case /*0b00*/ 0: -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX()) - { - if (quality_flags == RTC_BUILD_QUALITY_HIGH) - accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); - else - accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); - } - else -#endif - { - if (quality_flags == RTC_BUILD_QUALITY_HIGH) - accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); - else - accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); - } - break; - - case /*0b01*/ 1: -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX()) - accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); - else -#endif - accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); - break; - - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - else /* dynamic */ - { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX()) - { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); - switch (mode) { - case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; - case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - else -#endif - { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); - switch (mode) { - case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; - case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - } - } - else if (device->quad_accel == "bvh4.quad4v") accels_add(device->bvh4_factory->BVH4Quad4v(this)); - else if (device->quad_accel == "bvh4.quad4i") accels_add(device->bvh4_factory->BVH4Quad4i(this)); - else if (device->quad_accel == "qbvh4.quad4i") accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this)); - -#if defined (EMBREE_TARGET_SIMD8) - else if (device->quad_accel == "bvh8.quad4v") accels_add(device->bvh8_factory->BVH8Quad4v(this)); - else if (device->quad_accel == "bvh8.quad4i") accels_add(device->bvh8_factory->BVH8Quad4i(this)); - else if (device->quad_accel == "qbvh8.quad4i") accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this)); -#endif - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel); -#endif - } - - void Scene::createQuadMBAccel() - { -#if defined(EMBREE_GEOMETRY_QUAD) - if (device->quad_accel_mb == "default") - { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); - switch (mode) { - case /*0b00*/ 0: -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX()) - accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); - else -#endif - accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); - break; - - case /*0b01*/ 1: -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX()) - accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); - else -#endif - accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); - break; - - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - else if (device->quad_accel_mb == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this)); -#if defined (EMBREE_TARGET_SIMD8) - else if (device->quad_accel_mb == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this)); -#endif - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+device->quad_accel_mb); -#endif - } - - void Scene::createHairAccel() - { -#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) - if (device->hair_accel == "default") - { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX2()) // only enable on HSW machines, for SNB this codepath is slower - { - switch (mode) { - case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); break; - case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::ROBUST)); break; - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - else -#endif - { - switch (mode) { - case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); break; - case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::ROBUST)); break; - case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); break; - case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::ROBUST)); break; - } - } - } - else if (device->hair_accel == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); - else if (device->hair_accel == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); -#if defined (EMBREE_TARGET_SIMD8) - else if (device->hair_accel == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); - else if (device->hair_accel == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); -#endif - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+device->hair_accel); -#endif - } - - void Scene::createHairMBAccel() - { -#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) - if (device->hair_accel_mb == "default") - { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX2()) // only enable on HSW machines, on SNB this codepath is slower - { - if (isRobustAccel()) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::ROBUST)); - else accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); - } - else -#endif - { - if (isRobustAccel()) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::ROBUST)); - else accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); - } - } - else if (device->hair_accel_mb == "bvh4.virtualcurve4imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); - -#if defined (EMBREE_TARGET_SIMD8) - else if (device->hair_accel_mb == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); - else if (device->hair_accel_mb == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); -#endif - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+device->hair_accel_mb); -#endif - } - - void Scene::createSubdivAccel() - { -#if defined(EMBREE_GEOMETRY_SUBDIVISION) - if (device->subdiv_accel == "default") { - accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); - } - else if (device->subdiv_accel == "bvh4.grid.eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); - else if (device->subdiv_accel == "bvh4.subdivpatch1eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+device->subdiv_accel); -#endif - } - - void Scene::createSubdivMBAccel() - { -#if defined(EMBREE_GEOMETRY_SUBDIVISION) - if (device->subdiv_accel_mb == "default") { - accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this)); - } - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+device->subdiv_accel_mb); -#endif - } - - void Scene::createUserGeometryAccel() - { -#if defined(EMBREE_GEOMETRY_USER) - if (device->object_accel == "default") - { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX() && !isCompactAccel()) - { - if (quality_flags != RTC_BUILD_QUALITY_LOW) { - accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::STATIC)); - } else { - accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); - } - } - else -#endif - { - if (quality_flags != RTC_BUILD_QUALITY_LOW) { - accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::STATIC)); - } else { - accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); - } - } - } - else if (device->object_accel == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this)); -#if defined (EMBREE_TARGET_SIMD8) - else if (device->object_accel == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometry(this)); -#endif - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+device->object_accel); -#endif - } - - void Scene::createUserGeometryMBAccel() - { -#if defined(EMBREE_GEOMETRY_USER) - if (device->object_accel_mb == "default" ) { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX() && !isCompactAccel()) - accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); - else -#endif - accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); - } - else if (device->object_accel_mb == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); -#if defined (EMBREE_TARGET_SIMD8) - else if (device->object_accel_mb == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); -#endif - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+device->object_accel_mb); -#endif - } - - void Scene::createInstanceAccel() - { -#if defined(EMBREE_GEOMETRY_INSTANCE) - // if (device->object_accel == "default") - { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX() && !isCompactAccel()) { - if (quality_flags != RTC_BUILD_QUALITY_LOW) { - accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::STATIC)); - } else { - accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); - } - } - else -#endif - { - if (quality_flags != RTC_BUILD_QUALITY_LOW) { - accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::STATIC)); - } else { - accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); - } - } - } - // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); -#endif - } - - void Scene::createInstanceMBAccel() - { -#if defined(EMBREE_GEOMETRY_INSTANCE) - //if (device->instance_accel_mb == "default") - { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX() && !isCompactAccel()) - accels_add(device->bvh8_factory->BVH8InstanceMB(this, false)); - else -#endif - accels_add(device->bvh4_factory->BVH4InstanceMB(this, false)); - } - //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); -#endif - } - - void Scene::createInstanceExpensiveAccel() - { -#if defined(EMBREE_GEOMETRY_INSTANCE) - // if (device->object_accel == "default") - { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX() && !isCompactAccel()) { - if (quality_flags != RTC_BUILD_QUALITY_LOW) { - accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::STATIC)); - } else { - accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); - } - } - else -#endif - { - if (quality_flags != RTC_BUILD_QUALITY_LOW) { - accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::STATIC)); - } else { - accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); - } - } - } - // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); -#endif - } - - void Scene::createInstanceExpensiveMBAccel() - { -#if defined(EMBREE_GEOMETRY_INSTANCE) - //if (device->instance_accel_mb == "default") - { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX() && !isCompactAccel()) - accels_add(device->bvh8_factory->BVH8InstanceMB(this, true)); - else -#endif - accels_add(device->bvh4_factory->BVH4InstanceMB(this, true)); - } - //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); -#endif - } - - void Scene::createGridAccel() - { - BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST; -#if defined(EMBREE_GEOMETRY_GRID) - if (device->grid_accel == "default") - { -#if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX() && !isCompactAccel()) - { - accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); - } - else -#endif - { - accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); - } - } - else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); -#if defined (EMBREE_TARGET_SIMD8) - else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); -#endif - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel); -#endif - - } - - void Scene::createGridMBAccel() - { -#if defined(EMBREE_GEOMETRY_GRID) - if (device->grid_accel_mb == "default") - { - accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC)); - } - else if (device->grid_accel_mb == "bvh4mb.grid") accels_add(device->bvh4_factory->BVH4GridMB(this)); - else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel); -#endif - - } - - void Scene::clear() { - } - - unsigned Scene::bind(unsigned geomID, Ref geometry) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(geometriesMutex); -#else - Lock lock(geometriesMutex); -#endif - if (geomID == RTC_INVALID_GEOMETRY_ID) { - geomID = id_pool.allocate(); - if (geomID == RTC_INVALID_GEOMETRY_ID) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene"); - } - else - { - if (!id_pool.add(geomID)) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided"); - } - if (geomID >= geometries.size()) { - geometries.resize(geomID+1); - vertices.resize(geomID+1); - geometryModCounters_.resize(geomID+1); - } - geometries[geomID] = geometry; - geometryModCounters_[geomID] = 0; - if (geometry->isEnabled()) { - setModified (); - } - return geomID; - } - - void Scene::detachGeometry(size_t geomID) - { -#if defined(__aarch64__) && defined(BUILD_IOS) - std::scoped_lock lock(geometriesMutex); -#else - Lock lock(geometriesMutex); -#endif - - if (geomID >= geometries.size()) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID"); - - Ref& geometry = geometries[geomID]; - if (geometry == null) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry"); - - if (geometry->isEnabled()) { - setModified (); - } - accels_deleteGeometry(unsigned(geomID)); - id_pool.deallocate((unsigned)geomID); - geometries[geomID] = null; - vertices[geomID] = nullptr; - geometryModCounters_[geomID] = 0; - } - - void Scene::updateInterface() - { - is_build = true; - } - - void Scene::commit_task () - { - checkIfModifiedAndSet (); - if (!isModified()) { - return; - } - - /* print scene statistics */ - if (device->verbosity(2)) - printStatistics(); - - progress_monitor_counter = 0; - - /* gather scene stats and call preCommit function of each geometry */ - this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), - [this](const range& r)->GeometryCounts - { - GeometryCounts c; - for (auto i=r.begin(); iisEnabled()) - { - geometries[i]->preCommit(); - geometries[i]->addElementsToCount (c); - c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions(); - } - } - return c; - }, - std::plus() - ); - - /* select acceleration structures to build */ - unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask(); - if (flags_modified || new_enabled_geometry_types != enabled_geometry_types) - { - accels_init(); - - /* we need to make all geometries modified, otherwise two level builder will - not rebuild currently not modified geometries */ - parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) { - geometryModCounters_[i] = 0; - }); - - if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel(); - if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel(); - if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel(); - if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel(); - if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel(); - if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel(); - if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel(); - if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel(); - if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel(); - if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel(); - if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel(); - if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel(); - if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel(); - if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel(); - if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel(); - if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel(); - - flags_modified = false; - enabled_geometry_types = new_enabled_geometry_types; - } - - /* select fast code path if no filter function is present */ - accels_select(hasFilterFunction()); - - /* build all hierarchies of this scene */ - accels_build(); - - /* make static geometry immutable */ - if (!isDynamicAccel()) { - accels_immutable(); - flags_modified = true; // in non-dynamic mode we have to re-create accels - } - - /* call postCommit function of each geometry */ - parallel_for(geometries.size(), [&] ( const size_t i ) { - if (geometries[i] && geometries[i]->isEnabled()) { - geometries[i]->postCommit(); - vertices[i] = geometries[i]->getCompactVertexArray(); - geometryModCounters_[i] = geometries[i]->getModCounter(); - } - }); - - updateInterface(); - - if (device->verbosity(2)) { - std::cout << "created scene intersector" << std::endl; - accels_print(2); - std::cout << "selected scene intersector" << std::endl; - intersectors.print(2); - } - - setModified(false); - } - - void Scene::setBuildQuality(RTCBuildQuality quality_flags_i) - { - if (quality_flags == quality_flags_i) return; - quality_flags = quality_flags_i; - flags_modified = true; - } - - RTCBuildQuality Scene::getBuildQuality() const { - return quality_flags; - } - - void Scene::setSceneFlags(RTCSceneFlags scene_flags_i) - { - if (scene_flags == scene_flags_i) return; - scene_flags = scene_flags_i; - flags_modified = true; - } - - RTCSceneFlags Scene::getSceneFlags() const { - return scene_flags; - } - -#if defined(TASKING_INTERNAL) - - void Scene::commit (bool join) - { - Lock buildLock(buildMutex,false); - - /* allocates own taskscheduler for each build */ - Ref scheduler = nullptr; - { - Lock lock(schedulerMutex); - scheduler = this->scheduler; - if (scheduler == null) { - buildLock.lock(); - this->scheduler = scheduler = new TaskScheduler; - } - } - - /* worker threads join build */ - if (!buildLock.isLocked()) - { - if (!join) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation"); - - scheduler->join(); - return; - } - - /* initiate build */ - // -- GODOT start -- - // try { - scheduler->spawn_root([&]() { commit_task(); Lock lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); - // } - // catch (...) { - // accels_clear(); - // updateInterface(); - // Lock lock(schedulerMutex); - // this->scheduler = nullptr; - // throw; - // } - // -- GODOT end -- - } - -#endif - -#if defined(TASKING_TBB) || defined(TASKING_GCD) - - void Scene::commit (bool join) - { -#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) - if (join) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version"); -#endif - - /* try to obtain build lock */ - Lock lock(buildMutex,buildMutex.try_lock()); - - /* join hierarchy build */ - if (!lock.isLocked()) - { -#if !TASKING_TBB_USE_TASK_ISOLATION - if (!join) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version"); -#endif - - do { - -#if defined(TASKING_GCD) - // Do Nothing -#else -#if USE_TASK_ARENA - if (join) { - device->arena->execute([&]{ group.wait(); }); - } - else -#endif - { - group.wait(); - } -#endif - - pause_cpu(); - yield(); - - } while (!buildMutex.try_lock()); - - buildMutex.unlock(); - return; - } - - /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ - const unsigned int mxcsr = _mm_getcsr(); - _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); - - try { -#if defined(TASKING_TBB) -#if TBB_INTERFACE_VERSION_MAJOR < 8 - tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits); -#else - tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings ); -#endif - //ctx.set_priority(tbb::priority_high); - -#if USE_TASK_ARENA - if (join) - { - device->arena->execute([&]{ - group.run([&]{ - tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); - }); - group.wait(); - }); - } - else -#endif - { - group.run([&]{ - tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); - }); - group.wait(); - } - - /* reset MXCSR register again */ - _mm_setcsr(mxcsr); - -#elif defined(TASKING_GCD) - - commit_task(); - -#endif // #if defined(TASKING_TBB) - - } - catch (...) - { - /* reset MXCSR register again */ - _mm_setcsr(mxcsr); - - accels_clear(); - updateInterface(); - throw; - } - } -#endif - -#if defined(TASKING_PPL) - - void Scene::commit (bool join) - { -#if defined(TASKING_PPL) - if (join) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL"); -#endif - - /* try to obtain build lock */ - Lock lock(buildMutex); - - checkIfModifiedAndSet (); - if (!isModified()) { - return; - } - - /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ - const unsigned int mxcsr = _mm_getcsr(); - _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); - - try { - - group.run([&]{ - concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); }); - }); - group.wait(); - - /* reset MXCSR register again */ - _mm_setcsr(mxcsr); - } - catch (...) - { - /* reset MXCSR register again */ - _mm_setcsr(mxcsr); - - accels_clear(); - updateInterface(); - throw; - } - } -#endif - - void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) - { - progress_monitor_function = func; - progress_monitor_ptr = ptr; - } - - void Scene::progressMonitor(double dn) - { - if (progress_monitor_function) { - size_t n = size_t(dn) + progress_monitor_counter.fetch_add(size_t(dn)); - if (!progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())))) { - throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination"); - } - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/common/scene.h b/thirdparty/embree-aarch64/kernels/common/scene.h deleted file mode 100644 index b41c6cde91..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene.h +++ /dev/null @@ -1,390 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "device.h" -#include "builder.h" -#include "../../common/algorithms/parallel_any_of.h" -#include "scene_triangle_mesh.h" -#include "scene_quad_mesh.h" -#include "scene_user_geometry.h" -#include "scene_instance.h" -#include "scene_curves.h" -#include "scene_line_segments.h" -#include "scene_subdiv_mesh.h" -#include "scene_grid_mesh.h" -#include "scene_points.h" -#include "../subdiv/tessellation_cache.h" - -#include "acceln.h" -#include "geometry.h" - -namespace embree -{ - /*! Base class all scenes are derived from */ - class Scene : public AccelN - { - ALIGNED_CLASS_(std::alignment_of::value); - - public: - template - class Iterator - { - public: - Iterator () {} - - Iterator (Scene* scene, bool all = false) - : scene(scene), all(all) {} - - __forceinline Ty* at(const size_t i) - { - Geometry* geom = scene->geometries[i].ptr; - if (geom == nullptr) return nullptr; - if (!all && !geom->isEnabled()) return nullptr; - const size_t mask = geom->getTypeMask() & Ty::geom_type; - if (!(mask)) return nullptr; - if ((geom->numTimeSteps != 1) != mblur) return nullptr; - return (Ty*) geom; - } - - __forceinline Ty* operator[] (const size_t i) { - return at(i); - } - - __forceinline size_t size() const { - return scene->size(); - } - - __forceinline size_t numPrimitives() const { - return scene->getNumPrimitives(Ty::geom_type,mblur); - } - - __forceinline size_t maxPrimitivesPerGeometry() - { - size_t ret = 0; - for (size_t i=0; isize(); i++) { - Ty* mesh = at(i); - if (mesh == nullptr) continue; - ret = max(ret,mesh->size()); - } - return ret; - } - - __forceinline unsigned int maxGeomID() - { - unsigned int ret = 0; - for (size_t i=0; isize(); i++) { - Ty* mesh = at(i); - if (mesh == nullptr) continue; - ret = max(ret,(unsigned int)i); - } - return ret; - } - - __forceinline unsigned maxTimeStepsPerGeometry() - { - unsigned ret = 0; - for (size_t i=0; isize(); i++) { - Ty* mesh = at(i); - if (mesh == nullptr) continue; - ret = max(ret,mesh->numTimeSteps); - } - return ret; - } - - private: - Scene* scene; - bool all; - }; - - class Iterator2 - { - public: - Iterator2 () {} - - Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur) - : scene(scene), typemask(typemask), mblur(mblur) {} - - __forceinline Geometry* at(const size_t i) - { - Geometry* geom = scene->geometries[i].ptr; - if (geom == nullptr) return nullptr; - if (!geom->isEnabled()) return nullptr; - if (!(geom->getTypeMask() & typemask)) return nullptr; - if ((geom->numTimeSteps != 1) != mblur) return nullptr; - return geom; - } - - __forceinline Geometry* operator[] (const size_t i) { - return at(i); - } - - __forceinline size_t size() const { - return scene->size(); - } - - private: - Scene* scene; - Geometry::GTypeMask typemask; - bool mblur; - }; - - public: - - /*! Scene construction */ - Scene (Device* device); - - /*! Scene destruction */ - ~Scene () noexcept; - - private: - /*! class is non-copyable */ - Scene (const Scene& other) DELETED; // do not implement - Scene& operator= (const Scene& other) DELETED; // do not implement - - public: - void createTriangleAccel(); - void createTriangleMBAccel(); - void createQuadAccel(); - void createQuadMBAccel(); - void createHairAccel(); - void createHairMBAccel(); - void createSubdivAccel(); - void createSubdivMBAccel(); - void createUserGeometryAccel(); - void createUserGeometryMBAccel(); - void createInstanceAccel(); - void createInstanceMBAccel(); - void createInstanceExpensiveAccel(); - void createInstanceExpensiveMBAccel(); - void createGridAccel(); - void createGridMBAccel(); - - /*! prints statistics about the scene */ - void printStatistics(); - - /*! clears the scene */ - void clear(); - - /*! detaches some geometry */ - void detachGeometry(size_t geomID); - - void setBuildQuality(RTCBuildQuality quality_flags); - RTCBuildQuality getBuildQuality() const; - - void setSceneFlags(RTCSceneFlags scene_flags); - RTCSceneFlags getSceneFlags() const; - - void commit (bool join); - void commit_task (); - void build () {} - - void updateInterface(); - - /* return number of geometries */ - __forceinline size_t size() const { return geometries.size(); } - - /* bind geometry to the scene */ - unsigned int bind (unsigned geomID, Ref geometry); - - /* determines if scene is modified */ - __forceinline bool isModified() const { return modified; } - - /* sets modified flag */ - __forceinline void setModified(bool f = true) { - modified = f; - } - - __forceinline bool isGeometryModified(size_t geomID) - { - Ref& g = geometries[geomID]; - if (!g) return false; - return g->getModCounter() > geometryModCounters_[geomID]; - } - - protected: - - __forceinline void checkIfModifiedAndSet () - { - if (isModified ()) return; - - auto geometryIsModified = [this](size_t geomID)->bool { - return isGeometryModified(geomID); - }; - - if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) { - setModified (); - } - } - - public: - - /* get mesh by ID */ - __forceinline Geometry* get(size_t i) { assert(i < geometries.size()); return geometries[i].ptr; } - __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; } - - template - __forceinline Mesh* get(size_t i) { - assert(i < geometries.size()); - assert(geometries[i]->getTypeMask() & Mesh::geom_type); - return (Mesh*)geometries[i].ptr; - } - template - __forceinline const Mesh* get(size_t i) const { - assert(i < geometries.size()); - assert(geometries[i]->getTypeMask() & Mesh::geom_type); - return (Mesh*)geometries[i].ptr; - } - - template - __forceinline Mesh* getSafe(size_t i) { - assert(i < geometries.size()); - if (geometries[i] == null) return nullptr; - if (!(geometries[i]->getTypeMask() & Mesh::geom_type)) return nullptr; - else return (Mesh*) geometries[i].ptr; - } - - __forceinline Ref get_locked(size_t i) { - Lock lock(geometriesMutex); - assert(i < geometries.size()); - return geometries[i]; - } - - /* flag decoding */ - __forceinline bool isFastAccel() const { return !isCompactAccel() && !isRobustAccel(); } - __forceinline bool isCompactAccel() const { return scene_flags & RTC_SCENE_FLAG_COMPACT; } - __forceinline bool isRobustAccel() const { return scene_flags & RTC_SCENE_FLAG_ROBUST; } - __forceinline bool isStaticAccel() const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); } - __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; } - - __forceinline bool hasContextFilterFunction() const { - return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION; - } - - __forceinline bool hasGeometryFilterFunction() { - return world.numFilterFunctions != 0; - } - - __forceinline bool hasFilterFunction() { - return hasContextFilterFunction() || hasGeometryFilterFunction(); - } - - /* test if scene got already build */ - __forceinline bool isBuild() const { return is_build; } - - public: - IDPool id_pool; - vector> geometries; //!< list of all user geometries - vector geometryModCounters_; - vector vertices; - - public: - Device* device; - - /* these are to detect if we need to recreate the acceleration structures */ - bool flags_modified; - unsigned int enabled_geometry_types; - - RTCSceneFlags scene_flags; - RTCBuildQuality quality_flags; - MutexSys buildMutex; - SpinLock geometriesMutex; - bool is_build; - private: - bool modified; //!< true if scene got modified - - public: - - /*! global lock step task scheduler */ -#if defined(TASKING_INTERNAL) - MutexSys schedulerMutex; - Ref scheduler; -#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION - tbb::isolated_task_group group; -#elif defined(TASKING_TBB) - tbb::task_group group; -#elif defined(TASKING_PPL) - concurrency::task_group group; -#endif - - public: - struct BuildProgressMonitorInterface : public BuildProgressMonitor { - BuildProgressMonitorInterface(Scene* scene) - : scene(scene) {} - void operator() (size_t dn) const { scene->progressMonitor(double(dn)); } - private: - Scene* scene; - }; - BuildProgressMonitorInterface progressInterface; - RTCProgressMonitorFunction progress_monitor_function; - void* progress_monitor_ptr; - std::atomic progress_monitor_counter; - void progressMonitor(double nprims); - void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr); - - private: - GeometryCounts world; //!< counts for geometry - - public: - - __forceinline size_t numPrimitives() const { - return world.size(); - } - - __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const - { - size_t count = 0; - - if (mask & Geometry::MTY_TRIANGLE_MESH) - count += mblur ? world.numMBTriangles : world.numTriangles; - - if (mask & Geometry::MTY_QUAD_MESH) - count += mblur ? world.numMBQuads : world.numQuads; - - if (mask & Geometry::MTY_CURVE2) - count += mblur ? world.numMBLineSegments : world.numLineSegments; - - if (mask & Geometry::MTY_CURVE4) - count += mblur ? world.numMBBezierCurves : world.numBezierCurves; - - if (mask & Geometry::MTY_POINTS) - count += mblur ? world.numMBPoints : world.numPoints; - - if (mask & Geometry::MTY_SUBDIV_MESH) - count += mblur ? world.numMBSubdivPatches : world.numSubdivPatches; - - if (mask & Geometry::MTY_USER_GEOMETRY) - count += mblur ? world.numMBUserGeometries : world.numUserGeometries; - - if (mask & Geometry::MTY_INSTANCE_CHEAP) - count += mblur ? world.numMBInstancesCheap : world.numInstancesCheap; - - if (mask & Geometry::MTY_INSTANCE_EXPENSIVE) - count += mblur ? world.numMBInstancesExpensive : world.numInstancesExpensive; - - if (mask & Geometry::MTY_GRID_MESH) - count += mblur ? world.numMBGrids : world.numGrids; - - return count; - } - - template - __forceinline unsigned getNumTimeSteps() - { - if (!mblur) - return 1; - - Scene::Iterator iter(this); - return iter.maxTimeStepsPerGeometry(); - } - - template - __forceinline unsigned int getMaxGeomID() - { - Scene::Iterator iter(this); - return iter.maxGeomID(); - } - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_curves.h b/thirdparty/embree-aarch64/kernels/common/scene_curves.h deleted file mode 100644 index 2649ab0e3e..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene_curves.h +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "geometry.h" -#include "buffer.h" - -namespace embree -{ - /*! represents an array of bicubic bezier curves */ - struct CurveGeometry : public Geometry - { - /*! type of this geometry */ - static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4; - - public: - - /*! bezier curve construction */ - CurveGeometry (Device* device, Geometry::GType gtype); - - public: - void setMask(unsigned mask); - void setNumTimeSteps (unsigned int numTimeSteps); - void setVertexAttributeCount (unsigned int N); - void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); - void* getBuffer(RTCBufferType type, unsigned int slot); - void updateBuffer(RTCBufferType type, unsigned int slot); - void commit(); - bool verify(); - void setTessellationRate(float N); - void setMaxRadiusScale(float s); - void addElementsToCount (GeometryCounts & counts) const; - - public: - - /*! returns the number of vertices */ - __forceinline size_t numVertices() const { - return vertices[0].size(); - } - - /*! returns the i'th curve */ - __forceinline const unsigned int& curve(size_t i) const { - return curves[i]; - } - - /*! returns i'th vertex of the first time step */ - __forceinline Vec3ff vertex(size_t i) const { - return vertices0[i]; - } - - /*! returns i'th normal of the first time step */ - __forceinline Vec3fa normal(size_t i) const { - return normals0[i]; - } - - /*! returns i'th tangent of the first time step */ - __forceinline Vec3ff tangent(size_t i) const { - return tangents0[i]; - } - - /*! returns i'th normal derivative of the first time step */ - __forceinline Vec3fa dnormal(size_t i) const { - return dnormals0[i]; - } - - /*! returns i'th radius of the first time step */ - __forceinline float radius(size_t i) const { - return vertices0[i].w; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline Vec3ff vertex(size_t i, size_t itime) const { - return vertices[itime][i]; - } - - /*! returns i'th normal of itime'th timestep */ - __forceinline Vec3fa normal(size_t i, size_t itime) const { - return normals[itime][i]; - } - - /*! returns i'th tangent of itime'th timestep */ - __forceinline Vec3ff tangent(size_t i, size_t itime) const { - return tangents[itime][i]; - } - - /*! returns i'th normal derivative of itime'th timestep */ - __forceinline Vec3fa dnormal(size_t i, size_t itime) const { - return dnormals[itime][i]; - } - - /*! returns i'th radius of itime'th timestep */ - __forceinline float radius(size_t i, size_t itime) const { - return vertices[itime][i].w; - } - - /*! gathers the curve starting with i'th vertex */ - __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const - { - p0 = vertex(i+0); - p1 = vertex(i+1); - p2 = vertex(i+2); - p3 = vertex(i+3); - } - - /*! gathers the curve starting with i'th vertex of itime'th timestep */ - __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const - { - p0 = vertex(i+0,itime); - p1 = vertex(i+1,itime); - p2 = vertex(i+2,itime); - p3 = vertex(i+3,itime); - } - - /*! gathers the curve starting with i'th vertex */ - __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const - { - p0 = vertex(i+0); - p1 = vertex(i+1); - p2 = vertex(i+2); - p3 = vertex(i+3); - n0 = normal(i+0); - n1 = normal(i+1); - n2 = normal(i+2); - n3 = normal(i+3); - } - - /*! gathers the curve starting with i'th vertex of itime'th timestep */ - __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const - { - p0 = vertex(i+0,itime); - p1 = vertex(i+1,itime); - p2 = vertex(i+2,itime); - p3 = vertex(i+3,itime); - n0 = normal(i+0,itime); - n1 = normal(i+1,itime); - n2 = normal(i+2,itime); - n3 = normal(i+3,itime); - } - - /*! prefetches the curve starting with i'th vertex of itime'th timestep */ - __forceinline void prefetchL1_vertices(size_t i) const - { - prefetchL1(vertices0.getPtr(i)+0); - prefetchL1(vertices0.getPtr(i)+64); - } - - /*! prefetches the curve starting with i'th vertex of itime'th timestep */ - __forceinline void prefetchL2_vertices(size_t i) const - { - prefetchL2(vertices0.getPtr(i)+0); - prefetchL2(vertices0.getPtr(i)+64); - } - - /*! loads curve vertices for specified time */ - __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const - { - float ftime; - const size_t itime = timeSegment(time, ftime); - - const float t0 = 1.0f - ftime; - const float t1 = ftime; - Vec3ff a0,a1,a2,a3; - gather(a0,a1,a2,a3,i,itime); - Vec3ff b0,b1,b2,b3; - gather(b0,b1,b2,b3,i,itime+1); - p0 = madd(Vec3ff(t0),a0,t1*b0); - p1 = madd(Vec3ff(t0),a1,t1*b1); - p2 = madd(Vec3ff(t0),a2,t1*b2); - p3 = madd(Vec3ff(t0),a3,t1*b3); - } - - /*! loads curve vertices for specified time */ - __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const - { - float ftime; - const size_t itime = timeSegment(time, ftime); - - const float t0 = 1.0f - ftime; - const float t1 = ftime; - Vec3ff a0,a1,a2,a3; Vec3fa an0,an1,an2,an3; - gather(a0,a1,a2,a3,an0,an1,an2,an3,i,itime); - Vec3ff b0,b1,b2,b3; Vec3fa bn0,bn1,bn2,bn3; - gather(b0,b1,b2,b3,bn0,bn1,bn2,bn3,i,itime+1); - p0 = madd(Vec3ff(t0),a0,t1*b0); - p1 = madd(Vec3ff(t0),a1,t1*b1); - p2 = madd(Vec3ff(t0),a2,t1*b2); - p3 = madd(Vec3ff(t0),a3,t1*b3); - n0 = madd(Vec3ff(t0),an0,t1*bn0); - n1 = madd(Vec3ff(t0),an1,t1*bn1); - n2 = madd(Vec3ff(t0),an2,t1*bn2); - n3 = madd(Vec3ff(t0),an3,t1*bn3); - } - - template - __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const - { - Vec3ff v0,v1,v2,v3; Vec3fa n0,n1,n2,n3; - unsigned int vertexID = curve(primID); - gather(v0,v1,v2,v3,n0,n1,n2,n3,vertexID,itime); - SourceCurve3ff ccurve(v0,v1,v2,v3); - SourceCurve3fa ncurve(n0,n1,n2,n3); - ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); - return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); - } - - template - __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const - { - float ftime; - const size_t itime = timeSegment(time, ftime); - const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedCurve(context,ray_org,primID,itime+0); - const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedCurve(context,ray_org,primID,itime+1); - return clerp(curve0,curve1,ftime); - } - - /*! gathers the hermite curve starting with i'th vertex */ - __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const - { - p0 = vertex (i+0); - p1 = vertex (i+1); - t0 = tangent(i+0); - t1 = tangent(i+1); - } - - /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */ - __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const - { - p0 = vertex (i+0,itime); - p1 = vertex (i+1,itime); - t0 = tangent(i+0,itime); - t1 = tangent(i+1,itime); - } - - /*! loads curve vertices for specified time */ - __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const - { - float ftime; - const size_t itime = timeSegment(time, ftime); - const float f0 = 1.0f - ftime, f1 = ftime; - Vec3ff ap0,at0,ap1,at1; - gather_hermite(ap0,at0,ap1,at1,i,itime); - Vec3ff bp0,bt0,bp1,bt1; - gather_hermite(bp0,bt0,bp1,bt1,i,itime+1); - p0 = madd(Vec3ff(f0),ap0,f1*bp0); - t0 = madd(Vec3ff(f0),at0,f1*bt0); - p1 = madd(Vec3ff(f0),ap1,f1*bp1); - t1 = madd(Vec3ff(f0),at1,f1*bt1); - } - - /*! gathers the hermite curve starting with i'th vertex */ - __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const - { - p0 = vertex (i+0); - p1 = vertex (i+1); - t0 = tangent(i+0); - t1 = tangent(i+1); - n0 = normal(i+0); - n1 = normal(i+1); - dn0 = dnormal(i+0); - dn1 = dnormal(i+1); - } - - /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */ - __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const - { - p0 = vertex (i+0,itime); - p1 = vertex (i+1,itime); - t0 = tangent(i+0,itime); - t1 = tangent(i+1,itime); - n0 = normal(i+0,itime); - n1 = normal(i+1,itime); - dn0 = dnormal(i+0,itime); - dn1 = dnormal(i+1,itime); - } - - /*! loads curve vertices for specified time */ - __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const - { - float ftime; - const size_t itime = timeSegment(time, ftime); - const float f0 = 1.0f - ftime, f1 = ftime; - Vec3ff ap0,at0,ap1,at1; Vec3fa an0,adn0,an1,adn1; - gather_hermite(ap0,at0,an0,adn0,ap1,at1,an1,adn1,i,itime); - Vec3ff bp0,bt0,bp1,bt1; Vec3fa bn0,bdn0,bn1,bdn1; - gather_hermite(bp0,bt0,bn0,bdn0,bp1,bt1,bn1,bdn1,i,itime+1); - p0 = madd(Vec3ff(f0),ap0,f1*bp0); - t0 = madd(Vec3ff(f0),at0,f1*bt0); - n0 = madd(Vec3ff(f0),an0,f1*bn0); - dn0= madd(Vec3ff(f0),adn0,f1*bdn0); - p1 = madd(Vec3ff(f0),ap1,f1*bp1); - t1 = madd(Vec3ff(f0),at1,f1*bt1); - n1 = madd(Vec3ff(f0),an1,f1*bn1); - dn1= madd(Vec3ff(f0),adn1,f1*bdn1); - } - - template - __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const - { - Vec3ff v0,t0,v1,t1; Vec3fa n0,dn0,n1,dn1; - unsigned int vertexID = curve(primID); - gather_hermite(v0,t0,n0,dn0,v1,t1,n1,dn1,vertexID,itime); - - SourceCurve3ff ccurve(v0,t0,v1,t1); - SourceCurve3fa ncurve(n0,dn0,n1,dn1); - ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); - return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); - } - - template - __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const - { - float ftime; - const size_t itime = timeSegment(time, ftime); - const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedHermiteCurve(context, ray_org, primID,itime+0); - const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedHermiteCurve(context, ray_org, primID,itime+1); - return clerp(curve0,curve1,ftime); - } - - private: - void resizeBuffers(unsigned int numSteps); - - public: - BufferView curves; //!< array of curve indices - BufferView vertices0; //!< fast access to first vertex buffer - BufferView normals0; //!< fast access to first normal buffer - BufferView tangents0; //!< fast access to first tangent buffer - BufferView dnormals0; //!< fast access to first normal derivative buffer - vector> vertices; //!< vertex array for each timestep - vector> normals; //!< normal array for each timestep - vector> tangents; //!< tangent array for each timestep - vector> dnormals; //!< normal derivative array for each timestep - BufferView flags; //!< start, end flag per segment - vector> vertexAttribs; //!< user buffers - int tessellationRate; //!< tessellation rate for flat curve - float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii - }; - - DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType); -} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h deleted file mode 100644 index c08658466a..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "geometry.h" -#include "buffer.h" - -namespace embree -{ - /*! Grid Mesh */ - struct GridMesh : public Geometry - { - /*! type of this geometry */ - static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH; - - /*! grid */ - struct Grid - { - unsigned int startVtxID; - unsigned int lineVtxOffset; - unsigned short resX,resY; - - /* border flags due to 3x3 vertex pattern */ - __forceinline unsigned int get3x3FlagsX(const unsigned int x) const - { - return (x + 2 >= (unsigned int)resX) ? (1<<15) : 0; - } - - /* border flags due to 3x3 vertex pattern */ - __forceinline unsigned int get3x3FlagsY(const unsigned int y) const - { - return (y + 2 >= (unsigned int)resY) ? (1<<15) : 0; - } - - /*! outputs grid structure */ - __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) { - return cout << "Grid { startVtxID " << t.startVtxID << ", lineVtxOffset " << t.lineVtxOffset << ", resX " << t.resX << ", resY " << t.resY << " }"; - } - }; - - public: - - /*! grid mesh construction */ - GridMesh (Device* device); - - /* geometry interface */ - public: - void setMask(unsigned mask); - void setNumTimeSteps (unsigned int numTimeSteps); - void setVertexAttributeCount (unsigned int N); - void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); - void* getBuffer(RTCBufferType type, unsigned int slot); - void updateBuffer(RTCBufferType type, unsigned int slot); - void commit(); - bool verify(); - void interpolate(const RTCInterpolateArguments* const args); - void addElementsToCount (GeometryCounts & counts) const; - - __forceinline unsigned int getNumSubGrids(const size_t gridID) - { - const Grid &g = grid(gridID); - return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1)); - } - - /*! get fast access to first vertex buffer */ - __forceinline float * getCompactVertexArray () const { - return (float*) vertices0.getPtr(); - } - - public: - - /*! returns number of vertices */ - __forceinline size_t numVertices() const { - return vertices[0].size(); - } - - /*! returns i'th grid*/ - __forceinline const Grid& grid(size_t i) const { - return grids[i]; - } - - /*! returns i'th vertex of the first time step */ - __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does a unaligned load - return vertices0[i]; - } - - /*! returns i'th vertex of the first time step */ - __forceinline const char* vertexPtr(size_t i) const { - return vertices0.getPtr(i); - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const Vec3fa vertex(size_t i, size_t itime) const { - return vertices[itime][i]; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const char* vertexPtr(size_t i, size_t itime) const { - return vertices[itime].getPtr(i); - } - - /*! returns i'th vertex of the first timestep */ - __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const { - assert(x < (size_t)g.resX); - assert(y < (size_t)g.resY); - return g.startVtxID + x + y * g.lineVtxOffset; - } - - /*! returns i'th vertex of the first timestep */ - __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const { - const size_t index = grid_vertex_index(g,x,y); - return vertex(index); - } - - /*! returns i'th vertex of the itime'th timestep */ - __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const { - const size_t index = grid_vertex_index(g,x,y); - return vertex(index,itime); - } - - /*! calculates the build bounds of the i'th primitive, if it's valid */ - __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const - { - BBox3fa b(empty); - for (size_t t=0; t& itime_range) const - { - if (unlikely(gridID >= grids.size())) return false; - const Grid &g = grid(gridID); - if (unlikely(g.startVtxID + 0 >= vertices0.size())) return false; - if (unlikely(g.startVtxID + (g.resY-1)*g.lineVtxOffset + g.resX-1 >= vertices0.size())) return false; - - for (size_t y=0;y grids; //!< array of triangles - BufferView vertices0; //!< fast access to first vertex buffer - vector> vertices; //!< vertex array for each timestep - vector vertexAttribs; //!< vertex attributes - }; - - namespace isa - { - struct GridMeshISA : public GridMesh - { - GridMeshISA (Device* device) - : GridMesh(device) {} - }; - } - - DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*); -} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_instance.h b/thirdparty/embree-aarch64/kernels/common/scene_instance.h deleted file mode 100644 index 7ff82a4fb8..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene_instance.h +++ /dev/null @@ -1,272 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "geometry.h" -#include "accel.h" - -namespace embree -{ - struct MotionDerivativeCoefficients; - - /*! Instanced acceleration structure */ - struct Instance : public Geometry - { - ALIGNED_STRUCT_(16); - static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE; - - public: - Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1); - ~Instance(); - - private: - Instance (const Instance& other) DELETED; // do not implement - Instance& operator= (const Instance& other) DELETED; // do not implement - - private: - LBBox3fa nonlinearBounds(const BBox1f& time_range_in, - const BBox1f& geom_time_range, - float geom_time_segments) const; - - BBox3fa boundSegment(size_t itime, - BBox3fa const& obbox0, BBox3fa const& obbox1, - BBox3fa const& bbox0, BBox3fa const& bbox1, - float t_min, float t_max) const; - - /* calculates the (correct) interpolated bounds */ - __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const - { - if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) - return xfmBounds(slerp(local2world[itime0], local2world[itime1], f), - lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); - return xfmBounds(lerp(local2world[itime0], local2world[itime1], f), - lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); - } - - public: - virtual void setNumTimeSteps (unsigned int numTimeSteps) override; - virtual void setInstancedScene(const Ref& scene) override; - virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override; - virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override; - virtual AffineSpace3fa getTransform(float time) override; - virtual void setMask (unsigned mask) override; - virtual void build() {} - virtual void addElementsToCount (GeometryCounts & counts) const override; - virtual void commit() override; - - public: - - /*! calculates the bounds of instance */ - __forceinline BBox3fa bounds(size_t i) const { - assert(i == 0); - if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) - return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds()); - return xfmBounds(local2world[0],object->bounds.bounds()); - } - - /*! gets the bounds of the instanced scene */ - __forceinline BBox3fa getObjectBounds(size_t itime) const { - return object->getBounds(timeStep(itime)); - } - - /*! calculates the bounds of instance */ - __forceinline BBox3fa bounds(size_t i, size_t itime) const { - assert(i == 0); - if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) - return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),getObjectBounds(itime)); - return xfmBounds(local2world[itime],getObjectBounds(itime)); - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const { - assert(i == 0); - LBBox3fa lbbox = nonlinearBounds(dt, time_range, fnumTimeSegments); - return lbbox; - } - - /*! calculates the build bounds of the i'th item, if it's valid */ - __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const - { - assert(i==0); - const BBox3fa b = bounds(i); - if (bbox) *bbox = b; - return isvalid(b); - } - - /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ - __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const - { - assert(i==0); - const LBBox3fa bounds = linearBounds(i,itime); - bbox = bounds.bounds (); - return isvalid(bounds); - } - - /* gets version info of topology */ - unsigned int getTopologyVersion() const { - return numPrimitives; - } - - /* returns true if topology changed */ - bool topologyChanged(unsigned int otherVersion) const { - return numPrimitives != otherVersion; - } - - /*! check if the i'th primitive is valid between the specified time range */ - __forceinline bool valid(size_t i, const range& itime_range) const - { - assert(i == 0); - for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) - if (!isvalid(bounds(i,itime))) return false; - - return true; - } - - __forceinline AffineSpace3fa getLocal2World() const - { - if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) - return quaternionDecompositionToAffineSpace(local2world[0]); - return local2world[0]; - } - - __forceinline AffineSpace3fa getLocal2World(float t) const - { - float ftime; const unsigned int itime = timeSegment(t, ftime); - if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) - return slerp(local2world[itime+0],local2world[itime+1],ftime); - return lerp(local2world[itime+0],local2world[itime+1],ftime); - } - - __forceinline AffineSpace3fa getWorld2Local() const { - return world2local0; - } - - __forceinline AffineSpace3fa getWorld2Local(float t) const { - return rcp(getLocal2World(t)); - } - - template - __forceinline AffineSpace3vf getWorld2Local(const vbool& valid, const vfloat& t) const - { - if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) - return getWorld2LocalSlerp(valid, t); - return getWorld2LocalLerp(valid, t); - } - - private: - - template - __forceinline AffineSpace3vf getWorld2LocalSlerp(const vbool& valid, const vfloat& t) const - { - vfloat ftime; - const vint itime_k = timeSegment(t, ftime); - assert(any(valid)); - const size_t index = bsf(movemask(valid)); - const int itime = itime_k[index]; - if (likely(all(valid, itime_k == vint(itime)))) { - return rcp(slerp(AffineSpace3vff(local2world[itime+0]), - AffineSpace3vff(local2world[itime+1]), - ftime)); - } - else { - AffineSpace3vff space0,space1; - vbool valid1 = valid; - while (any(valid1)) { - vbool valid2; - const int itime = next_unique(valid1, itime_k, valid2); - space0 = select(valid2, AffineSpace3vff(local2world[itime+0]), space0); - space1 = select(valid2, AffineSpace3vff(local2world[itime+1]), space1); - } - return rcp(slerp(space0, space1, ftime)); - } - } - - template - __forceinline AffineSpace3vf getWorld2LocalLerp(const vbool& valid, const vfloat& t) const - { - vfloat ftime; - const vint itime_k = timeSegment(t, ftime); - assert(any(valid)); - const size_t index = bsf(movemask(valid)); - const int itime = itime_k[index]; - if (likely(all(valid, itime_k == vint(itime)))) { - return rcp(lerp(AffineSpace3vf((AffineSpace3fa)local2world[itime+0]), - AffineSpace3vf((AffineSpace3fa)local2world[itime+1]), - ftime)); - } else { - AffineSpace3vf space0,space1; - vbool valid1 = valid; - while (any(valid1)) { - vbool valid2; - const int itime = next_unique(valid1, itime_k, valid2); - space0 = select(valid2, AffineSpace3vf((AffineSpace3fa)local2world[itime+0]), space0); - space1 = select(valid2, AffineSpace3vf((AffineSpace3fa)local2world[itime+1]), space1); - } - return rcp(lerp(space0, space1, ftime)); - } - } - - public: - Accel* object; //!< pointer to instanced acceleration structure - AffineSpace3ff* local2world; //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition) - AffineSpace3fa world2local0; //!< transformation from world space to local space for timestep 0 - }; - - namespace isa - { - struct InstanceISA : public Instance - { - InstanceISA (Device* device) - : Instance(device) {} - - PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const - { - assert(r.begin() == 0); - assert(r.end() == 1); - - PrimInfo pinfo(empty); - BBox3fa b = empty; - if (!buildBounds(0,&b)) return pinfo; - // const BBox3fa b = bounds(0); - // if (!isvalid(b)) return pinfo; - - const PrimRef prim(b,geomID,unsigned(0)); - pinfo.add_center2(prim); - prims[k++] = prim; - return pinfo; - } - - PrimInfo createPrimRefArrayMB(mvector& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const - { - assert(r.begin() == 0); - assert(r.end() == 1); - - PrimInfo pinfo(empty); - BBox3fa b = empty; - if (!buildBounds(0,&b)) return pinfo; - // if (!valid(0,range(itime))) return pinfo; - // const PrimRef prim(linearBounds(0,itime).bounds(),geomID,unsigned(0)); - const PrimRef prim(b,geomID,unsigned(0)); - pinfo.add_center2(prim); - prims[k++] = prim; - return pinfo; - } - - PrimInfoMB createPrimRefMBArray(mvector& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const - { - assert(r.begin() == 0); - assert(r.end() == 1); - - PrimInfoMB pinfo(empty); - if (!valid(0, timeSegmentRange(t0t1))) return pinfo; - const PrimRefMB prim(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0)); - pinfo.add_primref(prim); - prims[k++] = prim; - return pinfo; - } - }; - } - - DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*); -} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h deleted file mode 100644 index c0f9ee8f77..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h +++ /dev/null @@ -1,307 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" -#include "geometry.h" -#include "buffer.h" - -namespace embree -{ - /*! represents an array of line segments */ - struct LineSegments : public Geometry - { - /*! type of this geometry */ - static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2; - - public: - - /*! line segments construction */ - LineSegments (Device* device, Geometry::GType gtype); - - public: - void setMask (unsigned mask); - void setNumTimeSteps (unsigned int numTimeSteps); - void setVertexAttributeCount (unsigned int N); - void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); - void* getBuffer(RTCBufferType type, unsigned int slot); - void updateBuffer(RTCBufferType type, unsigned int slot); - void commit(); - bool verify (); - void interpolate(const RTCInterpolateArguments* const args); - void setTessellationRate(float N); - void setMaxRadiusScale(float s); - void addElementsToCount (GeometryCounts & counts) const; - - public: - - /*! returns the number of vertices */ - __forceinline size_t numVertices() const { - return vertices[0].size(); - } - - /*! returns the i'th segment */ - __forceinline const unsigned int& segment(size_t i) const { - return segments[i]; - } - - /*! returns the segment to the left of the i'th segment */ - __forceinline bool segmentLeftExists(size_t i) const { - assert (flags); - return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0; - } - - /*! returns the segment to the right of the i'th segment */ - __forceinline bool segmentRightExists(size_t i) const { - assert (flags); - return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0; - } - - /*! returns i'th vertex of the first time step */ - __forceinline Vec3ff vertex(size_t i) const { - return vertices0[i]; - } - - /*! returns i'th vertex of the first time step */ - __forceinline const char* vertexPtr(size_t i) const { - return vertices0.getPtr(i); - } - - /*! returns i'th normal of the first time step */ - __forceinline Vec3fa normal(size_t i) const { - return normals0[i]; - } - - /*! returns i'th radius of the first time step */ - __forceinline float radius(size_t i) const { - return vertices0[i].w; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline Vec3ff vertex(size_t i, size_t itime) const { - return vertices[itime][i]; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const char* vertexPtr(size_t i, size_t itime) const { - return vertices[itime].getPtr(i); - } - - /*! returns i'th normal of itime'th timestep */ - __forceinline Vec3fa normal(size_t i, size_t itime) const { - return normals[itime][i]; - } - - /*! returns i'th radius of itime'th timestep */ - __forceinline float radius(size_t i, size_t itime) const { - return vertices[itime][i].w; - } - - /*! calculates bounding box of i'th line segment */ - __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const - { - const BBox3ff b = merge(BBox3ff(v0),BBox3ff(v1)); - return enlarge((BBox3fa)b,maxRadiusScale*Vec3fa(max(v0.w,v1.w))); - } - - /*! calculates bounding box of i'th line segment */ - __forceinline BBox3fa bounds(size_t i) const - { - const unsigned int index = segment(i); - const Vec3ff v0 = vertex(index+0); - const Vec3ff v1 = vertex(index+1); - return bounds(v0,v1); - } - - /*! calculates bounding box of i'th line segment for the itime'th time step */ - __forceinline BBox3fa bounds(size_t i, size_t itime) const - { - const unsigned int index = segment(i); - const Vec3ff v0 = vertex(index+0,itime); - const Vec3ff v1 = vertex(index+1,itime); - return bounds(v0,v1); - } - - /*! calculates bounding box of i'th line segment */ - __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const - { - const unsigned int index = segment(i); - const Vec3ff v0 = vertex(index+0); - const Vec3ff v1 = vertex(index+1); - const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); - const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); - return bounds(w0,w1); - } - - /*! calculates bounding box of i'th line segment for the itime'th time step */ - __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const - { - const unsigned int index = segment(i); - const Vec3ff v0 = vertex(index+0,itime); - const Vec3ff v1 = vertex(index+1,itime); - const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); - const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); - return bounds(w0,w1); - } - - /*! check if the i'th primitive is valid at the itime'th timestep */ - __forceinline bool valid(size_t i, size_t itime) const { - return valid(i, make_range(itime, itime)); - } - - /*! check if the i'th primitive is valid between the specified time range */ - __forceinline bool valid(size_t i, const range& itime_range) const - { - const unsigned int index = segment(i); - if (index+1 >= numVertices()) return false; - - for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) - { - const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false; - const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false; - if (min(v0.w,v1.w) < 0.0f) return false; - } - return true; - } - - /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ - __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { - return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); - } - - /*! calculates the build bounds of the i'th primitive, if it's valid */ - __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const - { - if (!valid(i,0)) return false; - *bbox = bounds(i); - return true; - } - - /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ - __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const - { - if (!valid(i,itime+0) || !valid(i,itime+1)) return false; - bbox = bounds(i,itime); // use bounds of first time step in builder - return true; - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { - return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { - return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const - { - if (!valid(i, timeSegmentRange(time_range))) return false; - bbox = linearBounds(i, time_range); - return true; - } - - /*! get fast access to first vertex buffer */ - __forceinline float * getCompactVertexArray () const { - return (float*) vertices0.getPtr(); - } - - public: - BufferView segments; //!< array of line segment indices - BufferView vertices0; //!< fast access to first vertex buffer - BufferView normals0; //!< fast access to first normal buffer - BufferView flags; //!< start, end flag per segment - vector> vertices; //!< vertex array for each timestep - vector> normals; //!< normal array for each timestep - vector> vertexAttribs; //!< user buffers - int tessellationRate; //!< tessellation rate for bezier curve - float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii - }; - - namespace isa - { - struct LineSegmentsISA : public LineSegments - { - LineSegmentsISA (Device* device, Geometry::GType gtype) - : LineSegments(device,gtype) {} - - Vec3fa computeDirection(unsigned int primID) const - { - const unsigned vtxID = segment(primID); - const Vec3fa v0 = vertex(vtxID+0); - const Vec3fa v1 = vertex(vtxID+1); - return v1-v0; - } - - Vec3fa computeDirection(unsigned int primID, size_t time) const - { - const unsigned vtxID = segment(primID); - const Vec3fa v0 = vertex(vtxID+0,time); - const Vec3fa v1 = vertex(vtxID+1,time); - return v1-v0; - } - - PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); j& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); j& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const - { - PrimInfoMB pinfo(empty); - for (size_t j=r.begin(); jnumTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); - pinfo.add_primref(prim); - prims[k++] = prim; - } - return pinfo; - } - - BBox3fa vbounds(size_t i) const { - return bounds(i); - } - - BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const { - return bounds(space,i); - } - - LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { - return linearBounds(primID,time_range); - } - - LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { - return linearBounds(space,primID,time_range); - } - }; - } - - DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType); -} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_points.h b/thirdparty/embree-aarch64/kernels/common/scene_points.h deleted file mode 100644 index 1d39ed07ba..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene_points.h +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "buffer.h" -#include "default.h" -#include "geometry.h" - -namespace embree -{ - /*! represents an array of points */ - struct Points : public Geometry - { - /*! type of this geometry */ - static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS; - - public: - /*! line segments construction */ - Points(Device* device, Geometry::GType gtype); - - public: - void setMask(unsigned mask); - void setNumTimeSteps(unsigned int numTimeSteps); - void setVertexAttributeCount(unsigned int N); - void setBuffer(RTCBufferType type, - unsigned int slot, - RTCFormat format, - const Ref& buffer, - size_t offset, - size_t stride, - unsigned int num); - void* getBuffer(RTCBufferType type, unsigned int slot); - void updateBuffer(RTCBufferType type, unsigned int slot); - void commit(); - bool verify(); - void setMaxRadiusScale(float s); - void addElementsToCount (GeometryCounts & counts) const; - - public: - /*! returns the number of vertices */ - __forceinline size_t numVertices() const { - return vertices[0].size(); - } - - /*! returns i'th vertex of the first time step */ - __forceinline Vec3ff vertex(size_t i) const { - return vertices0[i]; - } - - /*! returns i'th vertex of the first time step */ - __forceinline const char* vertexPtr(size_t i) const { - return vertices0.getPtr(i); - } - - /*! returns i'th normal of the first time step */ - __forceinline Vec3fa normal(size_t i) const { - return normals0[i]; - } - - /*! returns i'th radius of the first time step */ - __forceinline float radius(size_t i) const { - return vertices0[i].w; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline Vec3ff vertex(size_t i, size_t itime) const { - return vertices[itime][i]; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const char* vertexPtr(size_t i, size_t itime) const { - return vertices[itime].getPtr(i); - } - - /*! returns i'th normal of itime'th timestep */ - __forceinline Vec3fa normal(size_t i, size_t itime) const { - return normals[itime][i]; - } - - /*! returns i'th radius of itime'th timestep */ - __forceinline float radius(size_t i, size_t itime) const { - return vertices[itime][i].w; - } - - /*! calculates bounding box of i'th line segment */ - __forceinline BBox3fa bounds(const Vec3ff& v0) const { - return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w)); - } - - /*! calculates bounding box of i'th line segment */ - __forceinline BBox3fa bounds(size_t i) const - { - const Vec3ff v0 = vertex(i); - return bounds(v0); - } - - /*! calculates bounding box of i'th line segment for the itime'th time step */ - __forceinline BBox3fa bounds(size_t i, size_t itime) const - { - const Vec3ff v0 = vertex(i, itime); - return bounds(v0); - } - - /*! calculates bounding box of i'th line segment */ - __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const - { - const Vec3ff v0 = vertex(i); - const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); - return bounds(w0); - } - - /*! calculates bounding box of i'th line segment for the itime'th time step */ - __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const - { - const Vec3ff v0 = vertex(i, itime); - const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); - return bounds(w0); - } - - /*! check if the i'th primitive is valid at the itime'th timestep */ - __forceinline bool valid(size_t i, size_t itime) const { - return valid(i, make_range(itime, itime)); - } - - /*! check if the i'th primitive is valid between the specified time range */ - __forceinline bool valid(size_t i, const range& itime_range) const - { - const unsigned int index = (unsigned int)i; - if (index >= numVertices()) - return false; - - for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) { - const Vec3ff v0 = vertex(index + 0, itime); - if (unlikely(!isvalid4(v0))) - return false; - if (v0.w < 0.0f) - return false; - } - return true; - } - - /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ - __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { - return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1)); - } - - /*! calculates the build bounds of the i'th primitive, if it's valid */ - __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const - { - if (!valid(i, 0)) - return false; - *bbox = bounds(i); - return true; - } - - /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ - __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const - { - if (!valid(i, itime + 0) || !valid(i, itime + 1)) - return false; - bbox = bounds(i, itime); // use bounds of first time step in builder - return true; - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { - return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { - return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const - { - if (!valid(i, timeSegmentRange(time_range))) return false; - bbox = linearBounds(i, time_range); - return true; - } - - /*! get fast access to first vertex buffer */ - __forceinline float * getCompactVertexArray () const { - return (float*) vertices0.getPtr(); - } - - public: - BufferView vertices0; //!< fast access to first vertex buffer - BufferView normals0; //!< fast access to first normal buffer - vector> vertices; //!< vertex array for each timestep - vector> normals; //!< normal array for each timestep - vector> vertexAttribs; //!< user buffers - float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii - }; - - namespace isa - { - struct PointsISA : public Points - { - PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {} - - Vec3fa computeDirection(unsigned int primID) const - { - return Vec3fa(1, 0, 0); - } - - Vec3fa computeDirection(unsigned int primID, size_t time) const - { - return Vec3fa(1, 0, 0); - } - - PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j = r.begin(); j < r.end(); j++) { - BBox3fa bounds = empty; - if (!buildBounds(j, &bounds)) - continue; - const PrimRef prim(bounds, geomID, unsigned(j)); - pinfo.add_center2(prim); - prims[k++] = prim; - } - return pinfo; - } - - PrimInfo createPrimRefArrayMB(mvector& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j = r.begin(); j < r.end(); j++) { - BBox3fa bounds = empty; - if (!buildBounds(j, itime, bounds)) - continue; - const PrimRef prim(bounds, geomID, unsigned(j)); - pinfo.add_center2(prim); - prims[k++] = prim; - } - return pinfo; - } - - PrimInfoMB createPrimRefMBArray(mvector& prims, - const BBox1f& t0t1, - const range& r, - size_t k, - unsigned int geomID) const - { - PrimInfoMB pinfo(empty); - for (size_t j = r.begin(); j < r.end(); j++) { - if (!valid(j, timeSegmentRange(t0t1))) - continue; - const PrimRefMB prim(linearBounds(j, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(j)); - pinfo.add_primref(prim); - prims[k++] = prim; - } - return pinfo; - } - - BBox3fa vbounds(size_t i) const - { - return bounds(i); - } - - BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const - { - return bounds(space, i); - } - - LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const - { - return linearBounds(primID, time_range); - } - - LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const - { - return linearBounds(space, primID, time_range); - } - }; - } // namespace isa - - DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType); -} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h deleted file mode 100644 index d5bb054b14..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "geometry.h" -#include "buffer.h" - -namespace embree -{ - /*! Quad Mesh */ - struct QuadMesh : public Geometry - { - /*! type of this geometry */ - static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH; - - /*! triangle indices */ - struct Quad - { - uint32_t v[4]; - - /*! outputs triangle indices */ - __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) { - return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }"; - } - }; - - public: - - /*! quad mesh construction */ - QuadMesh (Device* device); - - /* geometry interface */ - public: - void setMask(unsigned mask); - void setNumTimeSteps (unsigned int numTimeSteps); - void setVertexAttributeCount (unsigned int N); - void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); - void* getBuffer(RTCBufferType type, unsigned int slot); - void updateBuffer(RTCBufferType type, unsigned int slot); - void commit(); - bool verify(); - void interpolate(const RTCInterpolateArguments* const args); - void addElementsToCount (GeometryCounts & counts) const; - - public: - - /*! returns number of vertices */ - __forceinline size_t numVertices() const { - return vertices[0].size(); - } - - /*! returns i'th quad */ - __forceinline const Quad& quad(size_t i) const { - return quads[i]; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const Vec3fa vertex(size_t i) const { - return vertices0[i]; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const char* vertexPtr(size_t i) const { - return vertices0.getPtr(i); - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const Vec3fa vertex(size_t i, size_t itime) const { - return vertices[itime][i]; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const char* vertexPtr(size_t i, size_t itime) const { - return vertices[itime].getPtr(i); - } - - /*! calculates the bounds of the i'th quad */ - __forceinline BBox3fa bounds(size_t i) const - { - const Quad& q = quad(i); - const Vec3fa v0 = vertex(q.v[0]); - const Vec3fa v1 = vertex(q.v[1]); - const Vec3fa v2 = vertex(q.v[2]); - const Vec3fa v3 = vertex(q.v[3]); - return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); - } - - /*! calculates the bounds of the i'th quad at the itime'th timestep */ - __forceinline BBox3fa bounds(size_t i, size_t itime) const - { - const Quad& q = quad(i); - const Vec3fa v0 = vertex(q.v[0],itime); - const Vec3fa v1 = vertex(q.v[1],itime); - const Vec3fa v2 = vertex(q.v[2],itime); - const Vec3fa v3 = vertex(q.v[3],itime); - return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); - } - - /*! check if the i'th primitive is valid at the itime'th timestep */ - __forceinline bool valid(size_t i, size_t itime) const { - return valid(i, make_range(itime, itime)); - } - - /*! check if the i'th primitive is valid between the specified time range */ - __forceinline bool valid(size_t i, const range& itime_range) const - { - const Quad& q = quad(i); - if (unlikely(q.v[0] >= numVertices())) return false; - if (unlikely(q.v[1] >= numVertices())) return false; - if (unlikely(q.v[2] >= numVertices())) return false; - if (unlikely(q.v[3] >= numVertices())) return false; - - for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) - { - if (!isvalid(vertex(q.v[0],itime))) return false; - if (!isvalid(vertex(q.v[1],itime))) return false; - if (!isvalid(vertex(q.v[2],itime))) return false; - if (!isvalid(vertex(q.v[3],itime))) return false; - } - - return true; - } - - /*! calculates the linear bounds of the i'th quad at the itimeGlobal'th time segment */ - __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { - return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); - } - - /*! calculates the build bounds of the i'th primitive, if it's valid */ - __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const - { - const Quad& q = quad(i); - if (q.v[0] >= numVertices()) return false; - if (q.v[1] >= numVertices()) return false; - if (q.v[2] >= numVertices()) return false; - if (q.v[3] >= numVertices()) return false; - - for (unsigned int t=0; t= numVertices())) return false; - if (unlikely(q.v[1] >= numVertices())) return false; - if (unlikely(q.v[2] >= numVertices())) return false; - if (unlikely(q.v[3] >= numVertices())) return false; - - assert(itime+1 < numTimeSteps); - const Vec3fa a0 = vertex(q.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; - const Vec3fa a1 = vertex(q.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; - const Vec3fa a2 = vertex(q.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; - const Vec3fa a3 = vertex(q.v[3],itime+0); if (unlikely(!isvalid(a3))) return false; - const Vec3fa b0 = vertex(q.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; - const Vec3fa b1 = vertex(q.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; - const Vec3fa b2 = vertex(q.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; - const Vec3fa b3 = vertex(q.v[3],itime+1); if (unlikely(!isvalid(b3))) return false; - - /* use bounds of first time step in builder */ - bbox = BBox3fa(min(a0,a1,a2,a3),max(a0,a1,a2,a3)); - return true; - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { - return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const - { - if (!valid(i, timeSegmentRange(dt))) return false; - bbox = linearBounds(i, dt); - return true; - } - - /*! get fast access to first vertex buffer */ - __forceinline float * getCompactVertexArray () const { - return (float*) vertices0.getPtr(); - } - - /* gets version info of topology */ - unsigned int getTopologyVersion() const { - return quads.modCounter; - } - - /* returns true if topology changed */ - bool topologyChanged(unsigned int otherVersion) const { - return quads.isModified(otherVersion); // || numPrimitivesChanged; - } - - /* returns the projected area */ - __forceinline float projectedPrimitiveArea(const size_t i) const { - const Quad& q = quad(i); - const Vec3fa v0 = vertex(q.v[0]); - const Vec3fa v1 = vertex(q.v[1]); - const Vec3fa v2 = vertex(q.v[2]); - const Vec3fa v3 = vertex(q.v[3]); - return areaProjectedTriangle(v0,v1,v3) + - areaProjectedTriangle(v1,v2,v3); - } - - public: - BufferView quads; //!< array of quads - BufferView vertices0; //!< fast access to first vertex buffer - vector> vertices; //!< vertex array for each timestep - vector> vertexAttribs; //!< vertex attribute buffers - }; - - namespace isa - { - struct QuadMeshISA : public QuadMesh - { - QuadMeshISA (Device* device) - : QuadMesh(device) {} - - PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); j& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); j& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const - { - PrimInfoMB pinfo(empty); - for (size_t j=r.begin(); jnumTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); - pinfo.add_primref(prim); - prims[k++] = prim; - } - return pinfo; - } - }; - } - - DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*); -} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h deleted file mode 100644 index d0246009db..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h +++ /dev/null @@ -1,326 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "geometry.h" -#include "buffer.h" -#include "../subdiv/half_edge.h" -#include "../subdiv/tessellation_cache.h" -#include "../subdiv/catmullclark_coefficients.h" -#include "../subdiv/patch.h" -#include "../../common/algorithms/parallel_map.h" -#include "../../common/algorithms/parallel_set.h" - -namespace embree -{ - class SubdivMesh : public Geometry - { - ALIGNED_CLASS_(16); - public: - - typedef HalfEdge::Edge Edge; - - /*! type of this geometry */ - static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH; - - /*! structure used to sort half edges using radix sort by their key */ - struct KeyHalfEdge - { - KeyHalfEdge() {} - - KeyHalfEdge (uint64_t key, HalfEdge* edge) - : key(key), edge(edge) {} - - __forceinline operator uint64_t() const { - return key; - } - - friend __forceinline bool operator<(const KeyHalfEdge& e0, const KeyHalfEdge& e1) { - return e0.key < e1.key; - } - - public: - uint64_t key; - HalfEdge* edge; - }; - - public: - - /*! subdiv mesh construction */ - SubdivMesh(Device* device); - - public: - void setMask (unsigned mask); - void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode); - void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID); - void setNumTimeSteps (unsigned int numTimeSteps); - void setVertexAttributeCount (unsigned int N); - void setTopologyCount (unsigned int N); - void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); - void* getBuffer(RTCBufferType type, unsigned int slot); - void updateBuffer(RTCBufferType type, unsigned int slot); - void setTessellationRate(float N); - bool verify(); - void commit(); - void addElementsToCount (GeometryCounts & counts) const; - void setDisplacementFunction (RTCDisplacementFunctionN func); - unsigned int getFirstHalfEdge(unsigned int faceID); - unsigned int getFace(unsigned int edgeID); - unsigned int getNextHalfEdge(unsigned int edgeID); - unsigned int getPreviousHalfEdge(unsigned int edgeID); - unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID); - - public: - - /*! return the number of faces */ - size_t numFaces() const { - return faceVertices.size(); - } - - /*! return the number of edges */ - size_t numEdges() const { - return topology[0].vertexIndices.size(); - } - - /*! return the number of vertices */ - size_t numVertices() const { - return vertices[0].size(); - } - - /*! calculates the bounds of the i'th subdivision patch at the j'th timestep */ - __forceinline BBox3fa bounds(size_t i, size_t j = 0) const { - return topology[0].getHalfEdge(i)->bounds(vertices[j]); - } - - /*! check if the i'th primitive is valid */ - __forceinline bool valid(size_t i) const { - return topology[0].valid(i) && !invalidFace(i); - } - - /*! check if the i'th primitive is valid for the j'th time range */ - __forceinline bool valid(size_t i, size_t j) const { - return topology[0].valid(i) && !invalidFace(i,j); - } - - /*! prints some statistics */ - void printStatistics(); - - /*! initializes the half edge data structure */ - void initializeHalfEdgeStructures (); - - public: - - /*! returns the vertex buffer for some time step */ - __forceinline const BufferView& getVertexBuffer( const size_t t = 0 ) const { - return vertices[t]; - } - - /* returns tessellation level of edge */ - __forceinline float getEdgeLevel(const size_t i) const - { - if (levels) return clamp(levels[i],1.0f,4096.0f); // FIXME: do we want to limit edge level? - else return clamp(tessellationRate,1.0f,4096.0f); // FIXME: do we want to limit edge level? - } - - public: - RTCDisplacementFunctionN displFunc; //!< displacement function - - /*! all buffers in this section are provided by the application */ - public: - - /*! the topology contains all data that may differ when - * interpolating different user data buffers */ - struct Topology - { - public: - - /*! Default topology construction */ - Topology () : halfEdges(nullptr,0) {} - - /*! Topology initialization */ - Topology (SubdivMesh* mesh); - - /*! make the class movable */ - public: - Topology (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows - : mesh(std::move(other.mesh)), - vertexIndices(std::move(other.vertexIndices)), - subdiv_mode(std::move(other.subdiv_mode)), - halfEdges(std::move(other.halfEdges)), - halfEdges0(std::move(other.halfEdges0)), - halfEdges1(std::move(other.halfEdges1)) {} - - Topology& operator= (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows - { - mesh = std::move(other.mesh); - vertexIndices = std::move(other.vertexIndices); - subdiv_mode = std::move(other.subdiv_mode); - halfEdges = std::move(other.halfEdges); - halfEdges0 = std::move(other.halfEdges0); - halfEdges1 = std::move(other.halfEdges1); - return *this; - } - - public: - /*! check if the i'th primitive is valid in this topology */ - __forceinline bool valid(size_t i) const - { - if (unlikely(subdiv_mode == RTC_SUBDIVISION_MODE_NO_BOUNDARY)) { - if (getHalfEdge(i)->faceHasBorder()) return false; - } - return true; - } - - /*! updates the interpolation mode for the topology */ - void setSubdivisionMode (RTCSubdivisionMode mode); - - /*! marks all buffers as modified */ - void update (); - - /*! verifies index array */ - bool verify (size_t numVertices); - - /*! initializes the half edge data structure */ - void initializeHalfEdgeStructures (); - - private: - - /*! recalculates the half edges */ - void calculateHalfEdges(); - - /*! updates half edges when recalculation is not necessary */ - void updateHalfEdges(); - - /*! user input data */ - public: - - SubdivMesh* mesh; - - /*! indices of the vertices composing each face */ - BufferView vertexIndices; - - /*! subdiv interpolation mode */ - RTCSubdivisionMode subdiv_mode; - - /*! generated data */ - public: - - /*! returns the start half edge for face f */ - __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const { - return &halfEdges[mesh->faceStartEdge[f]]; - } - - /*! Half edge structure, generated by initHalfEdgeStructures */ - mvector halfEdges; - - /*! the following data is only required during construction of the - * half edge structure and can be cleared for static scenes */ - private: - - /*! two arrays used to sort the half edges */ - std::vector halfEdges0; - std::vector halfEdges1; - }; - - /*! returns the start half edge for topology t and face f */ - __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const { - return topology[t].getHalfEdge(f); - } - - /*! buffer containing the number of vertices for each face */ - BufferView faceVertices; - - /*! array of topologies */ - vector topology; - - /*! vertex buffer (one buffer for each time step) */ - vector> vertices; - - /*! user data buffers */ - vector vertexAttribs; - - /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */ - BufferView edge_creases; - - /*! edge crease weights for each edge of the edge_creases buffer */ - BufferView edge_crease_weights; - - /*! vertex crease buffer containing all vertices that carry vertex crease weights */ - BufferView vertex_creases; - - /*! vertex crease weights for each vertex of the vertex_creases buffer */ - BufferView vertex_crease_weights; - - /*! subdivision level for each half edge of the vertexIndices buffer */ - BufferView levels; - float tessellationRate; // constant rate that is used when levels is not set - - /*! buffer that marks specific faces as holes */ - BufferView holes; - - /*! all data in this section is generated by initializeHalfEdgeStructures function */ - private: - - /*! number of half edges used by faces */ - size_t numHalfEdges; - - /*! fast lookup table to find the first half edge for some face */ - mvector faceStartEdge; - - /*! fast lookup table to find the face for some half edge */ - mvector halfEdgeFace; - - /*! set with all holes */ - parallel_set holeSet; - - /*! fast lookup table to detect invalid faces */ - mvector invalid_face; - - /*! test if face i is invalid in timestep j */ - __forceinline int8_t& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; } - __forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; } - - /*! interpolation cache */ - public: - static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15)/16; } - static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31)/32; } - static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) { - const size_t slots = numInterpolationSlots4(stride); - assert(slot < slots); - return slots*prim+slot; - } - std::vector> vertex_buffer_tags; - std::vector> vertex_attrib_buffer_tags; - std::vector patch_eval_trees; - - /*! the following data is only required during construction of the - * half edge structure and can be cleared for static scenes */ - private: - - /*! map with all vertex creases */ - parallel_map vertexCreaseMap; - - /*! map with all edge creases */ - parallel_map edgeCreaseMap; - - protected: - - /*! counts number of geometry commits */ - size_t commitCounter; - }; - - namespace isa - { - struct SubdivMeshISA : public SubdivMesh - { - SubdivMeshISA (Device* device) - : SubdivMesh(device) {} - - void interpolate(const RTCInterpolateArguments* const args); - void interpolateN(const RTCInterpolateNArguments* const args); - }; - } - - DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*); -}; diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp deleted file mode 100644 index d1c2750f14..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp +++ /dev/null @@ -1,243 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "scene_triangle_mesh.h" -#include "scene.h" - -namespace embree -{ -#if defined(EMBREE_LOWEST_ISA) - - TriangleMesh::TriangleMesh (Device* device) - : Geometry(device,GTY_TRIANGLE_MESH,0,1) - { - vertices.resize(numTimeSteps); - } - - void TriangleMesh::setMask (unsigned mask) - { - this->mask = mask; - Geometry::update(); - } - - void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps) - { - vertices.resize(numTimeSteps); - Geometry::setNumTimeSteps(numTimeSteps); - } - - void TriangleMesh::setVertexAttributeCount (unsigned int N) - { - vertexAttribs.resize(N); - Geometry::update(); - } - - void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num) - { - /* verify that all accesses are 4 bytes aligned */ - if (((size_t(buffer->getPtr()) + offset) & 0x3) || (stride & 0x3)) - throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned"); - - if (type == RTC_BUFFER_TYPE_VERTEX) - { - if (format != RTC_FORMAT_FLOAT3) - throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format"); - - /* if buffer is larger than 16GB the premultiplied index optimization does not work */ - if (stride*num > 16ll*1024ll*1024ll*1024ll) - throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large"); - - if (slot >= vertices.size()) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot"); - - vertices[slot].set(buffer, offset, stride, num, format); - vertices[slot].checkPadding16(); - vertices0 = vertices[0]; - } - else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) - { - if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16) - throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format"); - - if (slot >= vertexAttribs.size()) - throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot"); - - vertexAttribs[slot].set(buffer, offset, stride, num, format); - vertexAttribs[slot].checkPadding16(); - } - else if (type == RTC_BUFFER_TYPE_INDEX) - { - if (slot != 0) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); - if (format != RTC_FORMAT_UINT3) - throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format"); - - triangles.set(buffer, offset, stride, num, format); - setNumPrimitives(num); - } - else - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); - } - - void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot) - { - if (type == RTC_BUFFER_TYPE_INDEX) - { - if (slot != 0) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); - return triangles.getPtr(); - } - else if (type == RTC_BUFFER_TYPE_VERTEX) - { - if (slot >= vertices.size()) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); - return vertices[slot].getPtr(); - } - else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) - { - if (slot >= vertexAttribs.size()) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); - return vertexAttribs[slot].getPtr(); - } - else - { - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); - return nullptr; - } - } - - void TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot) - { - if (type == RTC_BUFFER_TYPE_INDEX) - { - if (slot != 0) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); - triangles.setModified(); - } - else if (type == RTC_BUFFER_TYPE_VERTEX) - { - if (slot >= vertices.size()) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); - vertices[slot].setModified(); - } - else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) - { - if (slot >= vertexAttribs.size()) - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); - vertexAttribs[slot].setModified(); - } - else - { - throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); - } - - Geometry::update(); - } - - void TriangleMesh::commit() - { - /* verify that stride of all time steps are identical */ - for (unsigned int t=0; t= numVertices()) return false; - if (triangles[i].v[1] >= numVertices()) return false; - if (triangles[i].v[2] >= numVertices()) return false; - } - - /*! verify vertices */ - for (const auto& buffer : vertices) - for (size_t i=0; iprimID; - float u = args->u; - float v = args->v; - RTCBufferType bufferType = args->bufferType; - unsigned int bufferSlot = args->bufferSlot; - float* P = args->P; - float* dPdu = args->dPdu; - float* dPdv = args->dPdv; - float* ddPdudu = args->ddPdudu; - float* ddPdvdv = args->ddPdvdv; - float* ddPdudv = args->ddPdudv; - unsigned int valueCount = args->valueCount; - - /* calculate base pointer and stride */ - assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || - (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); - const char* src = nullptr; - size_t stride = 0; - if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { - src = vertexAttribs[bufferSlot].getPtr(); - stride = vertexAttribs[bufferSlot].getStride(); - } else { - src = vertices[bufferSlot].getPtr(); - stride = vertices[bufferSlot].getStride(); - } - - for (unsigned int i=0; i& buffer, size_t offset, size_t stride, unsigned int num); - void* getBuffer(RTCBufferType type, unsigned int slot); - void updateBuffer(RTCBufferType type, unsigned int slot); - void commit(); - bool verify(); - void interpolate(const RTCInterpolateArguments* const args); - void addElementsToCount (GeometryCounts & counts) const; - - public: - - /*! returns number of vertices */ - __forceinline size_t numVertices() const { - return vertices[0].size(); - } - - /*! returns i'th triangle*/ - __forceinline const Triangle& triangle(size_t i) const { - return triangles[i]; - } - - /*! returns i'th vertex of the first time step */ - __forceinline const Vec3fa vertex(size_t i) const { - return vertices0[i]; - } - - /*! returns i'th vertex of the first time step */ - __forceinline const char* vertexPtr(size_t i) const { - return vertices0.getPtr(i); - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const Vec3fa vertex(size_t i, size_t itime) const { - return vertices[itime][i]; - } - - /*! returns i'th vertex of itime'th timestep */ - __forceinline const char* vertexPtr(size_t i, size_t itime) const { - return vertices[itime].getPtr(i); - } - - /*! calculates the bounds of the i'th triangle */ - __forceinline BBox3fa bounds(size_t i) const - { - const Triangle& tri = triangle(i); - const Vec3fa v0 = vertex(tri.v[0]); - const Vec3fa v1 = vertex(tri.v[1]); - const Vec3fa v2 = vertex(tri.v[2]); - return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); - } - - /*! calculates the bounds of the i'th triangle at the itime'th timestep */ - __forceinline BBox3fa bounds(size_t i, size_t itime) const - { - const Triangle& tri = triangle(i); - const Vec3fa v0 = vertex(tri.v[0],itime); - const Vec3fa v1 = vertex(tri.v[1],itime); - const Vec3fa v2 = vertex(tri.v[2],itime); - return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); - } - - /*! check if the i'th primitive is valid at the itime'th timestep */ - __forceinline bool valid(size_t i, size_t itime) const { - return valid(i, make_range(itime, itime)); - } - - /*! check if the i'th primitive is valid between the specified time range */ - __forceinline bool valid(size_t i, const range& itime_range) const - { - const Triangle& tri = triangle(i); - if (unlikely(tri.v[0] >= numVertices())) return false; - if (unlikely(tri.v[1] >= numVertices())) return false; - if (unlikely(tri.v[2] >= numVertices())) return false; - - for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) - { - if (!isvalid(vertex(tri.v[0],itime))) return false; - if (!isvalid(vertex(tri.v[1],itime))) return false; - if (!isvalid(vertex(tri.v[2],itime))) return false; - } - - return true; - } - - /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ - __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { - return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); - } - - /*! calculates the build bounds of the i'th primitive, if it's valid */ - __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const - { - const Triangle& tri = triangle(i); - if (unlikely(tri.v[0] >= numVertices())) return false; - if (unlikely(tri.v[1] >= numVertices())) return false; - if (unlikely(tri.v[2] >= numVertices())) return false; - - for (size_t t=0; t= numVertices())) return false; - if (unlikely(tri.v[1] >= numVertices())) return false; - if (unlikely(tri.v[2] >= numVertices())) return false; - - assert(itime+1 < numTimeSteps); - const Vec3fa a0 = vertex(tri.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; - const Vec3fa a1 = vertex(tri.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; - const Vec3fa a2 = vertex(tri.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; - const Vec3fa b0 = vertex(tri.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; - const Vec3fa b1 = vertex(tri.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; - const Vec3fa b2 = vertex(tri.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; - - /* use bounds of first time step in builder */ - bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2)); - return true; - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { - return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); - } - - /*! calculates the linear bounds of the i'th primitive for the specified time range */ - __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const { - if (!valid(i, timeSegmentRange(dt))) return false; - bbox = linearBounds(i, dt); - return true; - } - - /*! get fast access to first vertex buffer */ - __forceinline float * getCompactVertexArray () const { - return (float*) vertices0.getPtr(); - } - - /* gets version info of topology */ - unsigned int getTopologyVersion() const { - return triangles.modCounter; - } - - /* returns true if topology changed */ - bool topologyChanged(unsigned int otherVersion) const { - return triangles.isModified(otherVersion); // || numPrimitivesChanged; - } - - /* returns the projected area */ - __forceinline float projectedPrimitiveArea(const size_t i) const { - const Triangle& tri = triangle(i); - const Vec3fa v0 = vertex(tri.v[0]); - const Vec3fa v1 = vertex(tri.v[1]); - const Vec3fa v2 = vertex(tri.v[2]); - return areaProjectedTriangle(v0,v1,v2); - } - - public: - BufferView triangles; //!< array of triangles - BufferView vertices0; //!< fast access to first vertex buffer - vector> vertices; //!< vertex array for each timestep - vector vertexAttribs; //!< vertex attributes - }; - - namespace isa - { - struct TriangleMeshISA : public TriangleMesh - { - TriangleMeshISA (Device* device) - : TriangleMesh(device) {} - - PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); j& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); j& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const - { - PrimInfoMB pinfo(empty); - for (size_t j=r.begin(); jnumTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); - pinfo.add_primref(prim); - prims[k++] = prim; - } - return pinfo; - } - }; - } - - DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*); -} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h deleted file mode 100644 index 8d11ed6986..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "accelset.h" - -namespace embree -{ - /*! User geometry with user defined intersection functions */ - struct UserGeometry : public AccelSet - { - /*! type of this geometry */ - static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY; - - public: - UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1); - virtual void setMask (unsigned mask); - virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr); - virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect); - virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded); - virtual void build() {} - virtual void addElementsToCount (GeometryCounts & counts) const; - }; - - namespace isa - { - struct UserGeometryISA : public UserGeometry - { - UserGeometryISA (Device* device) - : UserGeometry(device) {} - - PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); j& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const - { - PrimInfo pinfo(empty); - for (size_t j=r.begin(); j& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const - { - PrimInfoMB pinfo(empty); - for (size_t j=r.begin(); jnumTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); - pinfo.add_primref(prim); - prims[k++] = prim; - } - return pinfo; - } - }; - } - - DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*); -} diff --git a/thirdparty/embree-aarch64/kernels/common/stack_item.h b/thirdparty/embree-aarch64/kernels/common/stack_item.h deleted file mode 100644 index 533c385365..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/stack_item.h +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" - -namespace embree -{ - /*! An item on the stack holds the node ID and distance of that node. */ - template - struct __aligned(16) StackItemT - { - /*! assert that the xchg function works */ - static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed"); - - __forceinline StackItemT() {} - - __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {} - - /*! use SSE instructions to swap stack items */ - __forceinline static void xchg(StackItemT& a, StackItemT& b) - { - const vfloat4 sse_a = vfloat4::load((float*)&a); - const vfloat4 sse_b = vfloat4::load((float*)&b); - vfloat4::store(&a,sse_b); - vfloat4::store(&b,sse_a); - } - - /*! Sort 2 stack items. */ - __forceinline friend void sort(StackItemT& s1, StackItemT& s2) { - if (s2.dist < s1.dist) xchg(s2,s1); - } - - /*! Sort 3 stack items. */ - __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3) - { - if (s2.dist < s1.dist) xchg(s2,s1); - if (s3.dist < s2.dist) xchg(s3,s2); - if (s2.dist < s1.dist) xchg(s2,s1); - } - - /*! Sort 4 stack items. */ - __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4) - { - if (s2.dist < s1.dist) xchg(s2,s1); - if (s4.dist < s3.dist) xchg(s4,s3); - if (s3.dist < s1.dist) xchg(s3,s1); - if (s4.dist < s2.dist) xchg(s4,s2); - if (s3.dist < s2.dist) xchg(s3,s2); - } - - /*! use SSE instructions to swap stack items */ - __forceinline static void cmp_xchg(vint4& a, vint4& b) - { -#if defined(__AVX512VL__) - const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a)); -#else - const vboolf4 mask0(b < a); - const vboolf4 mask(shuffle<2,2,2,2>(mask0)); -#endif - const vint4 c = select(mask,b,a); - const vint4 d = select(mask,a,b); - a = c; - b = d; - } - - /*! Sort 3 stack items. */ - __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3) - { - cmp_xchg(s2,s1); - cmp_xchg(s3,s2); - cmp_xchg(s2,s1); - } - - /*! Sort 4 stack items. */ - __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4) - { - cmp_xchg(s2,s1); - cmp_xchg(s4,s3); - cmp_xchg(s3,s1); - cmp_xchg(s4,s2); - cmp_xchg(s3,s2); - } - - - /*! Sort N stack items. */ - __forceinline friend void sort(StackItemT* begin, StackItemT* end) - { - for (StackItemT* i = begin+1; i != end; ++i) - { - const vfloat4 item = vfloat4::load((float*)i); - const unsigned dist = i->dist; - StackItemT* j = i; - - while ((j != begin) && ((j-1)->dist < dist)) - { - vfloat4::store(j, vfloat4::load((float*)(j-1))); - --j; - } - - vfloat4::store(j, item); - } - } - - public: - T ptr; - unsigned dist; - }; - - /*! An item on the stack holds the node ID and active ray mask. */ - template - struct __aligned(8) StackItemMaskT - { - T ptr; - size_t mask; - }; - - struct __aligned(8) StackItemMaskCoherent - { - size_t mask; - size_t parent; - size_t child; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/stat.cpp b/thirdparty/embree-aarch64/kernels/common/stat.cpp deleted file mode 100644 index b73c3a8c76..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/stat.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "stat.h" - -namespace embree -{ - Stat Stat::instance; - - Stat::Stat () { - } - - Stat::~Stat () - { -#ifdef EMBREE_STAT_COUNTERS - Stat::print(std::cout); -#endif - } - - void Stat::print(std::ostream& cout) - { - Counters& cntrs = instance.cntrs; - Counters::Data& data = instance.cntrs.code; - //Counters::Data& data = instance.cntrs.active; - - /* print absolute numbers */ - cout << "--------- ABSOLUTE ---------" << std::endl; - cout << " #normal_travs = " << float(data.normal.travs )*1E-6 << "M" << std::endl; - cout << " #nodes = " << float(data.normal.trav_nodes )*1E-6 << "M" << std::endl; - cout << " #nodes_xfm = " << float(data.normal.trav_xfm_nodes )*1E-6 << "M" << std::endl; - cout << " #leaves = " << float(data.normal.trav_leaves )*1E-6 << "M" << std::endl; - cout << " #prims = " << float(data.normal.trav_prims )*1E-6 << "M" << std::endl; - cout << " #prim_hits = " << float(data.normal.trav_prim_hits )*1E-6 << "M" << std::endl; - - cout << " #stack nodes = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl; - cout << " #stack pop = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl; - - size_t normal_box_hits = 0; - size_t weighted_box_hits = 0; - for (size_t i=0;i travs; - std::atomic trav_nodes; - std::atomic trav_leaves; - std::atomic trav_prims; - std::atomic trav_prim_hits; - std::atomic trav_hit_boxes[SIZE_HISTOGRAM+1]; - std::atomic trav_stack_pop; - std::atomic trav_stack_nodes; - std::atomic trav_xfm_nodes; - - } normal, shadow, point_query; - } all, active, code; - - std::atomic user[10]; - }; - - public: - - static __forceinline Counters& get() { - return instance.cntrs; - } - - static void clear() { - instance.cntrs.clear(); - } - - static void print(embree_ostream cout); - - private: - Counters cntrs; - - private: - static Stat instance; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/state.cpp b/thirdparty/embree-aarch64/kernels/common/state.cpp deleted file mode 100644 index 51fc9b7826..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/state.cpp +++ /dev/null @@ -1,543 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "state.h" -#include "../../common/lexers/streamfilters.h" - -namespace embree -{ - MutexSys g_printMutex; - - State::ErrorHandler State::g_errorHandler; - - State::ErrorHandler::ErrorHandler() - : thread_error(createTls()) {} - - State::ErrorHandler::~ErrorHandler() - { - Lock lock(errors_mutex); - for (size_t i=0; i lock(errors_mutex); - stored_error = new RTCError(RTC_ERROR_NONE); - thread_errors.push_back(stored_error); - setTls(thread_error,stored_error); - return stored_error; - } - - State::State () - : enabled_cpu_features(getCPUFeatures()), - enabled_builder_cpu_features(enabled_cpu_features), - frequency_level(FREQUENCY_SIMD256) - { - tri_accel = "default"; - tri_builder = "default"; - tri_traverser = "default"; - - tri_accel_mb = "default"; - tri_builder_mb = "default"; - tri_traverser_mb = "default"; - - quad_accel = "default"; - quad_builder = "default"; - quad_traverser = "default"; - - quad_accel_mb = "default"; - quad_builder_mb = "default"; - quad_traverser_mb = "default"; - - line_accel = "default"; - line_builder = "default"; - line_traverser = "default"; - - line_accel_mb = "default"; - line_builder_mb = "default"; - line_traverser_mb = "default"; - - hair_accel = "default"; - hair_builder = "default"; - hair_traverser = "default"; - - hair_accel_mb = "default"; - hair_builder_mb = "default"; - hair_traverser_mb = "default"; - - object_accel = "default"; - object_builder = "default"; - object_accel_min_leaf_size = 1; - object_accel_max_leaf_size = 1; - - object_accel_mb = "default"; - object_builder_mb = "default"; - object_accel_mb_min_leaf_size = 1; - object_accel_mb_max_leaf_size = 1; - - max_spatial_split_replications = 1.2f; - useSpatialPreSplits = false; - - tessellation_cache_size = 128*1024*1024; - - subdiv_accel = "default"; - subdiv_accel_mb = "default"; - - grid_accel = "default"; - grid_builder = "default"; - grid_accel_mb = "default"; - grid_builder_mb = "default"; - - instancing_open_min = 0; - instancing_block_size = 0; - instancing_open_factor = 8.0f; - instancing_open_max_depth = 32; - instancing_open_max = 50000000; - - ignore_config_files = false; - float_exceptions = false; - quality_flags = -1; - scene_flags = -1; - verbose = 0; - benchmark = 0; - - numThreads = 0; - numUserThreads = 0; - -#if TASKING_INTERNAL - set_affinity = true; -#else - set_affinity = false; -#endif - /* per default enable affinity on KNL */ - if (hasISA(AVX512KNL)) set_affinity = true; - - start_threads = false; - enable_selockmemoryprivilege = false; -#if defined(__LINUX__) - hugepages = true; -#else - hugepages = false; -#endif - hugepages_success = true; - - alloc_main_block_size = 0; - alloc_num_main_slots = 0; - alloc_thread_block_size = 0; - alloc_single_thread_alloc = -1; - - error_function = nullptr; - error_function_userptr = nullptr; - - memory_monitor_function = nullptr; - memory_monitor_userptr = nullptr; - } - - State::~State() { - } - - bool State::hasISA(const int isa) { - return (enabled_cpu_features & isa) == isa; - } - - bool State::checkISASupport() { -#if defined(__ARM_NEON) - /* - * NEON CPU type is a mixture of NEON and SSE2 - */ - - bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2; - - /* this will be true when explicitly initialize Device with `isa=neon` config */ - bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON; - - return hasSSE2 || hasNEON; -#else - return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; -#endif - } - - void State::verify() - { - /* verify that calculations stay in range */ - assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX); - - /* here we verify that CPP files compiled for a specific ISA only - * call that same or lower ISA version of non-inlined class member - * functions */ -#if defined(DEBUG) -#if defined(EMBREE_TARGET_SSE2) -#if !defined(__ARM_NEON) - assert(sse2::getISA() <= SSE2); -#endif -#endif -#if defined(EMBREE_TARGET_SSE42) - assert(sse42::getISA() <= SSE42); -#endif -#if defined(EMBREE_TARGET_AVX) - assert(avx::getISA() <= AVX); -#endif -#if defined(EMBREE_TARGET_AVX2) - assert(avx2::getISA() <= AVX2); -#endif -#if defined (EMBREE_TARGET_AVX512KNL) - assert(avx512knl::getISA() <= AVX512KNL); -#endif -#if defined (EMBREE_TARGET_AVX512SKX) - assert(avx512skx::getISA() <= AVX512SKX); -#endif -#endif - } - - const char* symbols[3] = { "=", ",", "|" }; - - bool State::parseFile(const FileName& fileName) - { - FILE* f = fopen(fileName.c_str(),"r"); - if (!f) return false; - Ref > file = new FileStream(f,fileName); - - std::vector syms; - for (size_t i=0; i cin = new TokenStream(new LineCommentFilter(file,"#"), - TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", - TokenStream::separators,syms); - parse(cin); - return true; - } - - void State::parseString(const char* cfg) - { - if (cfg == nullptr) return; - - std::vector syms; - for (size_t i=0; i cin = new TokenStream(new StrStream(cfg), - TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", - TokenStream::separators,syms); - parse(cin); - } - - int string_to_cpufeatures(const std::string& isa) - { - if (isa == "sse" ) return SSE; - else if (isa == "sse2") return SSE2; - else if (isa == "sse3") return SSE3; - else if (isa == "ssse3") return SSSE3; - else if (isa == "sse41") return SSE41; - else if (isa == "sse4.1") return SSE41; - else if (isa == "sse42") return SSE42; - else if (isa == "sse4.2") return SSE42; - else if (isa == "avx") return AVX; - else if (isa == "avxi") return AVXI; - else if (isa == "avx2") return AVX2; - else if (isa == "avx512knl") return AVX512KNL; - else if (isa == "avx512skx") return AVX512SKX; - else return SSE2; - } - - void State::parse(Ref cin) - { - /* parse until end of stream */ - while (cin->peek() != Token::Eof()) - { - const Token tok = cin->get(); - - if (tok == Token::Id("threads") && cin->trySymbol("=")) - numThreads = cin->get().Int(); - - else if (tok == Token::Id("user_threads")&& cin->trySymbol("=")) - numUserThreads = cin->get().Int(); - - else if (tok == Token::Id("set_affinity")&& cin->trySymbol("=")) - set_affinity = cin->get().Int(); - - else if (tok == Token::Id("affinity")&& cin->trySymbol("=")) - set_affinity = cin->get().Int(); - - else if (tok == Token::Id("start_threads")&& cin->trySymbol("=")) - start_threads = cin->get().Int(); - - else if (tok == Token::Id("isa") && cin->trySymbol("=")) { - std::string isa = toLowerCase(cin->get().Identifier()); - enabled_cpu_features = string_to_cpufeatures(isa); - enabled_builder_cpu_features = enabled_cpu_features; - } - - else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) { - std::string isa = toLowerCase(cin->get().Identifier()); - enabled_cpu_features &= string_to_cpufeatures(isa); - enabled_builder_cpu_features &= enabled_cpu_features; - } - - else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) { - std::string isa = toLowerCase(cin->get().Identifier()); - enabled_builder_cpu_features &= string_to_cpufeatures(isa); - } - - else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) { - std::string freq = cin->get().Identifier(); - if (freq == "simd128") frequency_level = FREQUENCY_SIMD128; - else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256; - else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512; - } - - else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) { - enable_selockmemoryprivilege = cin->get().Int(); - } - else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) { - hugepages = cin->get().Int(); - } - - else if (tok == Token::Id("ignore_config_files") && cin->trySymbol("=")) - ignore_config_files = cin->get().Int(); - else if (tok == Token::Id("float_exceptions") && cin->trySymbol("=")) - float_exceptions = cin->get().Int(); - - else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("=")) - tri_accel = cin->get().Identifier(); - else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("=")) - tri_builder = cin->get().Identifier(); - else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("=")) - tri_traverser = cin->get().Identifier(); - - else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("=")) - tri_accel_mb = cin->get().Identifier(); - else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("=")) - tri_builder_mb = cin->get().Identifier(); - else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("=")) - tri_traverser_mb = cin->get().Identifier(); - - else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("=")) - quad_accel = cin->get().Identifier(); - else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("=")) - quad_builder = cin->get().Identifier(); - else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("=")) - quad_traverser = cin->get().Identifier(); - - else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("=")) - quad_accel_mb = cin->get().Identifier(); - else if ((tok == Token::Id("quad_builder_mb")) && cin->trySymbol("=")) - quad_builder_mb = cin->get().Identifier(); - else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("=")) - quad_traverser_mb = cin->get().Identifier(); - - else if ((tok == Token::Id("line_accel")) && cin->trySymbol("=")) - line_accel = cin->get().Identifier(); - else if ((tok == Token::Id("line_builder")) && cin->trySymbol("=")) - line_builder = cin->get().Identifier(); - else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("=")) - line_traverser = cin->get().Identifier(); - - else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("=")) - line_accel_mb = cin->get().Identifier(); - else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("=")) - line_builder_mb = cin->get().Identifier(); - else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("=")) - line_traverser_mb = cin->get().Identifier(); - - else if (tok == Token::Id("hair_accel") && cin->trySymbol("=")) - hair_accel = cin->get().Identifier(); - else if (tok == Token::Id("hair_builder") && cin->trySymbol("=")) - hair_builder = cin->get().Identifier(); - else if (tok == Token::Id("hair_traverser") && cin->trySymbol("=")) - hair_traverser = cin->get().Identifier(); - - else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("=")) - hair_accel_mb = cin->get().Identifier(); - else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("=")) - hair_builder_mb = cin->get().Identifier(); - else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("=")) - hair_traverser_mb = cin->get().Identifier(); - - else if (tok == Token::Id("object_accel") && cin->trySymbol("=")) - object_accel = cin->get().Identifier(); - else if (tok == Token::Id("object_builder") && cin->trySymbol("=")) - object_builder = cin->get().Identifier(); - else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("=")) - object_accel_min_leaf_size = cin->get().Int(); - else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("=")) - object_accel_max_leaf_size = cin->get().Int(); - - else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("=")) - object_accel_mb = cin->get().Identifier(); - else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("=")) - object_builder_mb = cin->get().Identifier(); - else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("=")) - object_accel_mb_min_leaf_size = cin->get().Int(); - else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("=")) - object_accel_mb_max_leaf_size = cin->get().Int(); - - else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("=")) - instancing_open_min = cin->get().Int(); - else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) { - instancing_block_size = cin->get().Int(); - instancing_open_factor = 0.0f; - } - else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("=")) - instancing_open_max_depth = cin->get().Int(); - else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) { - instancing_block_size = 0; - instancing_open_factor = cin->get().Float(); - } - else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("=")) - instancing_open_max = cin->get().Int(); - - else if (tok == Token::Id("subdiv_accel") && cin->trySymbol("=")) - subdiv_accel = cin->get().Identifier(); - else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("=")) - subdiv_accel_mb = cin->get().Identifier(); - - else if (tok == Token::Id("grid_accel") && cin->trySymbol("=")) - grid_accel = cin->get().Identifier(); - else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("=")) - grid_accel_mb = cin->get().Identifier(); - - else if (tok == Token::Id("verbose") && cin->trySymbol("=")) - verbose = cin->get().Int(); - else if (tok == Token::Id("benchmark") && cin->trySymbol("=")) - benchmark = cin->get().Int(); - - else if (tok == Token::Id("quality")) { - if (cin->trySymbol("=")) { - Token flag = cin->get(); - if (flag == Token::Id("low")) quality_flags = RTC_BUILD_QUALITY_LOW; - else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM; - else if (flag == Token::Id("high")) quality_flags = RTC_BUILD_QUALITY_HIGH; - } - } - - else if (tok == Token::Id("scene_flags")) { - scene_flags = 0; - if (cin->trySymbol("=")) { - do { - Token flag = cin->get(); - if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC; - else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT; - else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST; - } while (cin->trySymbol("|")); - } - } - - else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("=")) - max_spatial_split_replications = cin->get().Float(); - - else if (tok == Token::Id("presplits") && cin->trySymbol("=")) - useSpatialPreSplits = cin->get().Int() != 0 ? true : false; - - else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("=")) - tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); - else if (tok == Token::Id("cache_size") && cin->trySymbol("=")) - tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); - - else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("=")) - alloc_main_block_size = cin->get().Int(); - else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("=")) - alloc_num_main_slots = cin->get().Int(); - else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("=")) - alloc_thread_block_size = cin->get().Int(); - else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("=")) - alloc_single_thread_alloc = cin->get().Int(); - - cin->trySymbol(","); // optional , separator - } - } - - bool State::verbosity(size_t N) { - return N <= verbose; - } - - void State::print() - { - std::cout << "general:" << std::endl; - std::cout << " build threads = " << numThreads << std::endl; - std::cout << " build user threads = " << numUserThreads << std::endl; - std::cout << " start_threads = " << start_threads << std::endl; - std::cout << " affinity = " << set_affinity << std::endl; - std::cout << " frequency_level = "; - switch (frequency_level) { - case FREQUENCY_SIMD128: std::cout << "simd128" << std::endl; break; - case FREQUENCY_SIMD256: std::cout << "simd256" << std::endl; break; - case FREQUENCY_SIMD512: std::cout << "simd512" << std::endl; break; - default: std::cout << "error" << std::endl; break; - } - - std::cout << " hugepages = "; - if (!hugepages) std::cout << "disabled" << std::endl; - else if (hugepages_success) std::cout << "enabled" << std::endl; - else std::cout << "failed" << std::endl; - - std::cout << " verbosity = " << verbose << std::endl; - std::cout << " cache_size = " << float(tessellation_cache_size)*1E-6 << " MB" << std::endl; - std::cout << " max_spatial_split_replications = " << max_spatial_split_replications << std::endl; - - std::cout << "triangles:" << std::endl; - std::cout << " accel = " << tri_accel << std::endl; - std::cout << " builder = " << tri_builder << std::endl; - std::cout << " traverser = " << tri_traverser << std::endl; - - std::cout << "motion blur triangles:" << std::endl; - std::cout << " accel = " << tri_accel_mb << std::endl; - std::cout << " builder = " << tri_builder_mb << std::endl; - std::cout << " traverser = " << tri_traverser_mb << std::endl; - - std::cout << "quads:" << std::endl; - std::cout << " accel = " << quad_accel << std::endl; - std::cout << " builder = " << quad_builder << std::endl; - std::cout << " traverser = " << quad_traverser << std::endl; - - std::cout << "motion blur quads:" << std::endl; - std::cout << " accel = " << quad_accel_mb << std::endl; - std::cout << " builder = " << quad_builder_mb << std::endl; - std::cout << " traverser = " << quad_traverser_mb << std::endl; - - std::cout << "line segments:" << std::endl; - std::cout << " accel = " << line_accel << std::endl; - std::cout << " builder = " << line_builder << std::endl; - std::cout << " traverser = " << line_traverser << std::endl; - - std::cout << "motion blur line segments:" << std::endl; - std::cout << " accel = " << line_accel_mb << std::endl; - std::cout << " builder = " << line_builder_mb << std::endl; - std::cout << " traverser = " << line_traverser_mb << std::endl; - - std::cout << "hair:" << std::endl; - std::cout << " accel = " << hair_accel << std::endl; - std::cout << " builder = " << hair_builder << std::endl; - std::cout << " traverser = " << hair_traverser << std::endl; - - std::cout << "motion blur hair:" << std::endl; - std::cout << " accel = " << hair_accel_mb << std::endl; - std::cout << " builder = " << hair_builder_mb << std::endl; - std::cout << " traverser = " << hair_traverser_mb << std::endl; - - std::cout << "subdivision surfaces:" << std::endl; - std::cout << " accel = " << subdiv_accel << std::endl; - - std::cout << "grids:" << std::endl; - std::cout << " accel = " << grid_accel << std::endl; - std::cout << " builder = " << grid_builder << std::endl; - - std::cout << "motion blur grids:" << std::endl; - std::cout << " accel = " << grid_accel_mb << std::endl; - std::cout << " builder = " << grid_builder_mb << std::endl; - - std::cout << "object_accel:" << std::endl; - std::cout << " min_leaf_size = " << object_accel_min_leaf_size << std::endl; - std::cout << " max_leaf_size = " << object_accel_max_leaf_size << std::endl; - - std::cout << "object_accel_mb:" << std::endl; - std::cout << " min_leaf_size = " << object_accel_mb_min_leaf_size << std::endl; - std::cout << " max_leaf_size = " << object_accel_mb_max_leaf_size << std::endl; - } -} diff --git a/thirdparty/embree-aarch64/kernels/common/state.h b/thirdparty/embree-aarch64/kernels/common/state.h deleted file mode 100644 index d0fccc023f..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/state.h +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "default.h" - -namespace embree -{ - /* mutex to make printing to cout thread safe */ - extern MutexSys g_printMutex; - - struct State : public RefCount - { - public: - /*! state construction */ - State (); - - /*! state destruction */ - ~State(); - - /*! verifies that state is correct */ - void verify(); - - /*! parses state from a configuration file */ - bool parseFile(const FileName& fileName); - - /*! parses the state from a string */ - void parseString(const char* cfg); - - /*! parses the state from a stream */ - void parse(Ref cin); - - /*! prints the state */ - void print(); - - /*! checks if verbosity level is at least N */ - bool verbosity(size_t N); - - /*! checks if some particular ISA is enabled */ - bool hasISA(const int isa); - - /*! check whether selected ISA is supported by the HW */ - bool checkISASupport(); - - public: - std::string tri_accel; //!< acceleration structure to use for triangles - std::string tri_builder; //!< builder to use for triangles - std::string tri_traverser; //!< traverser to use for triangles - - public: - std::string tri_accel_mb; //!< acceleration structure to use for motion blur triangles - std::string tri_builder_mb; //!< builder to use for motion blur triangles - std::string tri_traverser_mb; //!< traverser to use for triangles - - public: - std::string quad_accel; //!< acceleration structure to use for quads - std::string quad_builder; //!< builder to use for quads - std::string quad_traverser; //!< traverser to use for quads - - public: - std::string quad_accel_mb; //!< acceleration structure to use for motion blur quads - std::string quad_builder_mb; //!< builder to use for motion blur quads - std::string quad_traverser_mb; //!< traverser to use for motion blur quads - - public: - std::string line_accel; //!< acceleration structure to use for line segments - std::string line_builder; //!< builder to use for line segments - std::string line_traverser; //!< traverser to use for line segments - - public: - std::string line_accel_mb; //!< acceleration structure to use for motion blur line segments - std::string line_builder_mb; //!< builder to use for motion blur line segments - std::string line_traverser_mb; //!< traverser to use for motion blur line segments - - public: - std::string hair_accel; //!< hair acceleration structure to use - std::string hair_builder; //!< builder to use for hair - std::string hair_traverser; //!< traverser to use for hair - - public: - std::string hair_accel_mb; //!< acceleration structure to use for motion blur hair - std::string hair_builder_mb; //!< builder to use for motion blur hair - std::string hair_traverser_mb; //!< traverser to use for motion blur hair - - public: - std::string object_accel; //!< acceleration structure for user geometries - std::string object_builder; //!< builder for user geometries - int object_accel_min_leaf_size; //!< minimum leaf size for object acceleration structure - int object_accel_max_leaf_size; //!< maximum leaf size for object acceleration structure - - public: - std::string object_accel_mb; //!< acceleration structure for user geometries - std::string object_builder_mb; //!< builder for user geometries - int object_accel_mb_min_leaf_size; //!< minimum leaf size for mblur object acceleration structure - int object_accel_mb_max_leaf_size; //!< maximum leaf size for mblur object acceleration structure - - public: - std::string subdiv_accel; //!< acceleration structure to use for subdivision surfaces - std::string subdiv_accel_mb; //!< acceleration structure to use for subdivision surfaces - - public: - std::string grid_accel; //!< acceleration structure to use for grids - std::string grid_builder; //!< builder for grids - std::string grid_accel_mb; //!< acceleration structure to use for motion blur grids - std::string grid_builder_mb; //!< builder for motion blur grids - - public: - float max_spatial_split_replications; //!< maximally replications*N many primitives in accel for spatial splits - bool useSpatialPreSplits; //!< use spatial pre-splits instead of the full spatial split builder - size_t tessellation_cache_size; //!< size of the shared tessellation cache - - public: - size_t instancing_open_min; //!< instancing opens tree to minimally that number of subtrees - size_t instancing_block_size; //!< instancing opens tree up to average block size of primitives - float instancing_open_factor; //!< instancing opens tree up to x times the number of instances - size_t instancing_open_max_depth; //!< maximum open depth for geometries - size_t instancing_open_max; //!< instancing opens tree to maximally that number of subtrees - - public: - bool ignore_config_files; //!< if true no more config files get parse - bool float_exceptions; //!< enable floating point exceptions - int quality_flags; - int scene_flags; - size_t verbose; //!< verbosity of output - size_t benchmark; //!< true - - public: - size_t numThreads; //!< number of threads to use in builders - size_t numUserThreads; //!< number of user provided threads to use in builders - bool set_affinity; //!< sets affinity for worker threads - bool start_threads; //!< true when threads should be started at device creation time - int enabled_cpu_features; //!< CPU ISA features to use - int enabled_builder_cpu_features; //!< CPU ISA features to use for builders only - enum FREQUENCY_LEVEL { - FREQUENCY_SIMD128, - FREQUENCY_SIMD256, - FREQUENCY_SIMD512 - } frequency_level; //!< frequency level the app wants to run on (default is SIMD256) - bool enable_selockmemoryprivilege; //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages - bool hugepages; //!< true if huge pages should get used - bool hugepages_success; //!< status for enabling huge pages - - public: - size_t alloc_main_block_size; //!< main allocation block size (shared between threads) - int alloc_num_main_slots; //!< number of such shared blocks to be used to allocate - size_t alloc_thread_block_size; //!< size of thread local allocator block size - int alloc_single_thread_alloc; //!< in single mode nodes and leaves use same thread local allocator - - public: - - /*! checks if we can use AVX */ - bool canUseAVX() { - return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128; - } - - /*! checks if we can use AVX2 */ - bool canUseAVX2() { - return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128; - } - - struct ErrorHandler - { - public: - ErrorHandler(); - ~ErrorHandler(); - RTCError* error(); - - public: - tls_t thread_error; - std::vector thread_errors; - MutexSys errors_mutex; - }; - ErrorHandler errorHandler; - static ErrorHandler g_errorHandler; - - public: - void setErrorFunction(RTCErrorFunction fptr, void* uptr) - { - error_function = fptr; - error_function_userptr = uptr; - } - - RTCErrorFunction error_function; - void* error_function_userptr; - - public: - void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr) - { - memory_monitor_function = fptr; - memory_monitor_userptr = uptr; - } - - RTCMemoryMonitorFunction memory_monitor_function; - void* memory_monitor_userptr; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/common/vector.h b/thirdparty/embree-aarch64/kernels/common/vector.h deleted file mode 100644 index b478762240..0000000000 --- a/thirdparty/embree-aarch64/kernels/common/vector.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "default.h" - -namespace embree -{ - /*! invokes the memory monitor callback */ - struct MemoryMonitorInterface { - virtual void memoryMonitor(ssize_t bytes, bool post) = 0; - }; - - /*! allocator that performs aligned monitored allocations */ - template - struct aligned_monitored_allocator - { - typedef T value_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device) - : device(device), hugepages(false) {} - - __forceinline pointer allocate( size_type n ) - { - if (n) { - assert(device); - device->memoryMonitor(n*sizeof(T),false); - } - if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) - { - pointer p = (pointer) os_malloc(n*sizeof(value_type),hugepages); - assert(p); - return p; - } - return (pointer) alignedMalloc(n*sizeof(value_type),alignment); - } - - __forceinline void deallocate( pointer p, size_type n ) - { - if (p) - { - if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) - os_free(p,n*sizeof(value_type),hugepages); - else - alignedFree(p); - } - else assert(n == 0); - - if (n) { - assert(device); - device->memoryMonitor(-ssize_t(n)*sizeof(T),true); - } - } - - __forceinline void construct( pointer p, const_reference val ) { - new (p) T(val); - } - - __forceinline void destroy( pointer p ) { - p->~T(); - } - - private: - MemoryMonitorInterface* device; - bool hugepages; - }; - - /*! monitored vector */ - template - using mvector = vector_t::value> >; -} diff --git a/thirdparty/embree-aarch64/kernels/config.h b/thirdparty/embree-aarch64/kernels/config.h deleted file mode 100644 index 80a8ab2a56..0000000000 --- a/thirdparty/embree-aarch64/kernels/config.h +++ /dev/null @@ -1,76 +0,0 @@ - -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -/* #undef EMBREE_RAY_MASK */ -/* #undef EMBREE_STAT_COUNTERS */ -/* #undef EMBREE_BACKFACE_CULLING */ -/* #undef EMBREE_BACKFACE_CULLING_CURVES */ -#define EMBREE_FILTER_FUNCTION -/* #undef EMBREE_IGNORE_INVALID_RAYS */ -#define EMBREE_GEOMETRY_TRIANGLE -/* #undef EMBREE_GEOMETRY_QUAD */ -/* #undef EMBREE_GEOMETRY_CURVE */ -/* #undef EMBREE_GEOMETRY_SUBDIVISION */ -/* #undef EMBREE_GEOMETRY_USER */ -/* #undef EMBREE_GEOMETRY_INSTANCE */ -/* #undef EMBREE_GEOMETRY_GRID */ -/* #undef EMBREE_GEOMETRY_POINT */ -/* #undef EMBREE_RAY_PACKETS */ -/* #undef EMBREE_COMPACT_POLYS */ - -#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 - -#if defined(EMBREE_GEOMETRY_TRIANGLE) - #define IF_ENABLED_TRIS(x) x -#else - #define IF_ENABLED_TRIS(x) -#endif - -#if defined(EMBREE_GEOMETRY_QUAD) - #define IF_ENABLED_QUADS(x) x -#else - #define IF_ENABLED_QUADS(x) -#endif - -#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) - #define IF_ENABLED_CURVES_OR_POINTS(x) x -#else - #define IF_ENABLED_CURVES_OR_POINTS(x) -#endif - -#if defined(EMBREE_GEOMETRY_CURVE) - #define IF_ENABLED_CURVES(x) x -#else - #define IF_ENABLED_CURVES(x) -#endif - -#if defined(EMBREE_GEOMETRY_POINT) - #define IF_ENABLED_POINTS(x) x -#else - #define IF_ENABLED_POINTS(x) -#endif - -#if defined(EMBREE_GEOMETRY_SUBDIVISION) - #define IF_ENABLED_SUBDIV(x) x -#else - #define IF_ENABLED_SUBDIV(x) -#endif - -#if defined(EMBREE_GEOMETRY_USER) - #define IF_ENABLED_USER(x) x -#else - #define IF_ENABLED_USER(x) -#endif - -#if defined(EMBREE_GEOMETRY_INSTANCE) - #define IF_ENABLED_INSTANCE(x) x -#else - #define IF_ENABLED_INSTANCE(x) -#endif - -#if defined(EMBREE_GEOMETRY_GRID) - #define IF_ENABLED_GRIDS(x) x -#else - #define IF_ENABLED_GRIDS(x) -#endif diff --git a/thirdparty/embree-aarch64/kernels/geometry/cone.h b/thirdparty/embree-aarch64/kernels/geometry/cone.h deleted file mode 100644 index 961ef86160..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/cone.h +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" - -namespace embree -{ - namespace isa - { - struct Cone - { - const Vec3fa p0; //!< start position of cone - const Vec3fa p1; //!< end position of cone - const float r0; //!< start radius of cone - const float r1; //!< end radius of cone - - __forceinline Cone(const Vec3fa& p0, const float r0, const Vec3fa& p1, const float r1) - : p0(p0), p1(p1), r0(r0), r1(r1) {} - - __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, - BBox1f& t_o, - float& u0_o, Vec3fa& Ng0_o, - float& u1_o, Vec3fa& Ng1_o) const - { - /* calculate quadratic equation to solve */ - const Vec3fa v0 = p0-org; - const Vec3fa v1 = p1-org; - - const float rl = rcp_length(v1-v0); - const Vec3fa P0 = v0, dP = (v1-v0)*rl; - const float dr = (r1-r0)*rl; - const Vec3fa O = -P0, dO = dir; - - const float dOdO = dot(dO,dO); - const float OdO = dot(dO,O); - const float OO = dot(O,O); - const float dOz = dot(dP,dO); - const float Oz = dot(dP,O); - - const float R = r0 + Oz*dr; - const float A = dOdO - sqr(dOz) * (1.0f+sqr(dr)); - const float B = 2.0f * (OdO - dOz*(Oz + R*dr)); - const float C = OO - (sqr(Oz) + sqr(R)); - - /* we miss the cone if determinant is smaller than zero */ - const float D = B*B - 4.0f*A*C; - if (D < 0.0f) return false; - - /* special case for rays that are "parallel" to the cone */ - const float eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); - if (unlikely(abs(A) < eps)) - { - /* cylinder case */ - if (abs(dr) < 16.0f*float(ulp)) { - if (C <= 0.0f) { t_o = BBox1f(neg_inf,pos_inf); return true; } - else { t_o = BBox1f(pos_inf,neg_inf); return false; } - } - - /* cone case */ - else - { - /* if we hit the negative cone there cannot be a hit */ - const float t = -C/B; - const float z0 = Oz+t*dOz; - const float z0r = r0+z0*dr; - if (z0r < 0.0f) return false; - - /* test if we start inside or outside the cone */ - if (dOz*dr > 0.0f) t_o = BBox1f(t,pos_inf); - else t_o = BBox1f(neg_inf,t); - } - } - - /* standard case for "non-parallel" rays */ - else - { - const float Q = sqrt(D); - const float rcp_2A = rcp(2.0f*A); - t_o.lower = (-B-Q)*rcp_2A; - t_o.upper = (-B+Q)*rcp_2A; - - /* standard case where both hits are on same cone */ - if (likely(A > 0.0f)) { - const float z0 = Oz+t_o.lower*dOz; - const float z0r = r0+z0*dr; - if (z0r < 0.0f) return false; - } - - /* special case where the hits are on the positive and negative cone */ - else - { - /* depending on the ray direction and the open direction - * of the cone we have a hit from inside or outside the - * cone */ - if (dOz*dr > 0) t_o.upper = pos_inf; - else t_o.lower = neg_inf; - } - } - - /* calculates u and Ng for near hit */ - { - u0_o = (Oz+t_o.lower*dOz)*rl; - const Vec3fa Pr = t_o.lower*dir; - const Vec3fa Pl = v0 + u0_o*(v1-v0); - const Vec3fa R = normalize(Pr-Pl); - const Vec3fa U = (p1-p0)+(r1-r0)*R; - const Vec3fa V = cross(p1-p0,R); - Ng0_o = cross(V,U); - } - - /* calculates u and Ng for far hit */ - { - u1_o = (Oz+t_o.upper*dOz)*rl; - const Vec3fa Pr = t_o.upper*dir; - const Vec3fa Pl = v0 + u1_o*(v1-v0); - const Vec3fa R = normalize(Pr-Pl); - const Vec3fa U = (p1-p0)+(r1-r0)*R; - const Vec3fa V = cross(p1-p0,R); - Ng1_o = cross(V,U); - } - return true; - } - - __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, BBox1f& t_o) const - { - float u0_o; Vec3fa Ng0_o; float u1_o; Vec3fa Ng1_o; - return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); - } - - static bool verify(const size_t id, const Cone& cone, const Ray& ray, bool shouldhit, const float t0, const float t1) - { - float eps = 0.001f; - BBox1f t; bool hit; - hit = cone.intersect(ray.org,ray.dir,t); - - bool failed = hit != shouldhit; - if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : (t0 == -1E6) ? t.lower > -1E6f : abs(t0-t.lower) > eps; - if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : (t1 == +1E6) ? t.upper < +1E6f : abs(t1-t.upper) > eps; - if (!failed) return true; - embree_cout << "Cone test " << id << " failed: cone = " << cone << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; - return false; - } - - /* verify cone class */ - static bool verify() - { - bool passed = true; - const Cone cone0(Vec3fa(0.0f,0.0f,0.0f),0.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f); - passed &= verify(0,cone0,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,3.0f,pos_inf); - passed &= verify(1,cone0,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f); - passed &= verify(2,cone0,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),false,0.0f,0.0f); - passed &= verify(3,cone0,Ray(Vec3fa(+1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,3.0f); - passed &= verify(4,cone0,Ray(Vec3fa(-1.0f,0.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,1.0f,pos_inf); - passed &= verify(5,cone0,Ray(Vec3fa(+1.0f,0.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f); - passed &= verify(6,cone0,Ray(Vec3fa(+0.0f,0.0f,1.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,1.0f); - passed &= verify(7,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); - passed &= verify(8,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(+1.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.5f,+1E6); - passed &= verify(9,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,+1.0f,+0.0f),0.0f,float(inf)),true,-1E6,-0.5f); - const Cone cone1(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),0.0f); - passed &= verify(10,cone1,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,2.0f); - passed &= verify(11,cone1,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,0.0f,4.0f); - const Cone cylinder(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f); - passed &= verify(12,cylinder,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); - passed &= verify(13,cylinder,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); - passed &= verify(14,cylinder,Ray(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); - passed &= verify(15,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); - passed &= verify(16,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); - passed &= verify(17,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); - passed &= verify(18,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); - return passed; - } - - /*! output operator */ - friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cone& c) { - return cout << "Cone { p0 = " << c.p0 << ", r0 = " << c.r0 << ", p1 = " << c.p1 << ", r1 = " << c.r1 << "}"; - } - }; - - template - struct ConeN - { - typedef Vec3> Vec3vfN; - - const Vec3vfN p0; //!< start position of cone - const Vec3vfN p1; //!< end position of cone - const vfloat r0; //!< start radius of cone - const vfloat r1; //!< end radius of cone - - __forceinline ConeN(const Vec3vfN& p0, const vfloat& r0, const Vec3vfN& p1, const vfloat& r1) - : p0(p0), p1(p1), r0(r0), r1(r1) {} - - __forceinline Cone operator[] (const size_t i) const - { - assert(i intersect(const Vec3fa& org, const Vec3fa& dir, - BBox>& t_o, - vfloat& u0_o, Vec3vfN& Ng0_o, - vfloat& u1_o, Vec3vfN& Ng1_o) const - { - /* calculate quadratic equation to solve */ - const Vec3vfN v0 = p0-Vec3vfN(org); - const Vec3vfN v1 = p1-Vec3vfN(org); - - const vfloat rl = rcp_length(v1-v0); - const Vec3vfN P0 = v0, dP = (v1-v0)*rl; - const vfloat dr = (r1-r0)*rl; - const Vec3vfN O = -P0, dO = dir; - - const vfloat dOdO = dot(dO,dO); - const vfloat OdO = dot(dO,O); - const vfloat OO = dot(O,O); - const vfloat dOz = dot(dP,dO); - const vfloat Oz = dot(dP,O); - - const vfloat R = r0 + Oz*dr; - const vfloat A = dOdO - sqr(dOz) * (vfloat(1.0f)+sqr(dr)); - const vfloat B = 2.0f * (OdO - dOz*(Oz + R*dr)); - const vfloat C = OO - (sqr(Oz) + sqr(R)); - - /* we miss the cone if determinant is smaller than zero */ - const vfloat D = B*B - 4.0f*A*C; - vbool valid = D >= 0.0f; - if (none(valid)) return valid; - - /* special case for rays that are "parallel" to the cone */ - const vfloat eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); - const vbool validt = valid & (abs(A) < eps); - const vbool validf = valid & !(abs(A) < eps); - if (unlikely(any(validt))) - { - const vboolx validtt = validt & (abs(dr) < 16.0f*float(ulp)); - const vboolx validtf = validt & (abs(dr) >= 16.0f*float(ulp)); - - /* cylinder case */ - if (unlikely(any(validtt))) - { - t_o.lower = select(validtt, select(C <= 0.0f, vfloat(neg_inf), vfloat(pos_inf)), t_o.lower); - t_o.upper = select(validtt, select(C <= 0.0f, vfloat(pos_inf), vfloat(neg_inf)), t_o.upper); - valid &= !validtt | C <= 0.0f; - } - - /* cone case */ - if (any(validtf)) - { - /* if we hit the negative cone there cannot be a hit */ - const vfloat t = -C/B; - const vfloat z0 = Oz+t*dOz; - const vfloat z0r = r0+z0*dr; - valid &= !validtf | z0r >= 0.0f; - - /* test if we start inside or outside the cone */ - t_o.lower = select(validtf, select(dOz*dr > 0.0f, t, vfloat(neg_inf)), t_o.lower); - t_o.upper = select(validtf, select(dOz*dr > 0.0f, vfloat(pos_inf), t), t_o.upper); - } - } - - /* standard case for "non-parallel" rays */ - if (likely(any(validf))) - { - const vfloat Q = sqrt(D); - const vfloat rcp_2A = 0.5f*rcp(A); - t_o.lower = select(validf, (-B-Q)*rcp_2A, t_o.lower); - t_o.upper = select(validf, (-B+Q)*rcp_2A, t_o.upper); - - /* standard case where both hits are on same cone */ - const vbool validft = validf & A>0.0f; - const vbool validff = validf & !(A>0.0f); - if (any(validft)) { - const vfloat z0 = Oz+t_o.lower*dOz; - const vfloat z0r = r0+z0*dr; - valid &= !validft | z0r >= 0.0f; - } - - /* special case where the hits are on the positive and negative cone */ - if (any(validff)) { - /* depending on the ray direction and the open direction - * of the cone we have a hit from inside or outside the - * cone */ - t_o.lower = select(validff, select(dOz*dr > 0.0f, t_o.lower, float(neg_inf)), t_o.lower); - t_o.upper = select(validff, select(dOz*dr > 0.0f, float(pos_inf), t_o.upper), t_o.upper); - } - } - - /* calculates u and Ng for near hit */ - { - u0_o = (Oz+t_o.lower*dOz)*rl; - const Vec3vfN Pr = t_o.lower*Vec3vfN(dir); - const Vec3vfN Pl = v0 + u0_o*(v1-v0); - const Vec3vfN R = normalize(Pr-Pl); - const Vec3vfN U = (p1-p0)+(r1-r0)*R; - const Vec3vfN V = cross(p1-p0,R); - Ng0_o = cross(V,U); - } - - /* calculates u and Ng for far hit */ - { - u1_o = (Oz+t_o.upper*dOz)*rl; - const Vec3vfN Pr = t_o.lower*Vec3vfN(dir); - const Vec3vfN Pl = v0 + u1_o*(v1-v0); - const Vec3vfN R = normalize(Pr-Pl); - const Vec3vfN U = (p1-p0)+(r1-r0)*R; - const Vec3vfN V = cross(p1-p0,R); - Ng1_o = cross(V,U); - } - return valid; - } - - __forceinline vbool intersect(const Vec3fa& org, const Vec3fa& dir, BBox>& t_o) const - { - vfloat u0_o; Vec3vfN Ng0_o; vfloat u1_o; Vec3vfN Ng1_o; - return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); - } - }; - } -} - diff --git a/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h deleted file mode 100644 index 0902baff7d..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "curve_intersector_precalculations.h" - -namespace embree -{ - namespace isa - { - namespace __coneline_internal - { - template - static __forceinline bool intersectCone(const vbool& valid_i, - const Vec3vf& ray_org_in, const Vec3vf& ray_dir, - const vfloat& ray_tnear, const ray_tfar_func& ray_tfar, - const Vec4vf& v0, const Vec4vf& v1, - const vbool& cL, const vbool& cR, - const Epilog& epilog) - { - vbool valid = valid_i; - - /* move ray origin closer to make calculations numerically stable */ - const vfloat dOdO = sqr(ray_dir); - const vfloat rcp_dOdO = rcp(dOdO); - const Vec3vf center = vfloat(0.5f)*(v0.xyz()+v1.xyz()); - const vfloat dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO; - const Vec3vf ray_org = ray_org_in + dt*ray_dir; - - const Vec3vf dP = v1.xyz() - v0.xyz(); - const Vec3vf p0 = ray_org - v0.xyz(); - const Vec3vf p1 = ray_org - v1.xyz(); - - const vfloat dPdP = sqr(dP); - const vfloat dP0 = dot(p0,dP); - const vfloat dP1 = dot(p1,dP); - const vfloat dOdP = dot(ray_dir,dP); - - // intersect cone body - const vfloat dr = v0.w - v1.w; - const vfloat hy = dPdP + sqr(dr); - const vfloat dO0 = dot(ray_dir,p0); - const vfloat OO = sqr(p0); - const vfloat dPdP2 = sqr(dPdP); - const vfloat dPdPr0 = dPdP*v0.w; - - const vfloat A = dPdP2 - sqr(dOdP)*hy; - const vfloat B = dPdP2*dO0 - dP0*dOdP*hy + dPdPr0*(dr*dOdP); - const vfloat C = dPdP2*OO - sqr(dP0)*hy + dPdPr0*(2.0f*dr*dP0 - dPdPr0); - - const vfloat D = B*B - A*C; - valid &= D >= 0.0f; - if (unlikely(none(valid))) { - return false; - } - - /* standard case for "non-parallel" rays */ - const vfloat Q = sqrt(D); - const vfloat rcp_A = rcp(A); - /* special case for rays that are "parallel" to the cone - assume miss */ - const vbool isParallel = abs(A) <= min_rcp_input; - - vfloat t_cone_lower = select (isParallel, neg_inf, (-B-Q)*rcp_A); - vfloat t_cone_upper = select (isParallel, pos_inf, (-B+Q)*rcp_A); - const vfloat y_lower = dP0 + t_cone_lower*dOdP; - const vfloat y_upper = dP0 + t_cone_upper*dOdP; - t_cone_lower = select(valid & y_lower > 0.0f & y_lower < dPdP, t_cone_lower, pos_inf); - t_cone_upper = select(valid & y_upper > 0.0f & y_upper < dPdP, t_cone_upper, neg_inf); - - const vbool hitDisk0 = valid & cL; - const vbool hitDisk1 = valid & cR; - const vfloat rcp_dOdP = rcp(dOdP); - const vfloat t_disk0 = select (hitDisk0, select (sqr(p0*dOdP-ray_dir*dP0)<(sqr(v0.w)*sqr(dOdP)), -dP0*rcp_dOdP, pos_inf), pos_inf); - const vfloat t_disk1 = select (hitDisk1, select (sqr(p1*dOdP-ray_dir*dP1)<(sqr(v1.w)*sqr(dOdP)), -dP1*rcp_dOdP, pos_inf), pos_inf); - const vfloat t_disk_lower = min(t_disk0, t_disk1); - const vfloat t_disk_upper = max(t_disk0, t_disk1); - - const vfloat t_lower = min(t_cone_lower, t_disk_lower); - const vfloat t_upper = max(t_cone_upper, select(t_lower==t_disk_lower, - select(t_disk_upper==vfloat(pos_inf),neg_inf,t_disk_upper), - select(t_disk_lower==vfloat(pos_inf),neg_inf,t_disk_lower))); - - const vbool valid_lower = valid & ray_tnear <= dt+t_lower & dt+t_lower <= ray_tfar() & t_lower != vfloat(pos_inf); - const vbool valid_upper = valid & ray_tnear <= dt+t_upper & dt+t_upper <= ray_tfar() & t_upper != vfloat(neg_inf); - - const vbool valid_first = valid_lower | valid_upper; - if (unlikely(none(valid_first))) - return false; - - const vfloat t_first = select(valid_lower, t_lower, t_upper); - const vfloat y_first = select(valid_lower, y_lower, y_upper); - - const vfloat rcp_dPdP = rcp(dPdP); - const Vec3vf dP2drr0dP = dPdP*dr*v0.w*dP; - const Vec3vf dPhy = dP*hy; - const vbool cone_hit_first = valid & (t_first == t_cone_lower | t_first == t_cone_upper); - const vbool disk0_hit_first = valid & (t_first == t_disk0); - const Vec3vf Ng_first = select(cone_hit_first, dPdP2*(p0+t_first*ray_dir)+dP2drr0dP-dPhy*y_first, select(disk0_hit_first, -dP, dP)); - const vfloat u_first = select(cone_hit_first, y_first*rcp_dPdP, select(disk0_hit_first, vfloat(zero), vfloat(one))); - - /* invoke intersection filter for first hit */ - RoundLineIntersectorHitM hit(u_first,zero,dt+t_first,Ng_first); - const bool is_hit_first = epilog(valid_first, hit); - - /* check for possible second hits before potentially accepted hit */ - const vfloat t_second = t_upper; - const vfloat y_second = y_upper; - const vbool valid_second = valid_lower & valid_upper & (dt+t_upper <= ray_tfar()); - if (unlikely(none(valid_second))) - return is_hit_first; - - /* invoke intersection filter for second hit */ - const vbool cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper; - const vbool disk0_hit_second = t_second == t_disk0; - const Vec3vf Ng_second = select(cone_hit_second, dPdP2*(p0+t_second*ray_dir)+dP2drr0dP-dPhy*y_second, select(disk0_hit_second, -dP, dP)); - const vfloat u_second = select(cone_hit_second, y_second*rcp_dPdP, select(disk0_hit_first, vfloat(zero), vfloat(one))); - - hit = RoundLineIntersectorHitM(u_second,zero,dt+t_second,Ng_second); - const bool is_hit_second = epilog(valid_second, hit); - - return is_hit_first | is_hit_second; - } - } - - template - struct ConeLineIntersectorHitM - { - __forceinline ConeLineIntersectorHitM() {} - - __forceinline ConeLineIntersectorHitM(const vfloat& u, const vfloat& v, const vfloat& t, const Vec3vf& Ng) - : vu(u), vv(v), vt(t), vNg(Ng) {} - - __forceinline void finalize() {} - - __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } - __forceinline float t (const size_t i) const { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } - - public: - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - }; - - template - struct ConeCurveIntersector1 - { - typedef CurvePrecalculations1 Precalculations; - - struct ray_tfar { - Ray& ray; - __forceinline ray_tfar(Ray& ray) : ray(ray) {} - __forceinline vfloat operator() () const { return ray.tfar; }; - }; - - template - static __forceinline bool intersect(const vbool& valid_i, - Ray& ray, - IntersectContext* context, - const LineSegments* geom, - const Precalculations& pre, - const Vec4vf& v0i, const Vec4vf& v1i, - const vbool& cL, const vbool& cR, - const Epilog& epilog) - { - const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); - const Vec3vf ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); - const vfloat ray_tnear(ray.tnear()); - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); - return __coneline_internal::intersectCone(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,cL,cR,epilog); - } - }; - - template - struct ConeCurveIntersectorK - { - typedef CurvePrecalculationsK Precalculations; - - struct ray_tfar { - RayK& ray; - size_t k; - __forceinline ray_tfar(RayK& ray, size_t k) : ray(ray), k(k) {} - __forceinline vfloat operator() () const { return ray.tfar[k]; }; - }; - - template - static __forceinline bool intersect(const vbool& valid_i, - RayK& ray, size_t k, - IntersectContext* context, - const LineSegments* geom, - const Precalculations& pre, - const Vec4vf& v0i, const Vec4vf& v1i, - const vbool& cL, const vbool& cR, - const Epilog& epilog) - { - const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); - const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); - const vfloat ray_tnear = ray.tnear()[k]; - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); - return __coneline_internal::intersectCone(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,cL,cR,epilog); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h deleted file mode 100644 index d47218eb8b..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "coneline_intersector.h" -#include "intersector_epilog.h" - -namespace embree -{ - namespace isa - { - template - struct ConeCurveMiIntersector1 - { - typedef LineMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; - vbool cL,cR; - line.gather(v0,v1,cL,cR,geom); - const vbool valid = line.template valid(); - ConeCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; - vbool cL,cR; - line.gather(v0,v1,cL,cR,geom); - const vbool valid = line.template valid(); - return ConeCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); - return false; - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) - { - return PrimitivePointQuery1::pointQuery(query, context, line); - } - }; - - template - struct ConeCurveMiMBIntersector1 - { - typedef LineMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; - vbool cL,cR; - line.gather(v0,v1,cL,cR,geom,ray.time()); - const vbool valid = line.template valid(); - ConeCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; - vbool cL,cR; - line.gather(v0,v1,cL,cR,geom,ray.time()); - const vbool valid = line.template valid(); - return ConeCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); - return false; - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) - { - return PrimitivePointQuery1::pointQuery(query, context, line); - } - }; - - template - struct ConeCurveMiIntersectorK - { - typedef LineMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; - vbool cL,cR; - line.gather(v0,v1,cL,cR,geom); - const vbool valid = line.template valid(); - ConeCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; - vbool cL,cR; - line.gather(v0,v1,cL,cR,geom); - const vbool valid = line.template valid(); - return ConeCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - }; - - template - struct ConeCurveMiMBIntersectorK - { - typedef LineMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; - vbool cL,cR; - line.gather(v0,v1,cL,cR,geom,ray.time()[k]); - const vbool valid = line.template valid(); - ConeCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; - vbool cL,cR; - line.gather(v0,v1,cL,cR,geom,ray.time()[k]); - const vbool valid = line.template valid(); - return ConeCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi.h deleted file mode 100644 index 51384f1959..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curveNi.h +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" -#include "curve_intersector_precalculations.h" - -namespace embree -{ - template - struct CurveNi - { - struct Type : public PrimitiveType { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* Returns maximum number of stored primitives */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } - - static __forceinline size_t bytes(size_t N) - { - const size_t f = N/M, r = N%M; - static_assert(sizeof(CurveNi) == 22+25*M, "internal data layout issue"); - return f*sizeof(CurveNi) + (r!=0)*(22 + 25*r); - } - - public: - - /*! Default constructor. */ - __forceinline CurveNi () {} - - /*! fill curve from curve list */ - __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) - { - size_t end = min(begin+M,_end); - N = (uint8_t)(end-begin); - const unsigned int geomID0 = prims[begin].geomID(); - this->geomID(N) = geomID0; - ty = (uint8_t) scene->get(geomID0)->getType(); - - /* encode all primitives */ - BBox3fa bounds = empty; - for (size_t i=0; iget(geomID)->vbounds(primID)); - } - - /* calculate offset and scale */ - Vec3fa loffset = bounds.lower; - float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f))); - if (bounds.size() == Vec3fa(zero)) lscale = 0.0f; - *this->offset(N) = loffset; - *this->scale(N) = lscale; - - /* encode all primitives */ - for (size_t i=0; iget(geomID)->computeAlignedSpace(primID); - - const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); - const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID); - - bounds_vx_x(N)[i] = (int8_t) space3.vx.x; - bounds_vx_y(N)[i] = (int8_t) space3.vx.y; - bounds_vx_z(N)[i] = (int8_t) space3.vx.z; - bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f); - bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f); - assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f); - assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f); - - bounds_vy_x(N)[i] = (int8_t) space3.vy.x; - bounds_vy_y(N)[i] = (int8_t) space3.vy.y; - bounds_vy_z(N)[i] = (int8_t) space3.vy.z; - bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f); - bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f); - assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f); - assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f); - - bounds_vz_x(N)[i] = (int8_t) space3.vz.x; - bounds_vz_y(N)[i] = (int8_t) space3.vz.y; - bounds_vz_z(N)[i] = (int8_t) space3.vz.z; - bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f); - bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f); - assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f); - assert(-32767.0f <= ceil (bounds.upper.z) && ceil (bounds.upper.z) <= 32767.0f); - - this->primID(N)[i] = primID; - } - } - - template - __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range& set, const Allocator& alloc) - { - size_t start = set.begin(); - size_t items = CurveNi::blocks(set.size()); - size_t numbytes = CurveNi::bytes(set.size()); - CurveNi* accel = (CurveNi*) alloc.malloc1(numbytes,BVH::byteAlignment); - for (size_t i=0; iscene); - } - return bvh->encodeLeaf((int8_t*)accel,items); - }; - - public: - - // 27.6 - 46 bytes per primitive - uint8_t ty; - uint8_t N; - uint8_t data[4+25*M+16]; - - /* - struct Layout - { - unsigned int geomID; - unsigned int primID[N]; - - int8_t bounds_vx_x[N]; - int8_t bounds_vx_y[N]; - int8_t bounds_vx_z[N]; - short bounds_vx_lower[N]; - short bounds_vx_upper[N]; - - int8_t bounds_vy_x[N]; - int8_t bounds_vy_y[N]; - int8_t bounds_vy_z[N]; - short bounds_vy_lower[N]; - short bounds_vy_upper[N]; - - int8_t bounds_vz_x[N]; - int8_t bounds_vz_y[N]; - int8_t bounds_vz_z[N]; - short bounds_vz_lower[N]; - short bounds_vz_upper[N]; - - Vec3f offset; - float scale; - }; - */ - - __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); } - __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); } - - __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); } - __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); } - - __forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); } - __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); } - - __forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); } - __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); } - - __forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); } - __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); } - - __forceinline short* bounds_vx_lower(size_t N) { return (short*)((int8_t*)this+6+7*N); } - __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((int8_t*)this+6+7*N); } - - __forceinline short* bounds_vx_upper(size_t N) { return (short*)((int8_t*)this+6+9*N); } - __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((int8_t*)this+6+9*N); } - - __forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+11*N); } - __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+11*N); } - - __forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+12*N); } - __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+12*N); } - - __forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+13*N); } - __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+13*N); } - - __forceinline short* bounds_vy_lower(size_t N) { return (short*)((int8_t*)this+6+14*N); } - __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((int8_t*)this+6+14*N); } - - __forceinline short* bounds_vy_upper(size_t N) { return (short*)((int8_t*)this+6+16*N); } - __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((int8_t*)this+6+16*N); } - - __forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+18*N); } - __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+18*N); } - - __forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+19*N); } - __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+19*N); } - - __forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+20*N); } - __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+20*N); } - - __forceinline short* bounds_vz_lower(size_t N) { return (short*)((int8_t*)this+6+21*N); } - __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((int8_t*)this+6+21*N); } - - __forceinline short* bounds_vz_upper(size_t N) { return (short*)((int8_t*)this+6+23*N); } - __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((int8_t*)this+6+23*N); } - - __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+25*N); } - __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+25*N); } - - __forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+25*N+12); } - __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+25*N+12); } - - __forceinline int8_t* end(size_t N) { return (int8_t*)this+6+25*N+16; } - __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+25*N+16; } - }; - - template - typename CurveNi::Type CurveNi::type; - - typedef CurveNi<4> Curve4i; - typedef CurveNi<8> Curve8i; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h deleted file mode 100644 index 0f9038c9fc..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h +++ /dev/null @@ -1,569 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curveNi.h" - -namespace embree -{ - namespace isa - { - template - struct CurveNiIntersector1 - { - typedef CurveNi Primitive; - typedef Vec3vf Vec3vfM; - typedef LinearSpace3LinearSpace3vfM; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline vbool intersect(Ray& ray, const Primitive& prim, vfloat& tNear_o) - { - const size_t N = prim.N; - const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); - const Vec3fa offset = Vec3fa(offset_scale); - const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); - const Vec3fa org1 = (ray.org-offset)*scale; - const Vec3fa dir1 = ray.dir*scale; - - const LinearSpace3vfM space(vfloat::load(prim.bounds_vx_x(N)), vfloat::load(prim.bounds_vx_y(N)), vfloat::load(prim.bounds_vx_z(N)), - vfloat::load(prim.bounds_vy_x(N)), vfloat::load(prim.bounds_vy_y(N)), vfloat::load(prim.bounds_vy_z(N)), - vfloat::load(prim.bounds_vz_x(N)), vfloat::load(prim.bounds_vz_y(N)), vfloat::load(prim.bounds_vz_z(N))); - - const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); - const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); - const Vec3vfM rcp_dir2 = rcp_safe(dir2); - - const vfloat t_lower_x = (vfloat::load(prim.bounds_vx_lower(N))-vfloat(org2.x))*vfloat(rcp_dir2.x); - const vfloat t_upper_x = (vfloat::load(prim.bounds_vx_upper(N))-vfloat(org2.x))*vfloat(rcp_dir2.x); - const vfloat t_lower_y = (vfloat::load(prim.bounds_vy_lower(N))-vfloat(org2.y))*vfloat(rcp_dir2.y); - const vfloat t_upper_y = (vfloat::load(prim.bounds_vy_upper(N))-vfloat(org2.y))*vfloat(rcp_dir2.y); - const vfloat t_lower_z = (vfloat::load(prim.bounds_vz_lower(N))-vfloat(org2.z))*vfloat(rcp_dir2.z); - const vfloat t_upper_z = (vfloat::load(prim.bounds_vz_upper(N))-vfloat(org2.z))*vfloat(rcp_dir2.z); - - const vfloat round_up (1.0f+3.0f*float(ulp)); - const vfloat round_down(1.0f-3.0f*float(ulp)); - const vfloat tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat(ray.tnear())); - const vfloat tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat(ray.tfar)); - tNear_o = tNear; - return (vint(step) < vint(prim.N)) & (tNear <= tFar); - } - - template - static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - const unsigned int primID1 = prim.primID(N)[i1]; - geom->prefetchL1_vertices(geom->curve(primID1)); - if (mask1) { - const size_t i2 = bsf(mask1); - const unsigned int primID2 = prim.primID(N)[i2]; - geom->prefetchL2_vertices(geom->curve(primID2)); - } - } - - Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - } - - template - static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - const unsigned int primID1 = prim.primID(N)[i1]; - geom->prefetchL1_vertices(geom->curve(primID1)); - if (mask1) { - const size_t i2 = bsf(mask1); - const unsigned int primID2 = prim.primID(N)[i2]; - geom->prefetchL2_vertices(geom->curve(primID2)); - } - } - - if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - return false; - } - - template - static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - - unsigned int vertexID = geom->curve(primID); - Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - const unsigned int primID1 = prim.primID(N)[i1]; - geom->prefetchL1_vertices(geom->curve(primID1)); - if (mask1) { - const size_t i2 = bsf(mask1); - const unsigned int primID2 = prim.primID(N)[i2]; - geom->prefetchL2_vertices(geom->curve(primID2)); - } - } - - Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - } - - template - static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - - unsigned int vertexID = geom->curve(primID); - Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - const unsigned int primID1 = prim.primID(N)[i1]; - geom->prefetchL1_vertices(geom->curve(primID1)); - if (mask1) { - const size_t i2 = bsf(mask1); - const unsigned int primID2 = prim.primID(N)[i2]; - geom->prefetchL2_vertices(geom->curve(primID2)); - } - } - - if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - return false; - } - - template - static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); - Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - } - - template - static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); - if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - return false; - } - - template - static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); - Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - } - - template - static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); - if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - return false; - } - }; - - template - struct CurveNiIntersectorK - { - typedef CurveNi Primitive; - typedef Vec3vf Vec3vfM; - typedef LinearSpace3LinearSpace3vfM; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline vbool intersect(RayK& ray, const size_t k, const Primitive& prim, vfloat& tNear_o) - { - const size_t N = prim.N; - const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); - const Vec3fa offset = Vec3fa(offset_scale); - const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); - - const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); - const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); - const Vec3fa org1 = (ray_org-offset)*scale; - const Vec3fa dir1 = ray_dir*scale; - - const LinearSpace3vfM space(vfloat::load(prim.bounds_vx_x(N)), vfloat::load(prim.bounds_vx_y(N)), vfloat::load(prim.bounds_vx_z(N)), - vfloat::load(prim.bounds_vy_x(N)), vfloat::load(prim.bounds_vy_y(N)), vfloat::load(prim.bounds_vy_z(N)), - vfloat::load(prim.bounds_vz_x(N)), vfloat::load(prim.bounds_vz_y(N)), vfloat::load(prim.bounds_vz_z(N))); - - const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); - const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); - const Vec3vfM rcp_dir2 = rcp_safe(dir2); - - const vfloat t_lower_x = (vfloat::load(prim.bounds_vx_lower(N))-vfloat(org2.x))*vfloat(rcp_dir2.x); - const vfloat t_upper_x = (vfloat::load(prim.bounds_vx_upper(N))-vfloat(org2.x))*vfloat(rcp_dir2.x); - const vfloat t_lower_y = (vfloat::load(prim.bounds_vy_lower(N))-vfloat(org2.y))*vfloat(rcp_dir2.y); - const vfloat t_upper_y = (vfloat::load(prim.bounds_vy_upper(N))-vfloat(org2.y))*vfloat(rcp_dir2.y); - const vfloat t_lower_z = (vfloat::load(prim.bounds_vz_lower(N))-vfloat(org2.z))*vfloat(rcp_dir2.z); - const vfloat t_upper_z = (vfloat::load(prim.bounds_vz_upper(N))-vfloat(org2.z))*vfloat(rcp_dir2.z); - - const vfloat round_up (1.0f+3.0f*float(ulp)); - const vfloat round_down(1.0f-3.0f*float(ulp)); - const vfloat tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat(ray.tnear()[k])); - const vfloat tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat(ray.tfar[k])); - tNear_o = tNear; - return (vint(step) < vint(prim.N)) & (tNear <= tFar); - } - - template - static __forceinline void intersect_t(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - const unsigned int primID1 = prim.primID(N)[i1]; - geom->prefetchL1_vertices(geom->curve(primID1)); - if (mask1) { - const size_t i2 = bsf(mask1); - const unsigned int primID2 = prim.primID(N)[i2]; - geom->prefetchL2_vertices(geom->curve(primID2)); - } - } - - Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - } - - template - static __forceinline bool occluded_t(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - const unsigned int primID1 = prim.primID(N)[i1]; - geom->prefetchL1_vertices(geom->curve(primID1)); - if (mask1) { - const size_t i2 = bsf(mask1); - const unsigned int primID2 = prim.primID(N)[i2]; - geom->prefetchL2_vertices(geom->curve(primID2)); - } - } - - if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - return false; - } - - template - static __forceinline void intersect_n(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - - unsigned int vertexID = geom->curve(primID); - Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - const unsigned int primID1 = prim.primID(N)[i1]; - geom->prefetchL1_vertices(geom->curve(primID1)); - if (mask1) { - const size_t i2 = bsf(mask1); - const unsigned int primID2 = prim.primID(N)[i2]; - geom->prefetchL2_vertices(geom->curve(primID2)); - } - } - - Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - } - - template - static __forceinline bool occluded_n(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - - unsigned int vertexID = geom->curve(primID); - Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - const unsigned int primID1 = prim.primID(N)[i1]; - geom->prefetchL1_vertices(geom->curve(primID1)); - if (mask1) { - const size_t i2 = bsf(mask1); - const unsigned int primID2 = prim.primID(N)[i2]; - geom->prefetchL2_vertices(geom->curve(primID2)); - } - } - - if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - return false; - } - - template - static __forceinline void intersect_h(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); - Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - } - - template - static __forceinline bool occluded_h(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); - if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - return false; - } - - template - static __forceinline void intersect_hn(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); - Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - } - - template - static __forceinline bool occluded_hn(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); - if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - return false; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h deleted file mode 100644 index 0cd8f833fd..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" -#include "curve_intersector_precalculations.h" - -namespace embree -{ - template - struct CurveNiMB - { - struct Type : public PrimitiveType { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* Returns maximum number of stored primitives */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } - - static __forceinline size_t bytes(size_t N) - { - const size_t f = N/M, r = N%M; - static_assert(sizeof(CurveNiMB) == 6+37*M+24, "internal data layout issue"); - return f*sizeof(CurveNiMB) + (r!=0)*(6+37*r+24); - } - - public: - - /*! Default constructor. */ - __forceinline CurveNiMB () {} - - /*! fill curve from curve list */ - __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range) - { - size_t end = min(begin+M,_end); - N = (uint8_t)(end-begin); - const unsigned int geomID0 = prims[begin].geomID(); - this->geomID(N) = geomID0; - ty = (uint8_t) scene->get(geomID0)->getType(); - - /* encode all primitives */ - LBBox3fa lbounds = empty; - for (size_t i=0; iget(geomID)->vlinearBounds(primID,time_range)); - } - BBox3fa bounds = lbounds.bounds(); - - /* calculate offset and scale */ - Vec3fa loffset = bounds.lower; - float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f))); - if (bounds.size() == Vec3fa(zero)) lscale = 0.0f; - *this->offset(N) = loffset; - *this->scale(N) = lscale; - this->time_offset(N) = time_range.lower; - this->time_scale(N) = 1.0f/time_range.size(); - - /* encode all primitives */ - for (size_t i=0; iget(geomID)->computeAlignedSpaceMB(primID,time_range); - - const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); - const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range); - - // NOTE: this weird (int8_t) (short) cast works around VS2015 Win32 compiler bug - bounds_vx_x(N)[i] = (int8_t) (short) space3.vx.x; - bounds_vx_y(N)[i] = (int8_t) (short) space3.vx.y; - bounds_vx_z(N)[i] = (int8_t) (short) space3.vx.z; - bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f); - bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f); - bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f); - bounds_vx_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.x),-32767.0f,32767.0f); - assert(-32767.0f <= floor(bounds.bounds0.lower.x) && floor(bounds.bounds0.lower.x) <= 32767.0f); - assert(-32767.0f <= ceil (bounds.bounds0.upper.x) && ceil (bounds.bounds0.upper.x) <= 32767.0f); - assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f); - assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f); - - bounds_vy_x(N)[i] = (int8_t) (short) space3.vy.x; - bounds_vy_y(N)[i] = (int8_t) (short) space3.vy.y; - bounds_vy_z(N)[i] = (int8_t) (short) space3.vy.z; - bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f); - bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f); - bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f); - bounds_vy_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.y),-32767.0f,32767.0f); - assert(-32767.0f <= floor(bounds.bounds0.lower.y) && floor(bounds.bounds0.lower.y) <= 32767.0f); - assert(-32767.0f <= ceil (bounds.bounds0.upper.y) && ceil (bounds.bounds0.upper.y) <= 32767.0f); - assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f); - assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f); - - bounds_vz_x(N)[i] = (int8_t) (short) space3.vz.x; - bounds_vz_y(N)[i] = (int8_t) (short) space3.vz.y; - bounds_vz_z(N)[i] = (int8_t) (short) space3.vz.z; - bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f); - bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f); - bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f); - bounds_vz_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.z),-32767.0f,32767.0f); - assert(-32767.0f <= floor(bounds.bounds0.lower.z) && floor(bounds.bounds0.lower.z) <= 32767.0f); - assert(-32767.0f <= ceil (bounds.bounds0.upper.z) && ceil (bounds.bounds0.upper.z) <= 32767.0f); - assert(-32767.0f <= floor(bounds.bounds1.lower.z) && floor(bounds.bounds1.lower.z) <= 32767.0f); - assert(-32767.0f <= ceil (bounds.bounds1.upper.z) && ceil (bounds.bounds1.upper.z) <= 32767.0f); - - this->primID(N)[i] = primID; - } - - return lbounds; - } - - template - __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) - { - size_t start = prims.begin(); - size_t end = prims.end(); - size_t items = CurveNiMB::blocks(prims.size()); - size_t numbytes = CurveNiMB::bytes(prims.size()); - CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment); - const typename BVH::NodeRef node = bvh->encodeLeaf((int8_t*)accel,items); - - LBBox3fa bounds = empty; - for (size_t i=0; idata(),start,end,bvh->scene,prims.time_range)); - - return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range); - }; - - - public: - - // 27.6 - 46 bytes per primitive - uint8_t ty; - uint8_t N; - uint8_t data[4+37*M+24]; - - /* - struct Layout - { - unsigned int geomID; - unsigned int primID[N]; - - int8_t bounds_vx_x[N]; - int8_t bounds_vx_y[N]; - int8_t bounds_vx_z[N]; - short bounds_vx_lower0[N]; - short bounds_vx_upper0[N]; - short bounds_vx_lower1[N]; - short bounds_vx_upper1[N]; - - int8_t bounds_vy_x[N]; - int8_t bounds_vy_y[N]; - int8_t bounds_vy_z[N]; - short bounds_vy_lower0[N]; - short bounds_vy_upper0[N]; - short bounds_vy_lower1[N]; - short bounds_vy_upper1[N]; - - int8_t bounds_vz_x[N]; - int8_t bounds_vz_y[N]; - int8_t bounds_vz_z[N]; - short bounds_vz_lower0[N]; - short bounds_vz_upper0[N]; - short bounds_vz_lower1[N]; - short bounds_vz_upper1[N]; - - Vec3f offset; - float scale; - - float time_offset; - float time_scale; - }; - */ - - __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); } - __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); } - - __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); } - __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); } - - __forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); } - __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); } - - __forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); } - __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); } - - __forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); } - __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); } - - __forceinline short* bounds_vx_lower0(size_t N) { return (short*)((int8_t*)this+6+7*N); } - __forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((int8_t*)this+6+7*N); } - - __forceinline short* bounds_vx_upper0(size_t N) { return (short*)((int8_t*)this+6+9*N); } - __forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((int8_t*)this+6+9*N); } - - __forceinline short* bounds_vx_lower1(size_t N) { return (short*)((int8_t*)this+6+11*N); } - __forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((int8_t*)this+6+11*N); } - - __forceinline short* bounds_vx_upper1(size_t N) { return (short*)((int8_t*)this+6+13*N); } - __forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((int8_t*)this+6+13*N); } - - __forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+15*N); } - __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+15*N); } - - __forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+16*N); } - __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+16*N); } - - __forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+17*N); } - __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+17*N); } - - __forceinline short* bounds_vy_lower0(size_t N) { return (short*)((int8_t*)this+6+18*N); } - __forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((int8_t*)this+6+18*N); } - - __forceinline short* bounds_vy_upper0(size_t N) { return (short*)((int8_t*)this+6+20*N); } - __forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((int8_t*)this+6+20*N); } - - __forceinline short* bounds_vy_lower1(size_t N) { return (short*)((int8_t*)this+6+22*N); } - __forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((int8_t*)this+6+22*N); } - - __forceinline short* bounds_vy_upper1(size_t N) { return (short*)((int8_t*)this+6+24*N); } - __forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((int8_t*)this+6+24*N); } - - __forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+26*N); } - __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+26*N); } - - __forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+27*N); } - __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+27*N); } - - __forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+28*N); } - __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+28*N); } - - __forceinline short* bounds_vz_lower0(size_t N) { return (short*)((int8_t*)this+6+29*N); } - __forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((int8_t*)this+6+29*N); } - - __forceinline short* bounds_vz_upper0(size_t N) { return (short*)((int8_t*)this+6+31*N); } - __forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((int8_t*)this+6+31*N); } - - __forceinline short* bounds_vz_lower1(size_t N) { return (short*)((int8_t*)this+6+33*N); } - __forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((int8_t*)this+6+33*N); } - - __forceinline short* bounds_vz_upper1(size_t N) { return (short*)((int8_t*)this+6+35*N); } - __forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((int8_t*)this+6+35*N); } - - __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+37*N); } - __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+37*N); } - - __forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+37*N+12); } - __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+37*N+12); } - - __forceinline float& time_offset(size_t N) { return *(float*)((int8_t*)this+6+37*N+16); } - __forceinline const float& time_offset(size_t N) const { return *(float*)((int8_t*)this+6+37*N+16); } - - __forceinline float& time_scale(size_t N) { return *(float*)((int8_t*)this+6+37*N+20); } - __forceinline const float& time_scale(size_t N) const { return *(float*)((int8_t*)this+6+37*N+20); } - - __forceinline int8_t* end(size_t N) { return (int8_t*)this+6+37*N+24; } - __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+37*N+24; } - }; - - template - typename CurveNiMB::Type CurveNiMB::type; - - typedef CurveNiMB<4> Curve4iMB; - typedef CurveNiMB<8> Curve8iMB; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h deleted file mode 100644 index 0cbc764668..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h +++ /dev/null @@ -1,516 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curveNi_mb.h" -#include "../subdiv/linear_bezier_patch.h" - -namespace embree -{ - namespace isa - { - template - struct CurveNiMBIntersector1 - { - typedef CurveNiMB Primitive; - typedef Vec3vf Vec3vfM; - typedef LinearSpace3LinearSpace3vfM; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline vbool intersect(Ray& ray, const Primitive& prim, vfloat& tNear_o) - { - const size_t N = prim.N; - const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); - const Vec3fa offset = Vec3fa(offset_scale); - const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); - const Vec3fa org1 = (ray.org-offset)*scale; - const Vec3fa dir1 = ray.dir*scale; - - const LinearSpace3vfM space(vfloat::load(prim.bounds_vx_x(N)), vfloat::load(prim.bounds_vx_y(N)), vfloat::load(prim.bounds_vx_z(N)), - vfloat::load(prim.bounds_vy_x(N)), vfloat::load(prim.bounds_vy_y(N)), vfloat::load(prim.bounds_vy_z(N)), - vfloat::load(prim.bounds_vz_x(N)), vfloat::load(prim.bounds_vz_y(N)), vfloat::load(prim.bounds_vz_z(N))); - - const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); - const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); - const Vec3vfM rcp_dir2 = rcp_safe(dir2); - - const vfloat ltime = (ray.time()-prim.time_offset(N))*prim.time_scale(N); - const vfloat vx_lower0 = vfloat::load(prim.bounds_vx_lower0(N)); - const vfloat vx_lower1 = vfloat::load(prim.bounds_vx_lower1(N)); - const vfloat vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0); - const vfloat vx_upper0 = vfloat::load(prim.bounds_vx_upper0(N)); - const vfloat vx_upper1 = vfloat::load(prim.bounds_vx_upper1(N)); - const vfloat vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0); - - const vfloat vy_lower0 = vfloat::load(prim.bounds_vy_lower0(N)); - const vfloat vy_lower1 = vfloat::load(prim.bounds_vy_lower1(N)); - const vfloat vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0); - const vfloat vy_upper0 = vfloat::load(prim.bounds_vy_upper0(N)); - const vfloat vy_upper1 = vfloat::load(prim.bounds_vy_upper1(N)); - const vfloat vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0); - - const vfloat vz_lower0 = vfloat::load(prim.bounds_vz_lower0(N)); - const vfloat vz_lower1 = vfloat::load(prim.bounds_vz_lower1(N)); - const vfloat vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0); - const vfloat vz_upper0 = vfloat::load(prim.bounds_vz_upper0(N)); - const vfloat vz_upper1 = vfloat::load(prim.bounds_vz_upper1(N)); - const vfloat vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0); - - const vfloat t_lower_x = (vx_lower-vfloat(org2.x))*vfloat(rcp_dir2.x); - const vfloat t_upper_x = (vx_upper-vfloat(org2.x))*vfloat(rcp_dir2.x); - const vfloat t_lower_y = (vy_lower-vfloat(org2.y))*vfloat(rcp_dir2.y); - const vfloat t_upper_y = (vy_upper-vfloat(org2.y))*vfloat(rcp_dir2.y); - const vfloat t_lower_z = (vz_lower-vfloat(org2.z))*vfloat(rcp_dir2.z); - const vfloat t_upper_z = (vz_upper-vfloat(org2.z))*vfloat(rcp_dir2.z); - - const vfloat round_up (1.0f+3.0f*float(ulp)); - const vfloat round_down(1.0f-3.0f*float(ulp)); - const vfloat tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat(ray.tnear())); - const vfloat tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat(ray.tfar)); - tNear_o = tNear; - return (vint(step) < vint(prim.N)) & (tNear <= tFar); - } - - template - static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()); - - Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - } - - template - static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()); - - if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - return false; - } - - template - static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve(context, ray.org, primID,ray.time()); - Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - } - - template - static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve(context, ray.org, primID,ray.time()); - - if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - return false; - } - - template - static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()); - Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - } - - template - static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()); - if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - return false; - } - - template - static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve(context, ray.org, primID,ray.time()); - Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - } - - template - static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve(context, ray.org, primID,ray.time()); - if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - return false; - } - }; - - template - struct CurveNiMBIntersectorK - { - typedef CurveNiMB Primitive; - typedef Vec3vf Vec3vfM; - typedef LinearSpace3LinearSpace3vfM; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline vbool intersect(RayK& ray, const size_t k, const Primitive& prim, vfloat& tNear_o) - { - const size_t N = prim.N; - const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); - const Vec3fa offset = Vec3fa(offset_scale); - const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); - - const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); - const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); - const Vec3fa org1 = (ray_org-offset)*scale; - const Vec3fa dir1 = ray_dir*scale; - - const LinearSpace3vfM space(vfloat::load(prim.bounds_vx_x(N)), vfloat::load(prim.bounds_vx_y(N)), vfloat::load(prim.bounds_vx_z(N)), - vfloat::load(prim.bounds_vy_x(N)), vfloat::load(prim.bounds_vy_y(N)), vfloat::load(prim.bounds_vy_z(N)), - vfloat::load(prim.bounds_vz_x(N)), vfloat::load(prim.bounds_vz_y(N)), vfloat::load(prim.bounds_vz_z(N))); - - const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); - const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); - const Vec3vfM rcp_dir2 = rcp_safe(dir2); - - const vfloat ltime = (ray.time()[k]-prim.time_offset(N))*prim.time_scale(N); - const vfloat vx_lower0 = vfloat::load(prim.bounds_vx_lower0(N)); - const vfloat vx_lower1 = vfloat::load(prim.bounds_vx_lower1(N)); - const vfloat vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0); - const vfloat vx_upper0 = vfloat::load(prim.bounds_vx_upper0(N)); - const vfloat vx_upper1 = vfloat::load(prim.bounds_vx_upper1(N)); - const vfloat vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0); - - const vfloat vy_lower0 = vfloat::load(prim.bounds_vy_lower0(N)); - const vfloat vy_lower1 = vfloat::load(prim.bounds_vy_lower1(N)); - const vfloat vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0); - const vfloat vy_upper0 = vfloat::load(prim.bounds_vy_upper0(N)); - const vfloat vy_upper1 = vfloat::load(prim.bounds_vy_upper1(N)); - const vfloat vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0); - - const vfloat vz_lower0 = vfloat::load(prim.bounds_vz_lower0(N)); - const vfloat vz_lower1 = vfloat::load(prim.bounds_vz_lower1(N)); - const vfloat vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0); - const vfloat vz_upper0 = vfloat::load(prim.bounds_vz_upper0(N)); - const vfloat vz_upper1 = vfloat::load(prim.bounds_vz_upper1(N)); - const vfloat vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0); - - const vfloat t_lower_x = (vx_lower-vfloat(org2.x))*vfloat(rcp_dir2.x); - const vfloat t_upper_x = (vx_upper-vfloat(org2.x))*vfloat(rcp_dir2.x); - const vfloat t_lower_y = (vy_lower-vfloat(org2.y))*vfloat(rcp_dir2.y); - const vfloat t_upper_y = (vy_upper-vfloat(org2.y))*vfloat(rcp_dir2.y); - const vfloat t_lower_z = (vz_lower-vfloat(org2.z))*vfloat(rcp_dir2.z); - const vfloat t_upper_z = (vz_upper-vfloat(org2.z))*vfloat(rcp_dir2.z); - - const vfloat round_up (1.0f+3.0f*float(ulp)); - const vfloat round_down(1.0f-3.0f*float(ulp)); - const vfloat tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat(ray.tnear()[k])); - const vfloat tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat(ray.tfar[k])); - tNear_o = tNear; - return (vint(step) < vint(prim.N)) & (tNear <= tFar); - } - - template - static __forceinline void intersect_t(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]); - - Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - } - - template - static __forceinline bool occluded_t(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]); - - if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - return false; - } - - template - static __forceinline void intersect_n(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); - const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve(context, ray_org, primID,ray.time()[k]); - Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - } - - template - static __forceinline bool occluded_n(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); - const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve(context, ray_org, primID,ray.time()[k]); - - if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - return false; - } - - template - static __forceinline void intersect_h(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); - Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - } - - template - static __forceinline bool occluded_h(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); - if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - return false; - } - - template - static __forceinline void intersect_hn(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); - const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve(context, ray_org, primID,ray.time()[k]); - Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - } - - template - static __forceinline bool occluded_hn(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = context->scene->get(geomID); - const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); - const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve(context, ray_org, primID,ray.time()[k]); - if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - return false; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv.h deleted file mode 100644 index 6eb5e30b39..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curveNv.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curveNi.h" - -namespace embree -{ - template - struct CurveNv : public CurveNi - { - using CurveNi::N; - - struct Type : public PrimitiveType { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* Returns maximum number of stored primitives */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } - - static __forceinline size_t bytes(size_t N) - { - const size_t f = N/M, r = N%M; - static_assert(sizeof(CurveNv) == 22+25*M+4*16*M, "internal data layout issue"); - return f*sizeof(CurveNv) + (r!=0)*(22 + 25*r + 4*16*r); - } - - public: - - /*! Default constructor. */ - __forceinline CurveNv () {} - - /*! fill curve from curve list */ - __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) - { - size_t end = min(begin+M,_end); - size_t N = end-begin; - - /* encode all primitives */ - for (size_t i=0; iget(geomID); - const unsigned vtxID = mesh->curve(primID); - Vec3fa::storeu(&this->vertices(i,N)[0],mesh->vertex(vtxID+0)); - Vec3fa::storeu(&this->vertices(i,N)[1],mesh->vertex(vtxID+1)); - Vec3fa::storeu(&this->vertices(i,N)[2],mesh->vertex(vtxID+2)); - Vec3fa::storeu(&this->vertices(i,N)[3],mesh->vertex(vtxID+3)); - } - } - - template - __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range& set, const Allocator& alloc) - { - if (set.size() == 0) - return BVH::emptyNode; - - /* fall back to CurveNi for oriented curves */ - unsigned int geomID = prims[set.begin()].geomID(); - if (bvh->scene->get(geomID)->getCurveType() == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) { - return CurveNi::createLeaf(bvh,prims,set,alloc); - } - if (bvh->scene->get(geomID)->getCurveBasis() == Geometry::GTY_BASIS_HERMITE) { - return CurveNi::createLeaf(bvh,prims,set,alloc); - } - - size_t start = set.begin(); - size_t items = CurveNv::blocks(set.size()); - size_t numbytes = CurveNv::bytes(set.size()); - CurveNv* accel = (CurveNv*) alloc.malloc1(numbytes,BVH::byteAlignment); - for (size_t i=0; i::fill(prims,start,set.end(),bvh->scene); - accel[i].CurveNi::fill(prims,start,set.end(),bvh->scene); - } - return bvh->encodeLeaf((char*)accel,items); - }; - - public: - unsigned char data[4*16*M]; - __forceinline Vec3fa* vertices(size_t i, size_t N) { return (Vec3fa*)CurveNi::end(N)+4*i; } - __forceinline const Vec3fa* vertices(size_t i, size_t N) const { return (Vec3fa*)CurveNi::end(N)+4*i; } - }; - - template - typename CurveNv::Type CurveNv::type; - - typedef CurveNv<4> Curve4v; - typedef CurveNv<8> Curve8v; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h deleted file mode 100644 index e20da2882e..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curveNv.h" -#include "curveNi_intersector.h" - -namespace embree -{ - namespace isa - { - template - struct CurveNvIntersector1 : public CurveNiIntersector1 - { - typedef CurveNv Primitive; - typedef CurvePrecalculations1 Precalculations; - - template - static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = CurveNiIntersector1::intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); - const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); - const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); - const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); - const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - prefetchL1(&prim.vertices(i1,N)[0]); - prefetchL1(&prim.vertices(i1,N)[4]); - if (mask1) { - const size_t i2 = bsf(mask1); - prefetchL2(&prim.vertices(i2,N)[0]); - prefetchL2(&prim.vertices(i2,N)[4]); - } - } - - Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - } - - template - static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = CurveNiIntersector1::intersect(ray,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); - const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); - const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); - const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); - const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - prefetchL1(&prim.vertices(i1,N)[0]); - prefetchL1(&prim.vertices(i1,N)[4]); - if (mask1) { - const size_t i2 = bsf(mask1); - prefetchL2(&prim.vertices(i2,N)[0]); - prefetchL2(&prim.vertices(i2,N)[4]); - } - } - - if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar)); - } - return false; - } - }; - - template - struct CurveNvIntersectorK : public CurveNiIntersectorK - { - typedef CurveNv Primitive; - typedef CurvePrecalculationsK Precalculations; - - template - static __forceinline void intersect_t(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = CurveNiIntersectorK::intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(normal.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); - const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); - const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); - const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); - const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - prefetchL1(&prim.vertices(i1,N)[0]); - prefetchL1(&prim.vertices(i1,N)[4]); - if (mask1) { - const size_t i2 = bsf(mask1); - prefetchL2(&prim.vertices(i2,N)[0]); - prefetchL2(&prim.vertices(i2,N)[4]); - } - } - - Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - } - - template - static __forceinline bool occluded_t(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) - { - vfloat tNear; - vbool valid = CurveNiIntersectorK::intersect(ray,k,prim,tNear); - - const size_t N = prim.N; - size_t mask = movemask(valid); - while (mask) - { - const size_t i = bscf(mask); - STAT3(shadow.trav_prims,1,1,1); - const unsigned int geomID = prim.geomID(N); - const unsigned int primID = prim.primID(N)[i]; - const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); - const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); - const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); - const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); - const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); - - size_t mask1 = mask; - const size_t i1 = bscf(mask1); - if (mask) { - prefetchL1(&prim.vertices(i1,N)[0]); - prefetchL1(&prim.vertices(i1,N)[4]); - if (mask1) { - const size_t i2 = bsf(mask1); - prefetchL2(&prim.vertices(i2,N)[0]); - prefetchL2(&prim.vertices(i2,N)[4]); - } - } - - if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) - return true; - - mask &= movemask(tNear <= vfloat(ray.tfar[k])); - } - return false; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h deleted file mode 100644 index 204958f7cc..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" -#include "../subdiv/bezier_curve.h" -#include "../common/primref.h" -#include "bezier_hair_intersector.h" -#include "bezier_ribbon_intersector.h" -#include "bezier_curve_intersector.h" -#include "oriented_curve_intersector.h" -#include "../bvh/node_intersector1.h" - -// FIXME: this file seems replicate of curve_intersector_virtual.h - -namespace embree -{ - namespace isa - { - struct VirtualCurveIntersector1 - { - typedef unsigned char Primitive; - typedef CurvePrecalculations1 Precalculations; - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; - leafIntersector.intersect<1>(&pre,&ray,context,prim); - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; - return leafIntersector.occluded<1>(&pre,&ray,context,prim); - } - }; - - template - struct VirtualCurveIntersectorK - { - typedef unsigned char Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect(const vbool& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; - size_t mask = movemask(valid_i); - while (mask) leafIntersector.intersect(&pre,&ray,bscf(mask),context,prim); - } - - static __forceinline vbool occluded(const vbool& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; - vbool valid_o = false; - size_t mask = movemask(valid_i); - while (mask) { - size_t k = bscf(mask); - if (leafIntersector.occluded(&pre,&ray,k,context,prim)) - set(valid_o, k); - } - return valid_o; - } - - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; - leafIntersector.intersect(&pre,&ray,k,context,prim); - } - - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; - return leafIntersector.occluded(&pre,&ray,k,context,prim); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h deleted file mode 100644 index 343cc8ff28..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "curve_intersector_precalculations.h" - -namespace embree -{ - namespace isa - { - template - struct DistanceCurveHit - { - __forceinline DistanceCurveHit() {} - - __forceinline DistanceCurveHit(const vbool& valid, const vfloat& U, const vfloat& V, const vfloat& T, const int i, const int N, - const NativeCurve3fa& curve3D) - : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {} - - __forceinline void finalize() - { - vu = (vfloat(step)+U+vfloat(float(i)))*(1.0f/float(N)); - vv = V; - vt = T; - } - - __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } - __forceinline float t (const size_t i) const { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) const { - return curve3D.eval_du(vu[i]); - } - - public: - vfloat U; - vfloat V; - vfloat T; - int i, N; - NativeCurve3fa curve3D; - - public: - vbool valid; - vfloat vu; - vfloat vv; - vfloat vt; - }; - - template - struct DistanceCurve1Intersector1 - { - template - __forceinline bool intersect(const CurvePrecalculations1& pre,Ray& ray, - IntersectContext* context, - const CurveGeometry* geom, const unsigned int primID, - const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2, const Vec3fa& v3, - const Epilog& epilog) - { - const int N = geom->tessellationRate; - - /* transform control points into ray space */ - const NativeCurve3fa curve3Di(v0,v1,v2,v3); - const NativeCurve3fa curve3D = enlargeRadiusToMinWidth(context,geom,ray.org,curve3Di); - const NativeCurve3fa curve2D = curve3D.xfm_pr(pre.ray_space,ray.org); - - /* evaluate the bezier curve */ - vboolx valid = vfloatx(step) < vfloatx(float(N)); - const Vec4vfx p0 = curve2D.template eval0(0,N); - const Vec4vfx p1 = curve2D.template eval1(0,N); - - /* approximative intersection with cone */ - const Vec4vfx v = p1-p0; - const Vec4vfx w = -p0; - const vfloatx d0 = madd(w.x,v.x,w.y*v.y); - const vfloatx d1 = madd(v.x,v.x,v.y*v.y); - const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); - const Vec4vfx p = madd(u,v,p0); - const vfloatx t = p.z*pre.depth_scale; - const vfloatx d2 = madd(p.x,p.x,p.y*p.y); - const vfloatx r = p.w; - const vfloatx r2 = r*r; - valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections - - /* update hit information */ - bool ishit = false; - if (unlikely(any(valid))) { - DistanceCurveHit hit(valid,u,0.0f,t,0,N,curve3D); - ishit = ishit | epilog(valid,hit); - } - - if (unlikely(VSIZEX < N)) - { - /* process SIMD-size many segments per iteration */ - for (int i=VSIZEX; i(i,N); - const Vec4vfx p1 = curve2D.template eval1(i,N); - - /* approximative intersection with cone */ - const Vec4vfx v = p1-p0; - const Vec4vfx w = -p0; - const vfloatx d0 = madd(w.x,v.x,w.y*v.y); - const vfloatx d1 = madd(v.x,v.x,v.y*v.y); - const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); - const Vec4vfx p = madd(u,v,p0); - const vfloatx t = p.z*pre.depth_scale; - const vfloatx d2 = madd(p.x,p.x,p.y*p.y); - const vfloatx r = p.w; - const vfloatx r2 = r*r; - valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections - - /* update hit information */ - if (unlikely(any(valid))) { - DistanceCurveHit hit(valid,u,0.0f,t,i,N,curve3D); - ishit = ishit | epilog(valid,hit); - } - } - } - return ishit; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h deleted file mode 100644 index 47531027fc..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "curve_intersector_precalculations.h" -#include "curve_intersector_sweep.h" -#include "../subdiv/linear_bezier_patch.h" - -#define DBG(x) - -namespace embree -{ - namespace isa - { - template - struct TensorLinearCubicBezierSurfaceIntersector - { - const LinearSpace3fa& ray_space; - Ray& ray; - TensorLinearCubicBezierSurface3fa curve3d; - TensorLinearCubicBezierSurface2fa curve2d; - float eps; - const Epilog& epilog; - bool isHit; - - __forceinline TensorLinearCubicBezierSurfaceIntersector (const LinearSpace3fa& ray_space, Ray& ray, const TensorLinearCubicBezierSurface3fa& curve3d, const Epilog& epilog) - : ray_space(ray_space), ray(ray), curve3d(curve3d), epilog(epilog), isHit(false) - { - const TensorLinearCubicBezierSurface3fa curve3dray = curve3d.xfm(ray_space,ray.org); - curve2d = TensorLinearCubicBezierSurface2fa(CubicBezierCurve2fa(curve3dray.L),CubicBezierCurve2fa(curve3dray.R)); - const BBox2fa b2 = curve2d.bounds(); - eps = 8.0f*float(ulp)*reduce_max(max(abs(b2.lower),abs(b2.upper))); - } - - __forceinline Interval1f solve_linear(const float u0, const float u1, const float& p0, const float& p1) - { - if (p1 == p0) { - if (p0 == 0.0f) return Interval1f(u0,u1); - else return Interval1f(empty); - } - const float t = -p0/(p1-p0); - const float tt = lerp(u0,u1,t); - return Interval1f(tt); - } - - __forceinline void solve_linear(const float u0, const float u1, const Interval1f& p0, const Interval1f& p1, Interval1f& u) - { - if (sign(p0.lower) != sign(p0.upper)) u.extend(u0); - if (sign(p0.lower) != sign(p1.lower)) u.extend(solve_linear(u0,u1,p0.lower,p1.lower)); - if (sign(p0.upper) != sign(p1.upper)) u.extend(solve_linear(u0,u1,p0.upper,p1.upper)); - if (sign(p1.lower) != sign(p1.upper)) u.extend(u1); - } - - __forceinline Interval1f bezier_clipping(const CubicBezierCurve& curve) - { - Interval1f u = empty; - solve_linear(0.0f/3.0f,1.0f/3.0f,curve.v0,curve.v1,u); - solve_linear(0.0f/3.0f,2.0f/3.0f,curve.v0,curve.v2,u); - solve_linear(0.0f/3.0f,3.0f/3.0f,curve.v0,curve.v3,u); - solve_linear(1.0f/3.0f,2.0f/3.0f,curve.v1,curve.v2,u); - solve_linear(1.0f/3.0f,3.0f/3.0f,curve.v1,curve.v3,u); - solve_linear(2.0f/3.0f,3.0f/3.0f,curve.v2,curve.v3,u); - return intersect(u,Interval1f(0.0f,1.0f)); - } - - __forceinline Interval1f bezier_clipping(const LinearBezierCurve& curve) - { - Interval1f v = empty; - solve_linear(0.0f,1.0f,curve.v0,curve.v1,v); - return intersect(v,Interval1f(0.0f,1.0f)); - } - - __forceinline void solve_bezier_clipping(BBox1f cu, BBox1f cv, const TensorLinearCubicBezierSurface2fa& curve2) - { - BBox2fa bounds = curve2.bounds(); - if (bounds.upper.x < 0.0f) return; - if (bounds.upper.y < 0.0f) return; - if (bounds.lower.x > 0.0f) return; - if (bounds.lower.y > 0.0f) return; - - if (max(cu.size(),cv.size()) < 1E-4f) - { - const float u = cu.center(); - const float v = cv.center(); - TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); - const float t = curve_z.eval(u,v); - if (ray.tnear() <= t && t <= ray.tfar) { - const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); - BezierCurveHit hit(t,u,v,Ng); - isHit |= epilog(hit); - } - return; - } - - const Vec2fa dv = curve2.axis_v(); - const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv); - LinearBezierCurve curve0v = curve1v.reduce_u(); - if (!curve0v.hasRoot()) return; - - const Interval1f v = bezier_clipping(curve0v); - if (isEmpty(v)) return; - TensorLinearCubicBezierSurface2fa curve2a = curve2.clip_v(v); - cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); - - const Vec2fa du = curve2.axis_u(); - const TensorLinearCubicBezierSurface1f curve1u = curve2a.xfm(du); - CubicBezierCurve curve0u = curve1u.reduce_v(); - int roots = curve0u.maxRoots(); - if (roots == 0) return; - - if (roots == 1) - { - const Interval1f u = bezier_clipping(curve0u); - if (isEmpty(u)) return; - TensorLinearCubicBezierSurface2fa curve2b = curve2a.clip_u(u); - cu = BBox1f(lerp(cu.lower,cu.upper,u.lower),lerp(cu.lower,cu.upper,u.upper)); - solve_bezier_clipping(cu,cv,curve2b); - return; - } - - TensorLinearCubicBezierSurface2fa curve2l, curve2r; - curve2a.split_u(curve2l,curve2r); - solve_bezier_clipping(BBox1f(cu.lower,cu.center()),cv,curve2l); - solve_bezier_clipping(BBox1f(cu.center(),cu.upper),cv,curve2r); - } - - __forceinline bool solve_bezier_clipping() - { - solve_bezier_clipping(BBox1f(0.0f,1.0f),BBox1f(0.0f,1.0f),curve2d); - return isHit; - } - - __forceinline void solve_newton_raphson(BBox1f cu, BBox1f cv) - { - Vec2fa uv(cu.center(),cv.center()); - const Vec2fa dfdu = curve2d.eval_du(uv.x,uv.y); - const Vec2fa dfdv = curve2d.eval_dv(uv.x,uv.y); - const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv)); - solve_newton_raphson_loop(cu,cv,uv,dfdu,dfdv,rcp_J); - } - - __forceinline void solve_newton_raphson_loop(BBox1f cu, BBox1f cv, const Vec2fa& uv_in, const Vec2fa& dfdu, const Vec2fa& dfdv, const LinearSpace2fa& rcp_J) - { - Vec2fa uv = uv_in; - - for (size_t i=0; i<200; i++) - { - const Vec2fa f = curve2d.eval(uv.x,uv.y); - const Vec2fa duv = rcp_J*f; - uv -= duv; - - if (max(abs(f.x),abs(f.y)) < eps) - { - const float u = uv.x; - const float v = uv.y; - if (!(u >= 0.0f && u <= 1.0f)) return; // rejects NaNs - if (!(v >= 0.0f && v <= 1.0f)) return; // rejects NaNs - const TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); - const float t = curve_z.eval(u,v); - if (!(ray.tnear() <= t && t <= ray.tfar)) return; // rejects NaNs - const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); - BezierCurveHit hit(t,u,v,Ng); - isHit |= epilog(hit); - return; - } - } - } - - __forceinline bool clip_v(BBox1f& cu, BBox1f& cv) - { - const Vec2fa dv = curve2d.eval_dv(cu.lower,cv.lower); - const TensorLinearCubicBezierSurface1f curve1v = curve2d.xfm(dv).clip(cu,cv); - LinearBezierCurve curve0v = curve1v.reduce_u(); - if (!curve0v.hasRoot()) return false; - Interval1f v = bezier_clipping(curve0v); - if (isEmpty(v)) return false; - v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); - cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); - return true; - } - - __forceinline bool solve_krawczyk(bool very_small, BBox1f& cu, BBox1f& cv) - { - /* perform bezier clipping in v-direction to get tight v-bounds */ - TensorLinearCubicBezierSurface2fa curve2 = curve2d.clip(cu,cv); - const Vec2fa dv = curve2.axis_v(); - const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv); - LinearBezierCurve curve0v = curve1v.reduce_u(); - if (unlikely(!curve0v.hasRoot())) return true; - Interval1f v = bezier_clipping(curve0v); - if (unlikely(isEmpty(v))) return true; - v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); - curve2 = curve2.clip_v(v); - cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); - - /* perform one newton raphson iteration */ - Vec2fa c(cu.center(),cv.center()); - Vec2fa f,dfdu,dfdv; curve2d.eval(c.x,c.y,f,dfdu,dfdv); - const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv)); - const Vec2fa c1 = c - rcp_J*f; - - /* calculate bounds of derivatives */ - const BBox2fa bounds_du = (1.0f/cu.size())*curve2.derivative_u().bounds(); - const BBox2fa bounds_dv = (1.0f/cv.size())*curve2.derivative_v().bounds(); - - /* calculate krawczyk test */ - LinearSpace2> I(Interval1f(1.0f), Interval1f(0.0f), - Interval1f(0.0f), Interval1f(1.0f)); - - LinearSpace2> G(Interval1f(bounds_du.lower.x,bounds_du.upper.x), Interval1f(bounds_dv.lower.x,bounds_dv.upper.x), - Interval1f(bounds_du.lower.y,bounds_du.upper.y), Interval1f(bounds_dv.lower.y,bounds_dv.upper.y)); - - const LinearSpace2 rcp_J2(rcp_J); - const LinearSpace2> rcp_Ji(rcp_J2); - - const Vec2 x(cu,cv); - const Vec2 K = Vec2(Vec2f(c1)) + (I - rcp_Ji*G)*(x-Vec2(Vec2f(c))); - - /* test if there is no solution */ - const Vec2 KK = intersect(K,x); - if (unlikely(isEmpty(KK.x) || isEmpty(KK.y))) return true; - - /* exit if convergence cannot get proven, but terminate if we are very small */ - if (unlikely(!subset(K,x) && !very_small)) return false; - - /* solve using newton raphson iteration of convergence is guarenteed */ - solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J); - return true; - } - - __forceinline void solve_newton_raphson_no_recursion(BBox1f cu, BBox1f cv) - { - if (!clip_v(cu,cv)) return; - return solve_newton_raphson(cu,cv); - } - - __forceinline void solve_newton_raphson_recursion(BBox1f cu, BBox1f cv) - { - unsigned int sptr = 0; - const unsigned int stack_size = 4; - unsigned int mask_stack[stack_size]; - BBox1f cu_stack[stack_size]; - BBox1f cv_stack[stack_size]; - goto entry; - - /* terminate if stack is empty */ - while (sptr) - { - /* pop from stack */ - { - sptr--; - size_t mask = mask_stack[sptr]; - cu = cu_stack[sptr]; - cv = cv_stack[sptr]; - const size_t i = bscf(mask); - mask_stack[sptr] = mask; - if (mask) sptr++; // there are still items on the stack - - /* process next element recurse into each hit curve segment */ - const float u0 = float(i+0)*(1.0f/(VSIZEX-1)); - const float u1 = float(i+1)*(1.0f/(VSIZEX-1)); - const BBox1f cui(lerp(cu.lower,cu.upper,u0),lerp(cu.lower,cu.upper,u1)); - cu = cui; - } - -#if 0 - solve_newton_raphson_no_recursion(cu,cv); - continue; - -#else - /* we assume convergence for small u ranges and verify using krawczyk */ - if (cu.size() < 1.0f/6.0f) { - const bool very_small = cu.size() < 0.001f || sptr >= stack_size; - if (solve_krawczyk(very_small,cu,cv)) { - continue; - } - } -#endif - - entry: - - /* split the curve into VSIZEX-1 segments in u-direction */ - vboolx valid = true; - TensorLinearCubicBezierSurface subcurves = curve2d.clip_v(cv).vsplit_u(valid,cu); - - /* slabs test in u-direction */ - Vec2vfx ndv = cross(subcurves.axis_v()); - BBox boundsv = subcurves.vxfm(ndv).bounds(); - valid &= boundsv.lower <= eps; - valid &= boundsv.upper >= -eps; - if (none(valid)) continue; - - /* slabs test in v-direction */ - Vec2vfx ndu = cross(subcurves.axis_u()); - BBox boundsu = subcurves.vxfm(ndu).bounds(); - valid &= boundsu.lower <= eps; - valid &= boundsu.upper >= -eps; - if (none(valid)) continue; - - /* push valid segments to stack */ - assert(sptr < stack_size); - mask_stack [sptr] = movemask(valid); - cu_stack [sptr] = cu; - cv_stack [sptr] = cv; - sptr++; - } - } - - __forceinline bool solve_newton_raphson_main() - { - BBox1f vu(0.0f,1.0f); - BBox1f vv(0.0f,1.0f); - solve_newton_raphson_recursion(vu,vv); - return isHit; - } - }; - - - template class SourceCurve> - struct OrientedCurve1Intersector1 - { - //template using Curve = SourceCurve; - typedef SourceCurve SourceCurve3ff; - typedef SourceCurve SourceCurve3fa; - - __forceinline OrientedCurve1Intersector1() {} - - __forceinline OrientedCurve1Intersector1(const Ray& ray, const void* ptr) {} - - template - __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, - IntersectContext* context, - const CurveGeometry* geom, const unsigned int primID, - const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i, - const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i, - const Epilog& epilog) const - { - STAT3(normal.trav_prims,1,1,1); - - SourceCurve3ff ccurve(v0i,v1i,v2i,v3i); - SourceCurve3fa ncurve(n0i,n1i,n2i,n3i); - ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve); - TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); - //return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space,ray,curve,epilog).solve_bezier_clipping(); - return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main(); - } - - template - __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, - IntersectContext* context, - const CurveGeometry* geom, const unsigned int primID, - const TensorLinearCubicBezierSurface3fa& curve, const Epilog& epilog) const - { - STAT3(normal.trav_prims,1,1,1); - //return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space,ray,curve,epilog).solve_bezier_clipping(); - return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main(); - } - }; - - template class SourceCurve, int K> - struct OrientedCurve1IntersectorK - { - //template using Curve = SourceCurve; - typedef SourceCurve SourceCurve3ff; - typedef SourceCurve SourceCurve3fa; - - struct Ray1 - { - __forceinline Ray1(RayK& ray, size_t k) - : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {} - - Vec3fa org; - Vec3fa dir; - float _tnear; - float& tfar; - - __forceinline float& tnear() { return _tnear; } - //__forceinline float& tfar() { return _tfar; } - __forceinline const float& tnear() const { return _tnear; } - //__forceinline const float& tfar() const { return _tfar; } - }; - - template - __forceinline bool intersect(const CurvePrecalculationsK& pre, RayK& vray, size_t k, - IntersectContext* context, - const CurveGeometry* geom, const unsigned int primID, - const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i, - const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i, - const Epilog& epilog) - { - STAT3(normal.trav_prims,1,1,1); - Ray1 ray(vray,k); - SourceCurve3ff ccurve(v0i,v1i,v2i,v3i); - SourceCurve3fa ncurve(n0i,n1i,n2i,n3i); - ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve); - TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); - //return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping(); - return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main(); - } - - template - __forceinline bool intersect(const CurvePrecalculationsK& pre, RayK& vray, size_t k, - IntersectContext* context, - const CurveGeometry* geom, const unsigned int primID, - const TensorLinearCubicBezierSurface3fa& curve, - const Epilog& epilog) - { - STAT3(normal.trav_prims,1,1,1); - Ray1 ray(vray,k); - //return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping(); - return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main(); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h deleted file mode 100644 index 6e9fc91925..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "../common/geometry.h" - -namespace embree -{ - namespace isa - { - struct CurvePrecalculations1 - { - float depth_scale; - LinearSpace3fa ray_space; - - __forceinline CurvePrecalculations1() {} - - __forceinline CurvePrecalculations1(const Ray& ray, const void* ptr) - { - depth_scale = rsqrt(dot(ray.dir,ray.dir)); - LinearSpace3fa space = frame(depth_scale*ray.dir); - space.vz *= depth_scale; - ray_space = space.transposed(); - } - }; - - template - struct CurvePrecalculationsK - { - vfloat depth_scale; - LinearSpace3fa ray_space[K]; - - __forceinline CurvePrecalculationsK(const vbool& valid, const RayK& ray) - { - size_t mask = movemask(valid); - depth_scale = rsqrt(dot(ray.dir,ray.dir)); - while (mask) { - size_t k = bscf(mask); - Vec3fa ray_dir_k = Vec3fa(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); - LinearSpace3fa ray_space_k = frame(depth_scale[k]*ray_dir_k); - ray_space_k.vz *= depth_scale[k]; - ray_space[k] = ray_space_k.transposed(); - } - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h deleted file mode 100644 index a99cf99d56..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "quad_intersector.h" -#include "curve_intersector_precalculations.h" - -#define Bezier1Intersector1 RibbonCurve1Intersector1 -#define Bezier1IntersectorK RibbonCurve1IntersectorK - -namespace embree -{ - namespace isa - { - template - struct RibbonHit - { - __forceinline RibbonHit() {} - - __forceinline RibbonHit(const vbool& valid, const vfloat& U, const vfloat& V, const vfloat& T, const int i, const int N, - const NativeCurve3ff& curve3D) - : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {} - - __forceinline void finalize() - { - vu = (vfloat(step)+U+vfloat(float(i)))*(1.0f/float(N)); - vv = V; - vt = T; - } - - __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } - __forceinline float t (const size_t i) const { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) const { - return curve3D.eval_du(vu[i]); - } - - public: - vfloat U; - vfloat V; - vfloat T; - int i, N; - NativeCurve3ff curve3D; - - public: - vbool valid; - vfloat vu; - vfloat vv; - vfloat vt; - }; - - /* calculate squared distance of point p0 to line p1->p2 */ - __forceinline std::pair sqr_point_line_distance(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2) - { - const vfloatx num = det(p2-p1,p1-p0); - const vfloatx den2 = dot(p2-p1,p2-p1); - return std::make_pair(num*num,den2); - } - - /* performs culling against a cylinder */ - __forceinline vboolx cylinder_culling_test(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2, const vfloatx& r) - { - const std::pair d = sqr_point_line_distance(p0,p1,p2); - return d.first <= r*r*d.second; - } - - template - __forceinline bool intersect_ribbon(const Vec3fa& ray_org, const Vec3fa& ray_dir, const float ray_tnear, const float& ray_tfar, - const LinearSpace3fa& ray_space, const float& depth_scale, - const NativeCurve3ff& curve3D, const int N, - const Epilog& epilog) - { - /* transform control points into ray space */ - const NativeCurve3ff curve2D = curve3D.xfm_pr(ray_space,ray_org); - float eps = 4.0f*float(ulp)*reduce_max(max(abs(curve2D.v0),abs(curve2D.v1),abs(curve2D.v2),abs(curve2D.v3))); - - /* evaluate the bezier curve */ - bool ishit = false; - vboolx valid = vfloatx(step) < vfloatx(float(N)); - const Vec4vfx p0 = curve2D.template eval0(0,N); - const Vec4vfx p1 = curve2D.template eval1(0,N); - valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); - - if (any(valid)) - { - Vec3vfx dp0dt = curve2D.template derivative0(0,N); - Vec3vfx dp1dt = curve2D.template derivative1(0,N); - dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); - dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt); - const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f); - const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f); - const Vec3vfx nn0 = normalize(n0); - const Vec3vfx nn1 = normalize(n1); - const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); - const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1)); - const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0)); - const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1)); - - vfloatx vu,vv,vt; - vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt); - - if (any(valid0)) - { - /* ignore self intersections */ - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) { - vfloatx r = lerp(p0.w, p1.w, vu); - valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; - } - - if (any(valid0)) - { - vv = madd(2.0f,vv,vfloatx(-1.0f)); - RibbonHit bhit(valid0,vu,vv,vt,0,N,curve3D); - ishit |= epilog(bhit.valid,bhit); - } - } - } - - if (unlikely(VSIZEX < N)) - { - /* process SIMD-size many segments per iteration */ - for (int i=VSIZEX; i(i,N); - const Vec4vfx p1 = curve2D.template eval1(i,N); - valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); - if (none(valid)) continue; - - Vec3vfx dp0dt = curve2D.template derivative0(i,N); - Vec3vfx dp1dt = curve2D.template derivative1(i,N); - dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); - dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt); - const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f); - const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f); - const Vec3vfx nn0 = normalize(n0); - const Vec3vfx nn1 = normalize(n1); - const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); - const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1)); - const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0)); - const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1)); - - vfloatx vu,vv,vt; - vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt); - - if (any(valid0)) - { - /* ignore self intersections */ - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) { - vfloatx r = lerp(p0.w, p1.w, vu); - valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; - } - - if (any(valid0)) - { - vv = madd(2.0f,vv,vfloatx(-1.0f)); - RibbonHit bhit(valid0,vu,vv,vt,i,N,curve3D); - ishit |= epilog(bhit.valid,bhit); - } - } - } - } - return ishit; - } - - template class NativeCurve> - struct RibbonCurve1Intersector1 - { - typedef NativeCurve NativeCurve3ff; - - template - __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, - IntersectContext* context, - const CurveGeometry* geom, const unsigned int primID, - const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, - const Epilog& epilog) - { - const int N = geom->tessellationRate; - NativeCurve3ff curve(v0,v1,v2,v3); - curve = enlargeRadiusToMinWidth(context,geom,ray.org,curve); - return intersect_ribbon(ray.org,ray.dir,ray.tnear(),ray.tfar, - pre.ray_space,pre.depth_scale, - curve,N, - epilog); - } - }; - - template class NativeCurve, int K> - struct RibbonCurve1IntersectorK - { - typedef NativeCurve NativeCurve3ff; - - template - __forceinline bool intersect(const CurvePrecalculationsK& pre, RayK& ray, size_t k, - IntersectContext* context, - const CurveGeometry* geom, const unsigned int primID, - const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, - const Epilog& epilog) - { - const int N = geom->tessellationRate; - const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); - const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); - NativeCurve3ff curve(v0,v1,v2,v3); - curve = enlargeRadiusToMinWidth(context,geom,ray_org,curve); - return intersect_ribbon(ray_org,ray_dir,ray.tnear()[k],ray.tfar[k], - pre.ray_space[k],pre.depth_scale[k], - curve,N, - epilog); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h deleted file mode 100644 index 883cedc3d2..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h +++ /dev/null @@ -1,362 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "cylinder.h" -#include "plane.h" -#include "line_intersector.h" -#include "curve_intersector_precalculations.h" - -namespace embree -{ - namespace isa - { - static const size_t numJacobianIterations = 5; -#if defined(__AVX__) - static const size_t numBezierSubdivisions = 2; -#else - static const size_t numBezierSubdivisions = 3; -#endif - - struct BezierCurveHit - { - __forceinline BezierCurveHit() {} - - __forceinline BezierCurveHit(const float t, const float u, const Vec3fa& Ng) - : t(t), u(u), v(0.0f), Ng(Ng) {} - - __forceinline BezierCurveHit(const float t, const float u, const float v, const Vec3fa& Ng) - : t(t), u(u), v(v), Ng(Ng) {} - - __forceinline void finalize() {} - - public: - float t; - float u; - float v; - Vec3fa Ng; - }; - - template - __forceinline bool intersect_bezier_iterative_debug(const Ray& ray, const float dt, const NativeCurve3ff& curve, size_t i, - const vfloatx& u, const BBox& tp, const BBox& h0, const BBox& h1, - const Vec3vfx& Ng, const Vec4vfx& dP0du, const Vec4vfx& dP3du, - const Epilog& epilog) - { - if (tp.lower[i]+dt > ray.tfar) return false; - Vec3fa Ng_o = Vec3fa(Ng.x[i],Ng.y[i],Ng.z[i]); - if (h0.lower[i] == tp.lower[i]) Ng_o = -Vec3fa(dP0du.x[i],dP0du.y[i],dP0du.z[i]); - if (h1.lower[i] == tp.lower[i]) Ng_o = +Vec3fa(dP3du.x[i],dP3du.y[i],dP3du.z[i]); - BezierCurveHit hit(tp.lower[i]+dt,u[i],Ng_o); - return epilog(hit); - } - - template - __forceinline bool intersect_bezier_iterative_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, float u, float t, const Epilog& epilog) - { - const Vec3fa org = zero; - const Vec3fa dir = ray.dir; - const float length_ray_dir = length(dir); - - /* error of curve evaluations is propertional to largest coordinate */ - const BBox3ff box = curve.bounds(); - const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper))); - - for (size_t i=0; i= 0.0f && u <= 1.0f)) return false; // rejects NaNs - const Vec3fa R = normalize(Q-P); - const Vec3fa U = madd(Vec3fa(dPdu.w),R,dPdu); - const Vec3fa V = cross(dPdu,R); - BezierCurveHit hit(t,u,cross(V,U)); - return epilog(hit); - } - } - return false; - } - - template - bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, - float u0, float u1, unsigned int depth, const Epilog& epilog) - { -#if defined(__AVX__) - typedef vbool8 vboolx; // maximally 8-wide to work around KNL issues - typedef vint8 vintx; - typedef vfloat8 vfloatx; -#else - typedef vbool4 vboolx; - typedef vint4 vintx; - typedef vfloat4 vfloatx; -#endif - typedef Vec3 Vec3vfx; - typedef Vec4 Vec4vfx; - - unsigned int maxDepth = numBezierSubdivisions; - bool found = false; - const Vec3fa org = zero; - const Vec3fa dir = ray.dir; - - unsigned int sptr = 0; - const unsigned int stack_size = numBezierSubdivisions+1; // +1 because of unstable workaround below - struct StackEntry { - vboolx valid; - vfloatx tlower; - float u0; - float u1; - unsigned int depth; - }; - StackEntry stack[stack_size]; - goto entry; - - /* terminate if stack is empty */ - while (sptr) - { - /* pop from stack */ - { - sptr--; - vboolx valid = stack[sptr].valid; - const vfloatx tlower = stack[sptr].tlower; - valid &= tlower+dt <= ray.tfar; - if (none(valid)) continue; - u0 = stack[sptr].u0; - u1 = stack[sptr].u1; - depth = stack[sptr].depth; - const size_t i = select_min(valid,tlower); clear(valid,i); - stack[sptr].valid = valid; - if (any(valid)) sptr++; // there are still items on the stack - - /* process next segment */ - const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1))); - u0 = vu0[i+0]; - u1 = vu0[i+1]; - } - entry: - - /* subdivide curve */ - const float dscale = (u1-u0)*(1.0f/(3.0f*(vfloatx::size-1))); - const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1))); - Vec4vfx P0, dP0du; curve.veval(vu0,P0,dP0du); dP0du = dP0du * Vec4vfx(dscale); - const Vec4vfx P3 = shift_right_1(P0); - const Vec4vfx dP3du = shift_right_1(dP0du); - const Vec4vfx P1 = P0 + dP0du; - const Vec4vfx P2 = P3 - dP3du; - - /* calculate bounding cylinders */ - const vfloatx rr1 = sqr_point_to_line_distance(Vec3vfx(dP0du),Vec3vfx(P3-P0)); - const vfloatx rr2 = sqr_point_to_line_distance(Vec3vfx(dP3du),Vec3vfx(P3-P0)); - const vfloatx maxr12 = sqrt(max(rr1,rr2)); - const vfloatx one_plus_ulp = 1.0f+2.0f*float(ulp); - const vfloatx one_minus_ulp = 1.0f-2.0f*float(ulp); - vfloatx r_outer = max(P0.w,P1.w,P2.w,P3.w)+maxr12; - vfloatx r_inner = min(P0.w,P1.w,P2.w,P3.w)-maxr12; - r_outer = one_plus_ulp*r_outer; - r_inner = max(0.0f,one_minus_ulp*r_inner); - const CylinderN cylinder_outer(Vec3vfx(P0),Vec3vfx(P3),r_outer); - const CylinderN cylinder_inner(Vec3vfx(P0),Vec3vfx(P3),r_inner); - vboolx valid = true; clear(valid,vfloatx::size-1); - - /* intersect with outer cylinder */ - BBox tc_outer; vfloatx u_outer0; Vec3vfx Ng_outer0; vfloatx u_outer1; Vec3vfx Ng_outer1; - valid &= cylinder_outer.intersect(org,dir,tc_outer,u_outer0,Ng_outer0,u_outer1,Ng_outer1); - if (none(valid)) continue; - - /* intersect with cap-planes */ - BBox tp(ray.tnear()-dt,ray.tfar-dt); - tp = embree::intersect(tp,tc_outer); - BBox h0 = HalfPlaneN(Vec3vfx(P0),+Vec3vfx(dP0du)).intersect(org,dir); - tp = embree::intersect(tp,h0); - BBox h1 = HalfPlaneN(Vec3vfx(P3),-Vec3vfx(dP3du)).intersect(org,dir); - tp = embree::intersect(tp,h1); - valid &= tp.lower <= tp.upper; - if (none(valid)) continue; - - /* clamp and correct u parameter */ - u_outer0 = clamp(u_outer0,vfloatx(0.0f),vfloatx(1.0f)); - u_outer1 = clamp(u_outer1,vfloatx(0.0f),vfloatx(1.0f)); - u_outer0 = lerp(u0,u1,(vfloatx(step)+u_outer0)*(1.0f/float(vfloatx::size))); - u_outer1 = lerp(u0,u1,(vfloatx(step)+u_outer1)*(1.0f/float(vfloatx::size))); - - /* intersect with inner cylinder */ - BBox tc_inner; - vfloatx u_inner0 = zero; Vec3vfx Ng_inner0 = zero; vfloatx u_inner1 = zero; Vec3vfx Ng_inner1 = zero; - const vboolx valid_inner = cylinder_inner.intersect(org,dir,tc_inner,u_inner0,Ng_inner0,u_inner1,Ng_inner1); - - /* at the unstable area we subdivide deeper */ - const vboolx unstable0 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner0)) < 0.3f); - const vboolx unstable1 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner1)) < 0.3f); - - /* subtract the inner interval from the current hit interval */ - BBox tp0, tp1; - subtract(tp,tc_inner,tp0,tp1); - vboolx valid0 = valid & (tp0.lower <= tp0.upper); - vboolx valid1 = valid & (tp1.lower <= tp1.upper); - if (none(valid0 | valid1)) continue; - - /* iterate over all first hits front to back */ - const vintx termDepth0 = select(unstable0,vintx(maxDepth+1),vintx(maxDepth)); - vboolx recursion_valid0 = valid0 & (depth < termDepth0); - valid0 &= depth >= termDepth0; - - while (any(valid0)) - { - const size_t i = select_min(valid0,tp0.lower); clear(valid0,i); - found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer0[i],tp0.lower[i],epilog); - //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer0,tp0,h0,h1,Ng_outer0,dP0du,dP3du,epilog); - valid0 &= tp0.lower+dt <= ray.tfar; - } - valid1 &= tp1.lower+dt <= ray.tfar; - - /* iterate over all second hits front to back */ - const vintx termDepth1 = select(unstable1,vintx(maxDepth+1),vintx(maxDepth)); - vboolx recursion_valid1 = valid1 & (depth < termDepth1); - valid1 &= depth >= termDepth1; - while (any(valid1)) - { - const size_t i = select_min(valid1,tp1.lower); clear(valid1,i); - found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer1[i],tp1.upper[i],epilog); - //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer1,tp1,h0,h1,Ng_outer1,dP0du,dP3du,epilog); - valid1 &= tp1.lower+dt <= ray.tfar; - } - - /* push valid segments to stack */ - recursion_valid0 &= tp0.lower+dt <= ray.tfar; - recursion_valid1 &= tp1.lower+dt <= ray.tfar; - const vboolx recursion_valid = recursion_valid0 | recursion_valid1; - if (any(recursion_valid)) - { - assert(sptr < stack_size); - stack[sptr].valid = recursion_valid; - stack[sptr].tlower = select(recursion_valid0,tp0.lower,tp1.lower); - stack[sptr].u0 = u0; - stack[sptr].u1 = u1; - stack[sptr].depth = depth+1; - sptr++; - } - } - return found; - } - - template class NativeCurve> - struct SweepCurve1Intersector1 - { - typedef NativeCurve NativeCurve3ff; - - template - __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, - IntersectContext* context, - const CurveGeometry* geom, const unsigned int primID, - const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, - const Epilog& epilog) - { - STAT3(normal.trav_prims,1,1,1); - - /* move ray closer to make intersection stable */ - NativeCurve3ff curve0(v0,v1,v2,v3); - curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0); - const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); - const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); - const NativeCurve3ff curve1 = curve0-ref; - return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); - } - }; - - template class NativeCurve, int K> - struct SweepCurve1IntersectorK - { - typedef NativeCurve NativeCurve3ff; - - struct Ray1 - { - __forceinline Ray1(RayK& ray, size_t k) - : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {} - - Vec3fa org; - Vec3fa dir; - float _tnear; - float& tfar; - - __forceinline float& tnear() { return _tnear; } - //__forceinline float& tfar() { return _tfar; } - __forceinline const float& tnear() const { return _tnear; } - //__forceinline const float& tfar() const { return _tfar; } - - }; - - template - __forceinline bool intersect(const CurvePrecalculationsK& pre, RayK& vray, size_t k, - IntersectContext* context, - const CurveGeometry* geom, const unsigned int primID, - const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, - const Epilog& epilog) - { - STAT3(normal.trav_prims,1,1,1); - Ray1 ray(vray,k); - - /* move ray closer to make intersection stable */ - NativeCurve3ff curve0(v0,v1,v2,v3); - curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0); - const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); - const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); - const NativeCurve3ff curve1 = curve0-ref; - return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h deleted file mode 100644 index e1f4238130..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h +++ /dev/null @@ -1,671 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" -#include "../subdiv/bezier_curve.h" -#include "../common/primref.h" -#include "curve_intersector_precalculations.h" -#include "../bvh/node_intersector1.h" -#include "../bvh/node_intersector_packet.h" - -#include "intersector_epilog.h" - -#include "../subdiv/bezier_curve.h" -#include "../subdiv/bspline_curve.h" -#include "../subdiv/hermite_curve.h" -#include "../subdiv/catmullrom_curve.h" - -#include "spherei_intersector.h" -#include "disci_intersector.h" - -#include "linei_intersector.h" -#include "roundlinei_intersector.h" -#include "conelinei_intersector.h" - -#include "curveNi_intersector.h" -#include "curveNv_intersector.h" -#include "curveNi_mb_intersector.h" - -#include "curve_intersector_distance.h" -#include "curve_intersector_ribbon.h" -#include "curve_intersector_oriented.h" -#include "curve_intersector_sweep.h" - -namespace embree -{ - struct VirtualCurveIntersector - { - typedef void (*Intersect1Ty)(void* pre, void* ray, IntersectContext* context, const void* primitive); - typedef bool (*Occluded1Ty )(void* pre, void* ray, IntersectContext* context, const void* primitive); - - typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); - typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); - - typedef void (*Intersect8Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); - typedef bool (*Occluded8Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); - - typedef void (*Intersect16Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); - typedef bool (*Occluded16Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); - - public: - struct Intersectors - { - Intersectors() {} // WARNING: Do not zero initialize this, as we otherwise get problems with thread unsafe local static variable initialization (e.g. on VS2013) in curve_intersector_virtual.cpp. - - template void intersect(void* pre, void* ray, IntersectContext* context, const void* primitive); - template bool occluded (void* pre, void* ray, IntersectContext* context, const void* primitive); - - template void intersect(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); - template bool occluded (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); - - public: - Intersect1Ty intersect1; - Occluded1Ty occluded1; - Intersect4Ty intersect4; - Occluded4Ty occluded4; - Intersect8Ty intersect8; - Occluded8Ty occluded8; - Intersect16Ty intersect16; - Occluded16Ty occluded16; - }; - - Intersectors vtbl[Geometry::GTY_END]; - }; - - template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(intersect1); intersect1(pre,ray,context,primitive); } - template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(occluded1); return occluded1(pre,ray,context,primitive); } - - template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect4); intersect4(pre,ray,k,context,primitive); } - template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded4); return occluded4(pre,ray,k,context,primitive); } - -#if defined(__AVX__) - template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect8); intersect8(pre,ray,k,context,primitive); } - template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded8); return occluded8(pre,ray,k,context,primitive); } -#endif - -#if defined(__AVX512F__) - template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect16); intersect16(pre,ray,k,context,primitive); } - template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded16); return occluded16(pre,ray,k,context,primitive); } -#endif - - namespace isa - { - struct VirtualCurveIntersector1 - { - typedef unsigned char Primitive; - typedef CurvePrecalculations1 Precalculations; - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; - leafIntersector.intersect<1>(&pre,&ray,context,prim); - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; - return leafIntersector.occluded<1>(&pre,&ray,context,prim); - } - }; - - template - struct VirtualCurveIntersectorK - { - typedef unsigned char Primitive; - typedef CurvePrecalculationsK Precalculations; - - template - static __forceinline void intersect(const vbool& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; - size_t mask = movemask(valid_i); - while (mask) leafIntersector.intersect(&pre,&ray,bscf(mask),context,prim); - } - - template - static __forceinline vbool occluded(const vbool& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; - vbool valid_o = false; - size_t mask = movemask(valid_i); - while (mask) { - size_t k = bscf(mask); - if (leafIntersector.occluded(&pre,&ray,k,context,prim)) - set(valid_o, k); - } - return valid_o; - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; - leafIntersector.intersect(&pre,&ray,k,context,prim); - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - assert(num == 1); - RTCGeometryType ty = (RTCGeometryType)(*prim); - assert(This->leafIntersector); - VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; - return leafIntersector.occluded(&pre,&ray,k,context,prim); - } - }; - - template - static VirtualCurveIntersector::Intersectors LinearRoundConeNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors LinearConeNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors LinearRoundConeNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiMBIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiMBIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiMBIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiMBIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiMBIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiMBIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiMBIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiMBIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors LinearConeNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiMBIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiMBIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiMBIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiMBIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiMBIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiMBIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiMBIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiMBIntersectorK::occluded; -#endif - return intersectors; - } - - - template - static VirtualCurveIntersector::Intersectors LinearRibbonNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors LinearRibbonNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiMBIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiMBIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiMBIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiMBIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiMBIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiMBIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiMBIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiMBIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors SphereNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors SphereNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiMBIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiMBIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiMBIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiMBIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiMBIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiMBIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiMBIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiMBIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors DiscNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors DiscNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiMBIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiMBIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiMBIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiMBIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiMBIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiMBIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiMBIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiMBIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors OrientedDiscNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiIntersectorK::occluded; -#endif - return intersectors; - } - - template - static VirtualCurveIntersector::Intersectors OrientedDiscNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiMBIntersector1::intersect; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiMBIntersector1::occluded; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiMBIntersectorK::intersect; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiMBIntersectorK::occluded; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiMBIntersectorK::intersect; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiMBIntersectorK::occluded; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiMBIntersectorK::intersect; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiMBIntersectorK::occluded; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors RibbonNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_t, Intersect1EpilogMU >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_t , Occluded1EpilogMU >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK::template intersect_t, Intersect1KEpilogMU >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilogMU >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilogMU >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilogMU >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilogMU >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilogMU >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors RibbonNvIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1::template intersect_t, Intersect1EpilogMU >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1::template occluded_t , Occluded1EpilogMU >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNvIntersectorK::template intersect_t, Intersect1KEpilogMU >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilogMU >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilogMU >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilogMU >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilogMU >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilogMU >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors RibbonNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_t, Intersect1EpilogMU >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_t , Occluded1EpilogMU >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilogMU >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilogMU >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilogMU >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilogMU >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilogMU >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilogMU >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors CurveNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_t, Intersect1Epilog1 >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_t , Occluded1Epilog1 >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilog1<4,true> >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilog1<4,true> >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilog1<8,true> >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilog1<8,true> >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilog1<16,true> >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilog1<16,true> >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors CurveNvIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1::template intersect_t, Intersect1Epilog1 >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1::template occluded_t , Occluded1Epilog1 >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilog1<4,true> >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilog1<4,true> >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilog1<8,true> >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilog1<8,true> >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilog1<16,true> >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilog1<16,true> >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors CurveNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_t, Intersect1Epilog1 >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_t , Occluded1Epilog1 >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilog1<4,true> >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilog1<4,true> >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilog1<8,true> >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilog1<8,true> >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilog1<16,true> >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilog1<16,true> >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors OrientedCurveNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_n, Intersect1Epilog1 >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_n , Occluded1Epilog1 >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_n, Intersect1KEpilog1<4,true> >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_n , Occluded1KEpilog1<4,true> >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_n, Intersect1KEpilog1<8,true> >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_n , Occluded1KEpilog1<8,true> >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_n, Intersect1KEpilog1<16,true> >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_n , Occluded1KEpilog1<16,true> >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors OrientedCurveNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_n, Intersect1Epilog1 >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_n , Occluded1Epilog1 >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_n, Intersect1KEpilog1<4,true> >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_n , Occluded1KEpilog1<4,true> >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_n, Intersect1KEpilog1<8,true> >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_n , Occluded1KEpilog1<8,true> >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_n, Intersect1KEpilog1<16,true> >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_n , Occluded1KEpilog1<16,true> >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors HermiteRibbonNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_h, Intersect1EpilogMU >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_h , Occluded1EpilogMU >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilogMU >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilogMU >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilogMU >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilogMU >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilogMU >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilogMU >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors HermiteRibbonNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_h, Intersect1EpilogMU >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_h , Occluded1EpilogMU >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilogMU >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilogMU >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilogMU >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilogMU >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilogMU >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilogMU >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors HermiteCurveNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_h, Intersect1Epilog1 >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_h , Occluded1Epilog1 >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilog1<4,true> >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilog1<4,true> >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilog1<8,true> >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilog1<8,true> >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilog1<16,true> >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilog1<16,true> >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors HermiteCurveNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_h, Intersect1Epilog1 >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_h , Occluded1Epilog1 >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilog1<4,true> >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilog1<4,true> >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilog1<8,true> >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilog1<8,true> >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilog1<16,true> >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilog1<16,true> >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_hn, Intersect1Epilog1 >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_hn , Occluded1Epilog1 >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_hn, Intersect1KEpilog1<4,true> >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_hn , Occluded1KEpilog1<4,true> >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_hn, Intersect1KEpilog1<8,true> >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_hn , Occluded1KEpilog1<8,true> >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_hn, Intersect1KEpilog1<16,true> >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_hn , Occluded1KEpilog1<16,true> >; -#endif - return intersectors; - } - - template class Curve, int N> - static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiMBIntersectors() - { - VirtualCurveIntersector::Intersectors intersectors; - intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_hn, Intersect1Epilog1 >; - intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_hn , Occluded1Epilog1 >; - intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_hn, Intersect1KEpilog1<4,true> >; - intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_hn , Occluded1KEpilog1<4,true> >; -#if defined(__AVX__) - intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_hn, Intersect1KEpilog1<8,true> >; - intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_hn , Occluded1KEpilog1<8,true> >; -#endif -#if defined(__AVX512F__) - intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_hn, Intersect1KEpilog1<16,true> >; - intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_hn , Occluded1KEpilog1<16,true> >; -#endif - return intersectors; - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h deleted file mode 100644 index 69cf612275..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2020 Light Transport Entertainment Inc. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curve_intersector_virtual.h" - -namespace embree -{ - namespace isa - { - void AddVirtualCurveBezierCurveInterector4i(VirtualCurveIntersector &prim); - void AddVirtualCurveBezierCurveInterector4v(VirtualCurveIntersector &prim); - void AddVirtualCurveBezierCurveInterector4iMB(VirtualCurveIntersector &prim); -#if defined(__AVX__) - void AddVirtualCurveBezierCurveInterector8i(VirtualCurveIntersector &prim); - void AddVirtualCurveBezierCurveInterector8v(VirtualCurveIntersector &prim); - void AddVirtualCurveBezierCurveInterector8iMB(VirtualCurveIntersector &prim); -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h deleted file mode 100644 index d37e41098e..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2020 Light Transport Entertainment Inc. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curve_intersector_virtual.h" - -namespace embree -{ - namespace isa - { - void AddVirtualCurveBSplineCurveInterector4i(VirtualCurveIntersector &prim); - void AddVirtualCurveBSplineCurveInterector4v(VirtualCurveIntersector &prim); - void AddVirtualCurveBSplineCurveInterector4iMB(VirtualCurveIntersector &prim); -#if defined(__AVX__) - void AddVirtualCurveBSplineCurveInterector8i(VirtualCurveIntersector &prim); - void AddVirtualCurveBSplineCurveInterector8v(VirtualCurveIntersector &prim); - void AddVirtualCurveBSplineCurveInterector8iMB(VirtualCurveIntersector &prim); -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h deleted file mode 100644 index a133a11d63..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2020 Light Transport Entertainment Inc. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curve_intersector_virtual.h" - -namespace embree -{ - namespace isa - { - void AddVirtualCurveCatmullRomCurveInterector4i(VirtualCurveIntersector &prim); - void AddVirtualCurveCatmullRomCurveInterector4v(VirtualCurveIntersector &prim); - void AddVirtualCurveCatmullRomCurveInterector4iMB(VirtualCurveIntersector &prim); -#if defined(__AVX__) - void AddVirtualCurveCatmullRomCurveInterector8i(VirtualCurveIntersector &prim); - void AddVirtualCurveCatmullRomCurveInterector8v(VirtualCurveIntersector &prim); - void AddVirtualCurveCatmullRomCurveInterector8iMB(VirtualCurveIntersector &prim); -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h deleted file mode 100644 index 9aec35da45..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2020 Light Transport Entertainment Inc. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curve_intersector_virtual.h" - -namespace embree -{ - namespace isa - { - void AddVirtualCurveHermiteCurveInterector4i(VirtualCurveIntersector &prim); - void AddVirtualCurveHermiteCurveInterector4v(VirtualCurveIntersector &prim); - void AddVirtualCurveHermiteCurveInterector4iMB(VirtualCurveIntersector &prim); -#if defined(__AVX__) - void AddVirtualCurveHermiteCurveInterector8i(VirtualCurveIntersector &prim); - void AddVirtualCurveHermiteCurveInterector8v(VirtualCurveIntersector &prim); - void AddVirtualCurveHermiteCurveInterector8iMB(VirtualCurveIntersector &prim); -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h deleted file mode 100644 index dd37d194f5..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2020 Light Transport Entertainment Inc. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curve_intersector_virtual.h" - -namespace embree -{ - namespace isa - { - void AddVirtualCurveLinearCurveInterector4i(VirtualCurveIntersector &prim); - void AddVirtualCurveLinearCurveInterector4v(VirtualCurveIntersector &prim); - void AddVirtualCurveLinearCurveInterector4iMB(VirtualCurveIntersector &prim); -#if defined(__AVX__) - void AddVirtualCurveLinearCurveInterector8i(VirtualCurveIntersector &prim); - void AddVirtualCurveLinearCurveInterector8v(VirtualCurveIntersector &prim); - void AddVirtualCurveLinearCurveInterector8iMB(VirtualCurveIntersector &prim); -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h deleted file mode 100644 index fe5ceed840..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2020 Light Transport Entertainment Inc. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "curve_intersector_virtual.h" - -namespace embree -{ - namespace isa - { - void AddVirtualCurvePointInterector4i(VirtualCurveIntersector &prim); - void AddVirtualCurvePointInterector4v(VirtualCurveIntersector &prim); - void AddVirtualCurvePointInterector4iMB(VirtualCurveIntersector &prim); - -#if defined (__AVX__) - void AddVirtualCurvePointInterector8i(VirtualCurveIntersector &prim); - void AddVirtualCurvePointInterector8v(VirtualCurveIntersector &prim); - void AddVirtualCurvePointInterector8iMB(VirtualCurveIntersector &prim); -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/cylinder.h b/thirdparty/embree-aarch64/kernels/geometry/cylinder.h deleted file mode 100644 index 39a582864c..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/cylinder.h +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" - -namespace embree -{ - namespace isa - { - struct Cylinder - { - const Vec3fa p0; //!< start location - const Vec3fa p1; //!< end position - const float rr; //!< squared radius of cylinder - - __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float r) - : p0(p0), p1(p1), rr(sqr(r)) {} - - __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float rr, bool) - : p0(p0), p1(p1), rr(rr) {} - - __forceinline bool intersect(const Vec3fa& org, - const Vec3fa& dir, - BBox1f& t_o, - float& u0_o, Vec3fa& Ng0_o, - float& u1_o, Vec3fa& Ng1_o) const - { - /* calculate quadratic equation to solve */ - const float rl = rcp_length(p1-p0); - const Vec3fa P0 = p0, dP = (p1-p0)*rl; - const Vec3fa O = org-P0, dO = dir; - - const float dOdO = dot(dO,dO); - const float OdO = dot(dO,O); - const float OO = dot(O,O); - const float dOz = dot(dP,dO); - const float Oz = dot(dP,O); - - const float A = dOdO - sqr(dOz); - const float B = 2.0f * (OdO - dOz*Oz); - const float C = OO - sqr(Oz) - rr; - - /* we miss the cylinder if determinant is smaller than zero */ - const float D = B*B - 4.0f*A*C; - if (D < 0.0f) { - t_o = BBox1f(pos_inf,neg_inf); - return false; - } - - /* special case for rays that are parallel to the cylinder */ - const float eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); - if (abs(A) < eps) - { - if (C <= 0.0f) { - t_o = BBox1f(neg_inf,pos_inf); - return true; - } else { - t_o = BBox1f(pos_inf,neg_inf); - return false; - } - } - - /* standard case for rays that are not parallel to the cylinder */ - const float Q = sqrt(D); - const float rcp_2A = rcp(2.0f*A); - const float t0 = (-B-Q)*rcp_2A; - const float t1 = (-B+Q)*rcp_2A; - - /* calculates u and Ng for near hit */ - { - u0_o = madd(t0,dOz,Oz)*rl; - const Vec3fa Pr = t0*dir; - const Vec3fa Pl = madd(u0_o,p1-p0,p0); - Ng0_o = Pr-Pl; - } - - /* calculates u and Ng for far hit */ - { - u1_o = madd(t1,dOz,Oz)*rl; - const Vec3fa Pr = t1*dir; - const Vec3fa Pl = madd(u1_o,p1-p0,p0); - Ng1_o = Pr-Pl; - } - - t_o.lower = t0; - t_o.upper = t1; - return true; - } - - __forceinline bool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox1f& t_o) const - { - float u0_o; Vec3fa Ng0_o; - float u1_o; Vec3fa Ng1_o; - return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); - } - - static bool verify(const size_t id, const Cylinder& cylinder, const RayHit& ray, bool shouldhit, const float t0, const float t1) - { - float eps = 0.001f; - BBox1f t; bool hit; - hit = cylinder.intersect(ray.org,ray.dir,t); - - bool failed = hit != shouldhit; - if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : abs(t0-t.lower) > eps; - if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : abs(t1-t.upper) > eps; - if (!failed) return true; - embree_cout << "Cylinder test " << id << " failed: cylinder = " << cylinder << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; - return false; - } - - /* verify cylinder class */ - static bool verify() - { - bool passed = true; - const Cylinder cylinder(Vec3fa(0.0f,0.0f,0.0f),Vec3fa(1.0f,0.0f,0.0f),1.0f); - passed &= verify(0,cylinder,RayHit(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); - passed &= verify(1,cylinder,RayHit(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); - passed &= verify(2,cylinder,RayHit(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); - passed &= verify(3,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); - passed &= verify(4,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); - passed &= verify(5,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); - passed &= verify(6,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); - return passed; - } - - /*! output operator */ - friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cylinder& c) { - return cout << "Cylinder { p0 = " << c.p0 << ", p1 = " << c.p1 << ", r = " << sqrtf(c.rr) << "}"; - } - }; - - template - struct CylinderN - { - const Vec3vf p0; //!< start location - const Vec3vf p1; //!< end position - const vfloat rr; //!< squared radius of cylinder - - __forceinline CylinderN(const Vec3vf& p0, const Vec3vf& p1, const vfloat& r) - : p0(p0), p1(p1), rr(sqr(r)) {} - - __forceinline CylinderN(const Vec3vf& p0, const Vec3vf& p1, const vfloat& rr, bool) - : p0(p0), p1(p1), rr(rr) {} - - - __forceinline vbool intersect(const Vec3fa& org, const Vec3fa& dir, - BBox>& t_o, - vfloat& u0_o, Vec3vf& Ng0_o, - vfloat& u1_o, Vec3vf& Ng1_o) const - { - /* calculate quadratic equation to solve */ - const vfloat rl = rcp_length(p1-p0); - const Vec3vf P0 = p0, dP = (p1-p0)*rl; - const Vec3vf O = Vec3vf(org)-P0, dO = dir; - - const vfloat dOdO = dot(dO,dO); - const vfloat OdO = dot(dO,O); - const vfloat OO = dot(O,O); - const vfloat dOz = dot(dP,dO); - const vfloat Oz = dot(dP,O); - - const vfloat A = dOdO - sqr(dOz); - const vfloat B = 2.0f * (OdO - dOz*Oz); - const vfloat C = OO - sqr(Oz) - rr; - - /* we miss the cylinder if determinant is smaller than zero */ - const vfloat D = B*B - 4.0f*A*C; - vbool valid = D >= 0.0f; - if (none(valid)) { - t_o = BBox>(empty); - return valid; - } - - /* standard case for rays that are not parallel to the cylinder */ - const vfloat Q = sqrt(D); - const vfloat rcp_2A = rcp(2.0f*A); - const vfloat t0 = (-B-Q)*rcp_2A; - const vfloat t1 = (-B+Q)*rcp_2A; - - /* calculates u and Ng for near hit */ - { - u0_o = madd(t0,dOz,Oz)*rl; - const Vec3vf Pr = t0*Vec3vf(dir); - const Vec3vf Pl = madd(u0_o,p1-p0,p0); - Ng0_o = Pr-Pl; - } - - /* calculates u and Ng for far hit */ - { - u1_o = madd(t1,dOz,Oz)*rl; - const Vec3vf Pr = t1*Vec3vf(dir); - const Vec3vf Pl = madd(u1_o,p1-p0,p0); - Ng1_o = Pr-Pl; - } - - t_o.lower = select(valid, t0, vfloat(pos_inf)); - t_o.upper = select(valid, t1, vfloat(neg_inf)); - - /* special case for rays that are parallel to the cylinder */ - const vfloat eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); - vbool validt = valid & (abs(A) < eps); - if (unlikely(any(validt))) - { - vbool inside = C <= 0.0f; - t_o.lower = select(validt,select(inside,vfloat(neg_inf),vfloat(pos_inf)),t_o.lower); - t_o.upper = select(validt,select(inside,vfloat(pos_inf),vfloat(neg_inf)),t_o.upper); - valid &= !validt | inside; - } - return valid; - } - - __forceinline vbool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox>& t_o) const - { - vfloat u0_o; Vec3vf Ng0_o; - vfloat u1_o; Vec3vf Ng1_o; - return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); - } - }; - } -} - diff --git a/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h deleted file mode 100644 index e8305780e5..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "../common/scene_points.h" -#include "curve_intersector_precalculations.h" - -namespace embree -{ - namespace isa - { - template - struct DiscIntersectorHitM - { - __forceinline DiscIntersectorHitM() {} - - __forceinline DiscIntersectorHitM(const vfloat& u, const vfloat& v, const vfloat& t, const Vec3vf& Ng) - : vu(u), vv(v), vt(t), vNg(Ng) - { - } - - __forceinline void finalize() {} - - __forceinline Vec2f uv(const size_t i) const - { - return Vec2f(vu[i], vv[i]); - } - __forceinline float t(const size_t i) const - { - return vt[i]; - } - __forceinline Vec3fa Ng(const size_t i) const - { - return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); - } - - public: - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - }; - - template - struct DiscIntersector1 - { - typedef CurvePrecalculations1 Precalculations; - - template - static __forceinline bool intersect( - const vbool& valid_i, - Ray& ray, - IntersectContext* context, - const Points* geom, - const Precalculations& pre, - const Vec4vf& v0i, - const Epilog& epilog) - { - vbool valid = valid_i; - - const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); - const Vec3vf ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); - const vfloat rd2 = rcp(dot(ray_dir, ray_dir)); - - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec3vf center = v0.xyz(); - const vfloat radius = v0.w; - - const Vec3vf c0 = center - ray_org; - const vfloat projC0 = dot(c0, ray_dir) * rd2; - - valid &= (vfloat(ray.tnear()) <= projC0) & (projC0 <= vfloat(ray.tfar)); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale; // ignore self intersections - if (unlikely(none(valid))) - return false; - - const Vec3vf perp = c0 - projC0 * ray_dir; - const vfloat l2 = dot(perp, perp); - const vfloat r2 = radius * radius; - valid &= (l2 <= r2); - if (unlikely(none(valid))) - return false; - - DiscIntersectorHitM hit(zero, zero, projC0, -ray_dir); - return epilog(valid, hit); - } - - template - static __forceinline bool intersect(const vbool& valid_i, - Ray& ray, - IntersectContext* context, - const Points* geom, - const Precalculations& pre, - const Vec4vf& v0i, - const Vec3vf& normal, - const Epilog& epilog) - { - vbool valid = valid_i; - const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); - - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec3vf center = v0.xyz(); - const vfloat radius = v0.w; - - vfloat divisor = dot(Vec3vf((Vec3fa)ray.dir), normal); - const vbool parallel = divisor == vfloat(0.f); - valid &= !parallel; - divisor = select(parallel, 1.f, divisor); // prevent divide by zero - - vfloat t = dot(center - Vec3vf((Vec3fa)ray.org), Vec3vf(normal)) / divisor; - - valid &= (vfloat(ray.tnear()) <= t) & (t <= vfloat(ray.tfar)); - if (unlikely(none(valid))) - return false; - - Vec3vf intersection = Vec3vf((Vec3fa)ray.org) + Vec3vf((Vec3fa)ray.dir) * t; - vfloat dist2 = dot(intersection - center, intersection - center); - valid &= dist2 < radius * radius; - if (unlikely(none(valid))) - return false; - - DiscIntersectorHitM hit(zero, zero, t, normal); - return epilog(valid, hit); - } - }; - - template - struct DiscIntersectorK - { - typedef CurvePrecalculationsK Precalculations; - - template - static __forceinline bool intersect(const vbool& valid_i, - RayK& ray, - size_t k, - IntersectContext* context, - const Points* geom, - const Precalculations& pre, - const Vec4vf& v0i, - const Epilog& epilog) - { - vbool valid = valid_i; - - const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); - const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); - const vfloat rd2 = rcp(dot(ray_dir, ray_dir)); - - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec3vf center = v0.xyz(); - const vfloat radius = v0.w; - - const Vec3vf c0 = center - ray_org; - const vfloat projC0 = dot(c0, ray_dir) * rd2; - - valid &= (vfloat(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat(ray.tfar[k])); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k]; // ignore self intersections - if (unlikely(none(valid))) - return false; - - const Vec3vf perp = c0 - projC0 * ray_dir; - const vfloat l2 = dot(perp, perp); - const vfloat r2 = radius * radius; - valid &= (l2 <= r2); - if (unlikely(none(valid))) - return false; - - DiscIntersectorHitM hit(zero, zero, projC0, -ray_dir); - return epilog(valid, hit); - } - - template - static __forceinline bool intersect(const vbool& valid_i, - RayK& ray, - size_t k, - IntersectContext* context, - const Points* geom, - const Precalculations& pre, - const Vec4vf& v0i, - const Vec3vf& normal, - const Epilog& epilog) - { - vbool valid = valid_i; - const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); - const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); - - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec3vf center = v0.xyz(); - const vfloat radius = v0.w; - - vfloat divisor = dot(Vec3vf(ray_dir), normal); - const vbool parallel = divisor == vfloat(0.f); - valid &= !parallel; - divisor = select(parallel, 1.f, divisor); // prevent divide by zero - - vfloat t = dot(center - Vec3vf(ray_org), Vec3vf(normal)) / divisor; - - valid &= (vfloat(ray.tnear()[k]) <= t) & (t <= vfloat(ray.tfar[k])); - if (unlikely(none(valid))) - return false; - - Vec3vf intersection = Vec3vf(ray_org) + Vec3vf(ray_dir) * t; - vfloat dist2 = dot(intersection - center, intersection - center); - valid &= dist2 < radius * radius; - if (unlikely(none(valid))) - return false; - - DiscIntersectorHitM hit(zero, zero, t, normal); - return epilog(valid, hit); - } - }; - } // namespace isa -} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h deleted file mode 100644 index e1dc3aa98e..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "disc_intersector.h" -#include "intersector_epilog.h" -#include "pointi.h" - -namespace embree -{ - namespace isa - { - template - struct DiscMiIntersector1 - { - typedef PointMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, - RayHit& ray, - IntersectContext* context, - const Primitive& Disc) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Disc.gather(v0, geom); - const vbool valid = Disc.template valid(); - DiscIntersector1::intersect( - valid, ray, context, geom, pre, v0, Intersect1EpilogM(ray, context, Disc.geomID(), Disc.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, - Ray& ray, - IntersectContext* context, - const Primitive& Disc) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Disc.gather(v0, geom); - const vbool valid = Disc.template valid(); - return DiscIntersector1::intersect( - valid, ray, context, geom, pre, v0, Occluded1EpilogM(ray, context, Disc.geomID(), Disc.primID())); - } - }; - - template - struct DiscMiMBIntersector1 - { - typedef PointMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, - RayHit& ray, - IntersectContext* context, - const Primitive& Disc) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Disc.gather(v0, geom, ray.time()); - const vbool valid = Disc.template valid(); - DiscIntersector1::intersect( - valid, ray, context, geom, pre, v0, Intersect1EpilogM(ray, context, Disc.geomID(), Disc.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, - Ray& ray, - IntersectContext* context, - const Primitive& Disc) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Disc.gather(v0, geom, ray.time()); - const vbool valid = Disc.template valid(); - return DiscIntersector1::intersect( - valid, ray, context, geom, pre, v0, Occluded1EpilogM(ray, context, Disc.geomID(), Disc.primID())); - } - }; - - template - struct DiscMiIntersectorK - { - typedef PointMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect( - const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& Disc) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Disc.gather(v0, geom); - const vbool valid = Disc.template valid(); - DiscIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, - Intersect1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); - } - - static __forceinline bool occluded( - const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& Disc) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Disc.gather(v0, geom); - const vbool valid = Disc.template valid(); - return DiscIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, - Occluded1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); - } - }; - - template - struct DiscMiMBIntersectorK - { - typedef PointMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect( - const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& Disc) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Disc.gather(v0, geom, ray.time()[k]); - const vbool valid = Disc.template valid(); - DiscIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, - Intersect1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); - } - - static __forceinline bool occluded( - const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& Disc) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Disc.gather(v0, geom, ray.time()[k]); - const vbool valid = Disc.template valid(); - return DiscIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, Occluded1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); - } - }; - - template - struct OrientedDiscMiIntersector1 - { - typedef PointMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, - RayHit& ray, - IntersectContext* context, - const Primitive& Disc) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Vec3vf n0; - Disc.gather(v0, n0, geom); - const vbool valid = Disc.template valid(); - DiscIntersector1::intersect( - valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM(ray, context, Disc.geomID(), Disc.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, - Ray& ray, - IntersectContext* context, - const Primitive& Disc) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Vec3vf n0; - Disc.gather(v0, n0, geom); - const vbool valid = Disc.template valid(); - return DiscIntersector1::intersect( - valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM(ray, context, Disc.geomID(), Disc.primID())); - } - }; - - template - struct OrientedDiscMiMBIntersector1 - { - typedef PointMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, - RayHit& ray, - IntersectContext* context, - const Primitive& Disc) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Vec3vf n0; - Disc.gather(v0, n0, geom, ray.time()); - const vbool valid = Disc.template valid(); - DiscIntersector1::intersect( - valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM(ray, context, Disc.geomID(), Disc.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, - Ray& ray, - IntersectContext* context, - const Primitive& Disc) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Vec3vf n0; - Disc.gather(v0, n0, geom, ray.time()); - const vbool valid = Disc.template valid(); - return DiscIntersector1::intersect( - valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM(ray, context, Disc.geomID(), Disc.primID())); - } - }; - - template - struct OrientedDiscMiIntersectorK - { - typedef PointMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect( - const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& Disc) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Vec3vf n0; - Disc.gather(v0, n0, geom); - const vbool valid = Disc.template valid(); - DiscIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, n0, - Intersect1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); - } - - static __forceinline bool occluded( - const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& Disc) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Vec3vf n0; - Disc.gather(v0, n0, geom); - const vbool valid = Disc.template valid(); - return DiscIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, n0, - Occluded1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); - } - }; - - template - struct OrientedDiscMiMBIntersectorK - { - typedef PointMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect( - const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& Disc) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Vec3vf n0; - Disc.gather(v0, n0, geom, ray.time()[k]); - const vbool valid = Disc.template valid(); - DiscIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, n0, - Intersect1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); - } - - static __forceinline bool occluded( - const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& Disc) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(Disc.geomID()); - Vec4vf v0; Vec3vf n0; - Disc.gather(v0, n0, geom, ray.time()[k]); - const vbool valid = Disc.template valid(); - return DiscIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, n0, - Occluded1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); - } - }; - } // namespace isa -} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/filter.h b/thirdparty/embree-aarch64/kernels/geometry/filter.h deleted file mode 100644 index 4cdf7a395a..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/filter.h +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/geometry.h" -#include "../common/ray.h" -#include "../common/hit.h" -#include "../common/context.h" - -namespace embree -{ - namespace isa - { - __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) - { - if (geometry->intersectionFilterN) - { - assert(context->scene->hasGeometryFilterFunction()); - geometry->intersectionFilterN(args); - - if (args->valid[0] == 0) - return false; - } - - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(args); - - if (args->valid[0] == 0) - return false; - } - - copyHitToRay(*(RayHit*)args->ray,*(Hit*)args->hit); - return true; - } - - __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, IntersectContext* context, Hit& hit) - { - RTCFilterFunctionNArguments args; - int mask = -1; - args.valid = &mask; - args.geometryUserPtr = geometry->userPtr; - args.context = context->user; - args.ray = (RTCRayN*)&ray; - args.hit = (RTCHitN*)&hit; - args.N = 1; - return runIntersectionFilter1Helper(&args,geometry,context); - } - - __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) - { -#if defined(EMBREE_FILTER_FUNCTION) - IntersectContext* MAYBE_UNUSED context = args->internal_context; - const Geometry* const geometry = args->geometry; - if (geometry->intersectionFilterN) { - assert(context->scene->hasGeometryFilterFunction()); - geometry->intersectionFilterN(filter_args); - } - - //if (args->valid[0] == 0) - // return; - - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(filter_args); - } -#endif - } - - __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) - { - if (geometry->occlusionFilterN) - { - assert(context->scene->hasGeometryFilterFunction()); - geometry->occlusionFilterN(args); - - if (args->valid[0] == 0) - return false; - } - - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(args); - - if (args->valid[0] == 0) - return false; - } - return true; - } - - __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, IntersectContext* context, Hit& hit) - { - RTCFilterFunctionNArguments args; - int mask = -1; - args.valid = &mask; - args.geometryUserPtr = geometry->userPtr; - args.context = context->user; - args.ray = (RTCRayN*)&ray; - args.hit = (RTCHitN*)&hit; - args.N = 1; - return runOcclusionFilter1Helper(&args,geometry,context); - } - - __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) - { -#if defined(EMBREE_FILTER_FUNCTION) - IntersectContext* MAYBE_UNUSED context = args->internal_context; - const Geometry* const geometry = args->geometry; - if (geometry->occlusionFilterN) { - assert(context->scene->hasGeometryFilterFunction()); - geometry->occlusionFilterN(filter_args); - } - - //if (args->valid[0] == 0) - // return false; - - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(filter_args); - } -#endif - } - - template - __forceinline vbool runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) - { - vint* mask = (vint*) args->valid; - if (geometry->intersectionFilterN) - { - assert(context->scene->hasGeometryFilterFunction()); - geometry->intersectionFilterN(args); - } - - vbool valid_o = *mask != vint(zero); - if (none(valid_o)) return valid_o; - - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(args); - } - - valid_o = *mask != vint(zero); - if (none(valid_o)) return valid_o; - - copyHitToRay(valid_o,*(RayHitK*)args->ray,*(HitK*)args->hit); - return valid_o; - } - - template - __forceinline vbool runIntersectionFilter(const vbool& valid, const Geometry* const geometry, RayHitK& ray, IntersectContext* context, HitK& hit) - { - RTCFilterFunctionNArguments args; - vint mask = valid.mask32(); - args.valid = (int*)&mask; - args.geometryUserPtr = geometry->userPtr; - args.context = context->user; - args.ray = (RTCRayN*)&ray; - args.hit = (RTCHitN*)&hit; - args.N = K; - return runIntersectionFilterHelper(&args,geometry,context); - } - - template - __forceinline vbool runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) - { - vint* mask = (vint*) args->valid; - if (geometry->occlusionFilterN) - { - assert(context->scene->hasGeometryFilterFunction()); - geometry->occlusionFilterN(args); - } - - vbool valid_o = *mask != vint(zero); - - if (none(valid_o)) return valid_o; - - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(args); - } - - valid_o = *mask != vint(zero); - - RayK* ray = (RayK*) args->ray; - ray->tfar = select(valid_o, vfloat(neg_inf), ray->tfar); - return valid_o; - } - - template - __forceinline vbool runOcclusionFilter(const vbool& valid, const Geometry* const geometry, RayK& ray, IntersectContext* context, HitK& hit) - { - RTCFilterFunctionNArguments args; - vint mask = valid.mask32(); - args.valid = (int*)&mask; - args.geometryUserPtr = geometry->userPtr; - args.context = context->user; - args.ray = (RTCRayN*)&ray; - args.hit = (RTCHitN*)&hit; - args.N = K; - return runOcclusionFilterHelper(&args,geometry,context); - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h deleted file mode 100644 index 46a0af0827..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "grid_soa.h" -#include "grid_soa_intersector1.h" -#include "grid_soa_intersector_packet.h" -#include "../common/ray.h" - -namespace embree -{ - namespace isa - { - template - class SubdivPatch1Precalculations : public T - { - public: - __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) - : T(ray,ptr) {} - }; - - template - class SubdivPatch1PrecalculationsK : public T - { - public: - __forceinline SubdivPatch1PrecalculationsK (const vbool& valid, RayK& ray) - : T(valid,ray) {} - }; - - class Grid1Intersector1 - { - public: - typedef GridSOA Primitive; - typedef Grid1Precalculations Precalculations; - - /*! Intersect a ray with the primitive. */ - static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) - { - GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node); - } - static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { - intersect(pre,ray,context,prim,ty,lazy_node); - } - - /*! Test if the ray is occluded by the primitive */ - static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) - { - GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node); - } - static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { - return occluded(pre,ray,context,prim,ty,lazy_node); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) { - assert(false && "not implemented"); - return false; - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { - assert(false && "not implemented"); - return false; - } - }; - - template - struct GridIntersectorK - { - typedef GridSOA Primitive; - typedef SubdivPatch1PrecalculationsK::Precalculations> Precalculations; - - - static __forceinline void intersect(const vbool& valid, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) - { - GridSOAIntersectorK::intersect(valid,pre,ray,context,prim,lazy_node); - } - - static __forceinline vbool occluded(const vbool& valid, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) - { - GridSOAIntersectorK::occluded(valid,pre,ray,context,prim,lazy_node); - } - - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) - { - GridSOAIntersectorK::intersect(pre,ray,k,context,prim,lazy_node); - } - - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) - { - GridSOAIntersectorK::occluded(pre,ray,k,context,prim,lazy_node); - } - }; - - typedef Grid1IntersectorK<4> SubdivPatch1Intersector4; - typedef Grid1IntersectorK<8> SubdivPatch1Intersector8; - typedef Grid1IntersectorK<16> SubdivPatch1Intersector16; - - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h deleted file mode 100644 index d3b275586c..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h +++ /dev/null @@ -1,275 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "../common/scene_subdiv_mesh.h" -#include "../bvh/bvh.h" -#include "../subdiv/tessellation.h" -#include "../subdiv/tessellation_cache.h" -#include "subdivpatch1.h" - -namespace embree -{ - namespace isa - { - class GridSOA - { - public: - - /*! GridSOA constructor */ - GridSOA(const SubdivPatch1Base* patches, const unsigned time_steps, - const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, - const SubdivMesh* const geom, const size_t totalBvhBytes, const size_t gridBytes, BBox3fa* bounds_o = nullptr); - - /*! Subgrid creation */ - template - static GridSOA* create(const SubdivPatch1Base* patches, const unsigned time_steps, - unsigned x0, unsigned x1, unsigned y0, unsigned y1, - const Scene* scene, Allocator& alloc, BBox3fa* bounds_o = nullptr) - { - const unsigned width = x1-x0+1; - const unsigned height = y1-y0+1; - const GridRange range(0,width-1,0,height-1); - size_t bvhBytes = 0; - if (time_steps == 1) - bvhBytes = getBVHBytes(range,sizeof(BVH4::AABBNode),0); - else { - bvhBytes = (time_steps-1)*getBVHBytes(range,sizeof(BVH4::AABBNodeMB),0); - bvhBytes += getTemporalBVHBytes(make_range(0,int(time_steps-1)),sizeof(BVH4::AABBNodeMB4D)); - } - const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float); - size_t rootBytes = time_steps*sizeof(BVH4::NodeRef); -#if !defined(__X86_64__) && !defined(__aarch64__) - rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding. -#endif - void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes); - assert(data); - return new (data) GridSOA(patches,time_steps,x0,x1,y0,y1,patches->grid_u_res,patches->grid_v_res,scene->get(patches->geomID()),bvhBytes,gridBytes,bounds_o); - } - - /*! Grid creation */ - template - static GridSOA* create(const SubdivPatch1Base* const patches, const unsigned time_steps, - const Scene* scene, const Allocator& alloc, BBox3fa* bounds_o = nullptr) - { - return create(patches,time_steps,0,patches->grid_u_res-1,0,patches->grid_v_res-1,scene,alloc,bounds_o); - } - - /*! returns reference to root */ - __forceinline BVH4::NodeRef& root(size_t t = 0) { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; } - __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; } - - /*! returns pointer to BVH array */ - __forceinline int8_t* bvhData() { return &data[0]; } - __forceinline const int8_t* bvhData() const { return &data[0]; } - - /*! returns pointer to Grid array */ - __forceinline float* gridData(size_t t = 0) { return (float*) &data[gridOffset + t*gridBytes]; } - __forceinline const float* gridData(size_t t = 0) const { return (float*) &data[gridOffset + t*gridBytes]; } - - __forceinline void* encodeLeaf(size_t u, size_t v) { - return (void*) (16*(v * width + u + 1)); // +1 to not create empty leaf - } - __forceinline float* decodeLeaf(size_t t, const void* ptr) { - return gridData(t) + (((size_t) (ptr) >> 4) - 1); - } - - /*! returns the size of the BVH over the grid in bytes */ - static size_t getBVHBytes(const GridRange& range, const size_t nodeBytes, const size_t leafBytes); - - /*! returns the size of the temporal BVH over the time range BVHs */ - static size_t getTemporalBVHBytes(const range time_range, const size_t nodeBytes); - - /*! calculates bounding box of grid range */ - __forceinline BBox3fa calculateBounds(size_t time, const GridRange& range) const - { - const float* const grid_array = gridData(time); - const float* const grid_x_array = grid_array + 0 * dim_offset; - const float* const grid_y_array = grid_array + 1 * dim_offset; - const float* const grid_z_array = grid_array + 2 * dim_offset; - - /* compute the bounds just for the range! */ - BBox3fa bounds( empty ); - for (unsigned v = range.v_start; v<=range.v_end; v++) - { - for (unsigned u = range.u_start; u<=range.u_end; u++) - { - const float x = grid_x_array[ v * width + u]; - const float y = grid_y_array[ v * width + u]; - const float z = grid_z_array[ v * width + u]; - bounds.extend( Vec3fa(x,y,z) ); - } - } - assert(is_finite(bounds)); - return bounds; - } - - /*! Evaluates grid over patch and builds BVH4 tree over the grid. */ - std::pair buildBVH(BBox3fa* bounds_o); - - /*! Create BVH4 tree over grid. */ - std::pair buildBVH(const GridRange& range, size_t& allocator); - - /*! Evaluates grid over patch and builds MSMBlur BVH4 tree over the grid. */ - std::pair buildMSMBlurBVH(const range time_range, BBox3fa* bounds_o); - - /*! Create MBlur BVH4 tree over grid. */ - std::pair buildMBlurBVH(size_t time, const GridRange& range, size_t& allocator); - - /*! Create MSMBlur BVH4 tree over grid. */ - std::pair buildMSMBlurBVH(const range time_range, size_t& allocator, BBox3fa* bounds_o); - - template - struct MapUV - { - typedef typename Loader::vfloat vfloat; - const float* const grid_uv; - size_t line_offset; - size_t lines; - - __forceinline MapUV(const float* const grid_uv, size_t line_offset, const size_t lines) - : grid_uv(grid_uv), line_offset(line_offset), lines(lines) {} - - __forceinline void operator() (vfloat& u, vfloat& v) const { - const Vec3 tri_v012_uv = Loader::gather(grid_uv,line_offset,lines); - const Vec2 uv0 = GridSOA::decodeUV(tri_v012_uv[0]); - const Vec2 uv1 = GridSOA::decodeUV(tri_v012_uv[1]); - const Vec2 uv2 = GridSOA::decodeUV(tri_v012_uv[2]); - const Vec2 uv = u * uv1 + v * uv2 + (1.0f-u-v) * uv0; - u = uv[0];v = uv[1]; - } - }; - - struct Gather2x3 - { - enum { M = 4 }; - typedef vbool4 vbool; - typedef vint4 vint; - typedef vfloat4 vfloat; - - static __forceinline const Vec3vf4 gather(const float* const grid, const size_t line_offset, const size_t lines) - { - vfloat4 r0 = vfloat4::loadu(grid + 0*line_offset); - vfloat4 r1 = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid - if (unlikely(line_offset == 2)) - { - r0 = shuffle<0,1,1,1>(r0); - r1 = shuffle<0,1,1,1>(r1); - } - return Vec3vf4(unpacklo(r0,r1), // r00, r10, r01, r11 - shuffle<1,1,2,2>(r0), // r01, r01, r02, r02 - shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12 - } - - static __forceinline void gather(const float* const grid_x, - const float* const grid_y, - const float* const grid_z, - const size_t line_offset, - const size_t lines, - Vec3vf4& v0_o, - Vec3vf4& v1_o, - Vec3vf4& v2_o) - { - const Vec3vf4 tri_v012_x = gather(grid_x,line_offset,lines); - const Vec3vf4 tri_v012_y = gather(grid_y,line_offset,lines); - const Vec3vf4 tri_v012_z = gather(grid_z,line_offset,lines); - v0_o = Vec3vf4(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]); - v1_o = Vec3vf4(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]); - v2_o = Vec3vf4(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]); - } - }; - -#if defined (__AVX__) - struct Gather3x3 - { - enum { M = 8 }; - typedef vbool8 vbool; - typedef vint8 vint; - typedef vfloat8 vfloat; - - static __forceinline const Vec3vf8 gather(const float* const grid, const size_t line_offset, const size_t lines) - { - vfloat4 ra = vfloat4::loadu(grid + 0*line_offset); - vfloat4 rb = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid - vfloat4 rc; - if (likely(lines > 2)) - rc = vfloat4::loadu(grid + 2*line_offset); - else - rc = rb; - - if (unlikely(line_offset == 2)) - { - ra = shuffle<0,1,1,1>(ra); - rb = shuffle<0,1,1,1>(rb); - rc = shuffle<0,1,1,1>(rc); - } - - const vfloat8 r0 = vfloat8(ra,rb); - const vfloat8 r1 = vfloat8(rb,rc); - return Vec3vf8(unpacklo(r0,r1), // r00, r10, r01, r11, r10, r20, r11, r21 - shuffle<1,1,2,2>(r0), // r01, r01, r02, r02, r11, r11, r12, r12 - shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12, r20, r21, r21, r22 - } - - static __forceinline void gather(const float* const grid_x, - const float* const grid_y, - const float* const grid_z, - const size_t line_offset, - const size_t lines, - Vec3vf8& v0_o, - Vec3vf8& v1_o, - Vec3vf8& v2_o) - { - const Vec3vf8 tri_v012_x = gather(grid_x,line_offset,lines); - const Vec3vf8 tri_v012_y = gather(grid_y,line_offset,lines); - const Vec3vf8 tri_v012_z = gather(grid_z,line_offset,lines); - v0_o = Vec3vf8(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]); - v1_o = Vec3vf8(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]); - v2_o = Vec3vf8(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]); - } - }; -#endif - - template - static __forceinline Vec2 decodeUV(const vfloat& uv) - { - typedef typename vfloat::Int vint; - const vint iu = asInt(uv) & 0xffff; - const vint iv = srl(asInt(uv),16); - const vfloat u = (vfloat)iu * vfloat(8.0f/0x10000); - const vfloat v = (vfloat)iv * vfloat(8.0f/0x10000); - return Vec2(u,v); - } - - __forceinline unsigned int geomID() const { - return _geomID; - } - - __forceinline unsigned int primID() const { - return _primID; - } - - public: - BVH4::NodeRef troot; -#if !defined(__X86_64__) && !defined(__aarch64__) - unsigned align1; -#endif - unsigned time_steps; - unsigned width; - - unsigned height; - unsigned dim_offset; - unsigned _geomID; - unsigned _primID; - - unsigned align2; - unsigned gridOffset; - unsigned gridBytes; - unsigned rootOffset; - - int8_t data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h deleted file mode 100644 index 2ed922a5ae..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "grid_soa.h" -#include "../common/ray.h" -#include "triangle_intersector_pluecker.h" - -namespace embree -{ - namespace isa - { - class GridSOAIntersector1 - { - public: - typedef void Primitive; - - class Precalculations - { - public: - __forceinline Precalculations (const Ray& ray, const void* ptr) - : grid(nullptr) {} - - public: - GridSOA* grid; - int itime; - float ftime; - }; - - template - static __forceinline void intersect(RayHit& ray, - IntersectContext* context, - const float* const grid_x, - const size_t line_offset, - const size_t lines, - Precalculations& pre) - { - typedef typename Loader::vfloat vfloat; - const size_t dim_offset = pre.grid->dim_offset; - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - Vec3 v0, v1, v2; - Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); - GridSOA::MapUV mapUV(grid_uv,line_offset,lines); - PlueckerIntersector1 intersector(ray,nullptr); - intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU(ray,context,pre.grid->geomID(),pre.grid->primID())); - }; - - template - static __forceinline bool occluded(Ray& ray, - IntersectContext* context, - const float* const grid_x, - const size_t line_offset, - const size_t lines, - Precalculations& pre) - { - typedef typename Loader::vfloat vfloat; - const size_t dim_offset = pre.grid->dim_offset; - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - - Vec3 v0, v1, v2; - Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); - - GridSOA::MapUV mapUV(grid_uv,line_offset,lines); - PlueckerIntersector1 intersector(ray,nullptr); - return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU(ray,context,pre.grid->geomID(),pre.grid->primID())); - } - - /*! Intersect a ray with the primitive. */ - static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t line_offset = pre.grid->width; - const size_t lines = pre.grid->height; - const float* const grid_x = pre.grid->decodeLeaf(0,prim); - -#if defined(__AVX__) - intersect( ray, context, grid_x, line_offset, lines, pre); -#else - intersect(ray, context, grid_x , line_offset, lines, pre); - if (likely(lines > 2)) - intersect(ray, context, grid_x+line_offset, line_offset, lines, pre); -#endif - } - - /*! Test if the ray is occluded by the primitive */ - static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t line_offset = pre.grid->width; - const size_t lines = pre.grid->height; - const float* const grid_x = pre.grid->decodeLeaf(0,prim); - -#if defined(__AVX__) - return occluded( ray, context, grid_x, line_offset, lines, pre); -#else - if (occluded(ray, context, grid_x , line_offset, lines, pre)) return true; - if (likely(lines > 2)) - if (occluded(ray, context, grid_x+line_offset, line_offset, lines, pre)) return true; -#endif - return false; - } - }; - - class GridSOAMBIntersector1 - { - public: - typedef void Primitive; - typedef GridSOAIntersector1::Precalculations Precalculations; - - template - static __forceinline void intersect(RayHit& ray, const float ftime, - IntersectContext* context, - const float* const grid_x, - const size_t line_offset, - const size_t lines, - Precalculations& pre) - { - typedef typename Loader::vfloat vfloat; - const size_t dim_offset = pre.grid->dim_offset; - const size_t grid_offset = pre.grid->gridBytes >> 2; - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - - Vec3 a0, a1, a2; - Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); - - Vec3 b0, b1, b2; - Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); - - Vec3 v0 = lerp(a0,b0,vfloat(ftime)); - Vec3 v1 = lerp(a1,b1,vfloat(ftime)); - Vec3 v2 = lerp(a2,b2,vfloat(ftime)); - - GridSOA::MapUV mapUV(grid_uv,line_offset,lines); - PlueckerIntersector1 intersector(ray,nullptr); - intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU(ray,context,pre.grid->geomID(),pre.grid->primID())); - }; - - template - static __forceinline bool occluded(Ray& ray, const float ftime, - IntersectContext* context, - const float* const grid_x, - const size_t line_offset, - const size_t lines, - Precalculations& pre) - { - typedef typename Loader::vfloat vfloat; - const size_t dim_offset = pre.grid->dim_offset; - const size_t grid_offset = pre.grid->gridBytes >> 2; - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - - Vec3 a0, a1, a2; - Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); - - Vec3 b0, b1, b2; - Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); - - Vec3 v0 = lerp(a0,b0,vfloat(ftime)); - Vec3 v1 = lerp(a1,b1,vfloat(ftime)); - Vec3 v2 = lerp(a2,b2,vfloat(ftime)); - - GridSOA::MapUV mapUV(grid_uv,line_offset,lines); - PlueckerIntersector1 intersector(ray,nullptr); - return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU(ray,context,pre.grid->geomID(),pre.grid->primID())); - } - - /*! Intersect a ray with the primitive. */ - static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t line_offset = pre.grid->width; - const size_t lines = pre.grid->height; - const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim); - -#if defined(__AVX__) - intersect( ray, pre.ftime, context, grid_x, line_offset, lines, pre); -#else - intersect(ray, pre.ftime, context, grid_x, line_offset, lines, pre); - if (likely(lines > 2)) - intersect(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre); -#endif - } - - /*! Test if the ray is occluded by the primitive */ - static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t line_offset = pre.grid->width; - const size_t lines = pre.grid->height; - const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim); - -#if defined(__AVX__) - return occluded( ray, pre.ftime, context, grid_x, line_offset, lines, pre); -#else - if (occluded(ray, pre.ftime, context, grid_x , line_offset, lines, pre)) return true; - if (likely(lines > 2)) - if (occluded(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true; -#endif - return false; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h deleted file mode 100644 index 41d66e1e28..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h +++ /dev/null @@ -1,445 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "grid_soa.h" -#include "../common/ray.h" -#include "triangle_intersector_pluecker.h" - -namespace embree -{ - namespace isa - { - template - struct MapUV0 - { - const float* const grid_uv; - size_t ofs00, ofs01, ofs10, ofs11; - - __forceinline MapUV0(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11) - : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {} - - __forceinline void operator() (vfloat& u, vfloat& v) const { - const vfloat uv00(grid_uv[ofs00]); - const vfloat uv01(grid_uv[ofs01]); - const vfloat uv10(grid_uv[ofs10]); - const vfloat uv11(grid_uv[ofs11]); - const Vec2vf uv0 = GridSOA::decodeUV(uv00); - const Vec2vf uv1 = GridSOA::decodeUV(uv01); - const Vec2vf uv2 = GridSOA::decodeUV(uv10); - const Vec2vf uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); - u = uv[0]; v = uv[1]; - } - }; - - template - struct MapUV1 - { - const float* const grid_uv; - size_t ofs00, ofs01, ofs10, ofs11; - - __forceinline MapUV1(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11) - : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {} - - __forceinline void operator() (vfloat& u, vfloat& v) const { - const vfloat uv00(grid_uv[ofs00]); - const vfloat uv01(grid_uv[ofs01]); - const vfloat uv10(grid_uv[ofs10]); - const vfloat uv11(grid_uv[ofs11]); - const Vec2vf uv0 = GridSOA::decodeUV(uv10); - const Vec2vf uv1 = GridSOA::decodeUV(uv01); - const Vec2vf uv2 = GridSOA::decodeUV(uv11); - const Vec2vf uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); - u = uv[0]; v = uv[1]; - } - }; - - template - class GridSOAIntersectorK - { - public: - typedef void Primitive; - - class Precalculations - { -#if defined(__AVX__) - static const int M = 8; -#else - static const int M = 4; -#endif - - public: - __forceinline Precalculations (const vbool& valid, const RayK& ray) - : grid(nullptr), intersector(valid,ray) {} - - public: - GridSOA* grid; - PlueckerIntersectorK intersector; // FIXME: use quad intersector - }; - - /*! Intersect a ray with the primitive. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t dim_offset = pre.grid->dim_offset; - const size_t line_offset = pre.grid->width; - const float* const grid_x = pre.grid->decodeLeaf(0,prim); - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - - const size_t max_x = pre.grid->width == 2 ? 1 : 2; - const size_t max_y = pre.grid->height == 2 ? 1 : 2; - for (size_t y=0; y p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); - const Vec3vf p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); - const Vec3vf p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); - const Vec3vf p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); - - pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); - pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); - } - } - } - - /*! Test if the ray is occluded by the primitive */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t dim_offset = pre.grid->dim_offset; - const size_t line_offset = pre.grid->width; - const float* const grid_x = pre.grid->decodeLeaf(0,prim); - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - - vbool valid = valid_i; - const size_t max_x = pre.grid->width == 2 ? 1 : 2; - const size_t max_y = pre.grid->height == 2 ? 1 : 2; - for (size_t y=0; y p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); - const Vec3vf p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); - const Vec3vf p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); - const Vec3vf p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); - - pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); - if (none(valid)) break; - pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); - if (none(valid)) break; - } - } - return !valid; - } - - template - static __forceinline void intersect(RayHitK& ray, size_t k, - IntersectContext* context, - const float* const grid_x, - const size_t line_offset, - const size_t lines, - Precalculations& pre) - { - typedef typename Loader::vfloat vfloat; - const size_t dim_offset = pre.grid->dim_offset; - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - Vec3 v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); - pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV(grid_uv,line_offset,lines),Intersect1KEpilogMU(ray,k,context,pre.grid->geomID(),pre.grid->primID())); - }; - - template - static __forceinline bool occluded(RayK& ray, size_t k, - IntersectContext* context, - const float* const grid_x, - const size_t line_offset, - const size_t lines, - Precalculations& pre) - { - typedef typename Loader::vfloat vfloat; - const size_t dim_offset = pre.grid->dim_offset; - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - Vec3 v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); - return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV(grid_uv,line_offset,lines),Occluded1KEpilogMU(ray,k,context,pre.grid->geomID(),pre.grid->primID())); - } - - /*! Intersect a ray with the primitive. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t line_offset = pre.grid->width; - const size_t lines = pre.grid->height; - const float* const grid_x = pre.grid->decodeLeaf(0,prim); -#if defined(__AVX__) - intersect( ray, k, context, grid_x, line_offset, lines, pre); -#else - intersect(ray, k, context, grid_x , line_offset, lines, pre); - if (likely(lines > 2)) - intersect(ray, k, context, grid_x+line_offset, line_offset, lines, pre); -#endif - } - - /*! Test if the ray is occluded by the primitive */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t line_offset = pre.grid->width; - const size_t lines = pre.grid->height; - const float* const grid_x = pre.grid->decodeLeaf(0,prim); - -#if defined(__AVX__) - return occluded( ray, k, context, grid_x, line_offset, lines, pre); -#else - if (occluded(ray, k, context, grid_x , line_offset, lines, pre)) return true; - if (likely(lines > 2)) - if (occluded(ray, k, context, grid_x+line_offset, line_offset, lines, pre)) return true; -#endif - return false; - } - }; - - template - class GridSOAMBIntersectorK - { - public: - typedef void Primitive; - typedef typename GridSOAIntersectorK::Precalculations Precalculations; - - /*! Intersect a ray with the primitive. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - vfloat vftime; - vint vitime = getTimeSegment(ray.time(), vfloat((float)(pre.grid->time_steps-1)), vftime); - - vbool valid1 = valid_i; - while (any(valid1)) { - const size_t j = bsf(movemask(valid1)); - const int itime = vitime[j]; - const vbool valid2 = valid1 & (itime == vitime); - valid1 = valid1 & !valid2; - intersect(valid2,pre,ray,vftime,itime,context,prim,lazy_node); - } - } - - /*! Intersect a ray with the primitive. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, const vfloat& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t grid_offset = pre.grid->gridBytes >> 2; - const size_t dim_offset = pre.grid->dim_offset; - const size_t line_offset = pre.grid->width; - const float* const grid_x = pre.grid->decodeLeaf(itime,prim); - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - - const size_t max_x = pre.grid->width == 2 ? 1 : 2; - const size_t max_y = pre.grid->height == 2 ? 1 : 2; - for (size_t y=0; y a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); - const Vec3vf a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); - const Vec3vf a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); - const Vec3vf a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); - ofs00 += grid_offset; - ofs01 += grid_offset; - ofs10 += grid_offset; - ofs11 += grid_offset; - const Vec3vf b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); - const Vec3vf b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); - const Vec3vf b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); - const Vec3vf b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); - const Vec3vf p00 = lerp(a00,b00,ftime); - const Vec3vf p01 = lerp(a01,b01,ftime); - const Vec3vf p10 = lerp(a10,b10,ftime); - const Vec3vf p11 = lerp(a11,b11,ftime); - - pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); - pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); - } - } - } - - /*! Test if the ray is occluded by the primitive */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - vfloat vftime; - vint vitime = getTimeSegment(ray.time(), vfloat((float)(pre.grid->time_steps-1)), vftime); - - vbool valid_o = valid_i; - vbool valid1 = valid_i; - while (any(valid1)) { - const int j = int(bsf(movemask(valid1))); - const int itime = vitime[j]; - const vbool valid2 = valid1 & (itime == vitime); - valid1 = valid1 & !valid2; - valid_o &= !valid2 | occluded(valid2,pre,ray,vftime,itime,context,prim,lazy_node); - } - return !valid_o; - } - - /*! Test if the ray is occluded by the primitive */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, const vfloat& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - const size_t grid_offset = pre.grid->gridBytes >> 2; - const size_t dim_offset = pre.grid->dim_offset; - const size_t line_offset = pre.grid->width; - const float* const grid_x = pre.grid->decodeLeaf(itime,prim); - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - - vbool valid = valid_i; - const size_t max_x = pre.grid->width == 2 ? 1 : 2; - const size_t max_y = pre.grid->height == 2 ? 1 : 2; - for (size_t y=0; y a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); - const Vec3vf a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); - const Vec3vf a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); - const Vec3vf a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); - ofs00 += grid_offset; - ofs01 += grid_offset; - ofs10 += grid_offset; - ofs11 += grid_offset; - const Vec3vf b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); - const Vec3vf b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); - const Vec3vf b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); - const Vec3vf b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); - const Vec3vf p00 = lerp(a00,b00,ftime); - const Vec3vf p01 = lerp(a01,b01,ftime); - const Vec3vf p10 = lerp(a10,b10,ftime); - const Vec3vf p11 = lerp(a11,b11,ftime); - - pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); - if (none(valid)) break; - pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); - if (none(valid)) break; - } - } - return valid; - } - - template - static __forceinline void intersect(RayHitK& ray, size_t k, - const float ftime, - IntersectContext* context, - const float* const grid_x, - const size_t line_offset, - const size_t lines, - Precalculations& pre) - { - typedef typename Loader::vfloat vfloat; - const size_t grid_offset = pre.grid->gridBytes >> 2; - const size_t dim_offset = pre.grid->dim_offset; - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - - Vec3 a0, a1, a2; - Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); - - Vec3 b0, b1, b2; - Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); - - Vec3 v0 = lerp(a0,b0,vfloat(ftime)); - Vec3 v1 = lerp(a1,b1,vfloat(ftime)); - Vec3 v2 = lerp(a2,b2,vfloat(ftime)); - - pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV(grid_uv,line_offset,lines),Intersect1KEpilogMU(ray,k,context,pre.grid->geomID(),pre.grid->primID())); - }; - - template - static __forceinline bool occluded(RayK& ray, size_t k, - const float ftime, - IntersectContext* context, - const float* const grid_x, - const size_t line_offset, - const size_t lines, - Precalculations& pre) - { - typedef typename Loader::vfloat vfloat; - const size_t grid_offset = pre.grid->gridBytes >> 2; - const size_t dim_offset = pre.grid->dim_offset; - const float* const grid_y = grid_x + 1 * dim_offset; - const float* const grid_z = grid_x + 2 * dim_offset; - const float* const grid_uv = grid_x + 3 * dim_offset; - - Vec3 a0, a1, a2; - Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); - - Vec3 b0, b1, b2; - Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); - - Vec3 v0 = lerp(a0,b0,vfloat(ftime)); - Vec3 v1 = lerp(a1,b1,vfloat(ftime)); - Vec3 v2 = lerp(a2,b2,vfloat(ftime)); - - return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV(grid_uv,line_offset,lines),Occluded1KEpilogMU(ray,k,context,pre.grid->geomID(),pre.grid->primID())); - } - - /*! Intersect a ray with the primitive. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - float ftime; - int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime); - - const size_t line_offset = pre.grid->width; - const size_t lines = pre.grid->height; - const float* const grid_x = pre.grid->decodeLeaf(itime,prim); - -#if defined(__AVX__) - intersect( ray, k, ftime, context, grid_x, line_offset, lines, pre); -#else - intersect(ray, k, ftime, context, grid_x, line_offset, lines, pre); - if (likely(lines > 2)) - intersect(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre); -#endif - } - - /*! Test if the ray is occluded by the primitive */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - float ftime; - int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime); - - const size_t line_offset = pre.grid->width; - const size_t lines = pre.grid->height; - const float* const grid_x = pre.grid->decodeLeaf(itime,prim); - -#if defined(__AVX__) - return occluded( ray, k, ftime, context, grid_x, line_offset, lines, pre); -#else - if (occluded(ray, k, ftime, context, grid_x, line_offset, lines, pre)) return true; - if (likely(lines > 2)) - if (occluded(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true; -#endif - return false; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance.h b/thirdparty/embree-aarch64/kernels/geometry/instance.h deleted file mode 100644 index 66893d581f..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/instance.h +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" -#include "../common/scene_instance.h" - -namespace embree -{ - struct InstancePrimitive - { - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* primitive supports multiple time segments */ - static const bool singleTimeSegment = false; - - /* Returns maximum number of stored primitives */ - static __forceinline size_t max_size() { return 1; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return N; } - - public: - - InstancePrimitive (const Instance* instance, unsigned int instID) - : instance(instance) - , instID_(instID) - {} - - __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene) - { - assert(end-i == 1); - const PrimRef& prim = prims[i]; i++; - const unsigned int geomID = prim.geomID(); - const Instance* instance = scene->get(geomID); - new (this) InstancePrimitive(instance, geomID); - } - - __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime) - { - assert(end-i == 1); - const PrimRef& prim = prims[i]; i++; - const unsigned int geomID = prim.geomID(); - const Instance* instance = scene->get(geomID); - new (this) InstancePrimitive(instance,geomID); - return instance->linearBounds(0,itime); - } - - __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range) - { - assert(end-i == 1); - const PrimRefMB& prim = prims[i]; i++; - const unsigned int geomID = prim.geomID(); - const Instance* instance = scene->get(geomID); - new (this) InstancePrimitive(instance,geomID); - return instance->linearBounds(0,time_range); - } - - /* Updates the primitive */ - __forceinline BBox3fa update(Instance* instance) { - return instance->bounds(0); - } - - public: - const Instance* instance; - const unsigned int instID_ = std::numeric_limits::max (); - }; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h deleted file mode 100644 index 91731a39c5..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "instance.h" -#include "../common/ray.h" -#include "../common/point_query.h" - -namespace embree -{ - namespace isa - { - struct InstanceIntersector1 - { - typedef InstancePrimitive Primitive; - - struct Precalculations { - __forceinline Precalculations (const Ray& ray, const void *ptr) {} - }; - - static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim); - static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim); - static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim); - }; - - struct InstanceIntersector1MB - { - typedef InstancePrimitive Primitive; - - struct Precalculations { - __forceinline Precalculations (const Ray& ray, const void *ptr) {} - }; - - static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim); - static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim); - static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim); - }; - - template - struct InstanceIntersectorK - { - typedef InstancePrimitive Primitive; - - struct Precalculations { - __forceinline Precalculations (const vbool& valid, const RayK& ray) {} - }; - - static void intersect(const vbool& valid_i, const Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& prim); - static vbool occluded(const vbool& valid_i, const Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& prim); - - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& prim) { - intersect(vbool(1<& ray, size_t k, IntersectContext* context, const Primitive& prim) { - occluded(vbool(1< - struct InstanceIntersectorKMB - { - typedef InstancePrimitive Primitive; - - struct Precalculations { - __forceinline Precalculations (const vbool& valid, const RayK& ray) {} - }; - - static void intersect(const vbool& valid_i, const Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& prim); - static vbool occluded(const vbool& valid_i, const Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& prim); - - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& prim) { - intersect(vbool(1<& ray, size_t k, IntersectContext* context, const Primitive& prim) { - occluded(vbool(1< - struct UVIdentity { - __forceinline void operator() (vfloat& u, vfloat& v) const {} - }; - - - template - struct Intersect1Epilog1 - { - RayHit& ray; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline Intersect1Epilog1(RayHit& ray, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : ray(ray), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline bool operator() (Hit& hit) const - { - /* ray mask test */ - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); -#if defined(EMBREE_RAY_MASK) - if ((geometry->mask & ray.mask) == 0) return false; -#endif - hit.finalize(); - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { - HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); - const float old_t = ray.tfar; - ray.tfar = hit.t; - bool found = runIntersectionFilter1(geometry,ray,context,h); - if (!found) ray.tfar = old_t; - return found; - } - } -#endif - - /* update hit information */ - ray.tfar = hit.t; - ray.Ng = hit.Ng; - ray.u = hit.u; - ray.v = hit.v; - ray.primID = primID; - ray.geomID = geomID; - instance_id_stack::copy(context->user->instID, ray.instID); - return true; - } - }; - - template - struct Occluded1Epilog1 - { - Ray& ray; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline Occluded1Epilog1(Ray& ray, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : ray(ray), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline bool operator() (Hit& hit) const - { - /* ray mask test */ - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); - - -#if defined(EMBREE_RAY_MASK) - if ((geometry->mask & ray.mask) == 0) return false; -#endif - hit.finalize(); - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) { - HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); - const float old_t = ray.tfar; - ray.tfar = hit.t; - const bool found = runOcclusionFilter1(geometry,ray,context,h); - if (!found) ray.tfar = old_t; - return found; - } - } -#endif - return true; - } - }; - - template - struct Intersect1KEpilog1 - { - RayHitK& ray; - size_t k; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline Intersect1KEpilog1(RayHitK& ray, size_t k, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline bool operator() (Hit& hit) const - { - /* ray mask test */ - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); -#if defined(EMBREE_RAY_MASK) - if ((geometry->mask & ray.mask[k]) == 0) - return false; -#endif - hit.finalize(); - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { - HitK h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); - const float old_t = ray.tfar[k]; - ray.tfar[k] = hit.t; - const bool found = any(runIntersectionFilter(vbool(1<*, const size_t&>(context->user->instID, ray.instID, k); - return true; - } - }; - - template - struct Occluded1KEpilog1 - { - RayK& ray; - size_t k; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline Occluded1KEpilog1(RayK& ray, size_t k, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline bool operator() (Hit& hit) const - { - /* ray mask test */ - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); -#if defined(EMBREE_RAY_MASK) - if ((geometry->mask & ray.mask[k]) == 0) - return false; -#endif - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) { - hit.finalize(); - HitK h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); - const float old_t = ray.tfar[k]; - ray.tfar[k] = hit.t; - const bool found = any(runOcclusionFilter(vbool(1< - struct Intersect1EpilogM - { - RayHit& ray; - IntersectContext* context; - const vuint& geomIDs; - const vuint& primIDs; - - __forceinline Intersect1EpilogM(RayHit& ray, - IntersectContext* context, - const vuint& geomIDs, - const vuint& primIDs) - : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} - - template - __forceinline bool operator() (const vbool& valid_i, Hit& hit) const - { - Scene* scene MAYBE_UNUSED = context->scene; - vbool valid = valid_i; - if (Mx > M) valid &= (1<get(geomID); - -#if defined(EMBREE_RAY_MASK) - /* goto next hit if mask test fails */ - if ((geometry->mask & ray.mask) == 0) { - clear(valid,i); - continue; - } -#endif - -#if defined(EMBREE_FILTER_FUNCTION) - /* call intersection filter function */ - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { - const Vec2f uv = hit.uv(i); - HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); - const float old_t = ray.tfar; - ray.tfar = hit.t(i); - const bool found = runIntersectionFilter1(geometry,ray,context,h); - if (!found) ray.tfar = old_t; - foundhit |= found; - clear(valid,i); - valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value - continue; - } - } -#endif - break; - } -#endif - - /* update hit information */ - const Vec2f uv = hit.uv(i); - ray.tfar = hit.vt[i]; - ray.Ng.x = hit.vNg.x[i]; - ray.Ng.y = hit.vNg.y[i]; - ray.Ng.z = hit.vNg.z[i]; - ray.u = uv.x; - ray.v = uv.y; - ray.primID = primIDs[i]; - ray.geomID = geomID; - instance_id_stack::copy(context->user->instID, ray.instID); - return true; - - } - }; - -#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4 - template - struct Intersect1EpilogM - { - static const size_t Mx = 16; - RayHit& ray; - IntersectContext* context; - const vuint& geomIDs; - const vuint& primIDs; - - __forceinline Intersect1EpilogM(RayHit& ray, - IntersectContext* context, - const vuint& geomIDs, - const vuint& primIDs) - : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} - - template - __forceinline bool operator() (const vbool& valid_i, Hit& hit) const - { - Scene* MAYBE_UNUSED scene = context->scene; - vbool valid = valid_i; - if (Mx > M) valid &= (1<get(geomID); - -#if defined(EMBREE_RAY_MASK) - /* goto next hit if mask test fails */ - if ((geometry->mask & ray.mask) == 0) { - clear(valid,i); - continue; - } -#endif - -#if defined(EMBREE_FILTER_FUNCTION) - /* call intersection filter function */ - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { - const Vec2f uv = hit.uv(i); - HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); - const float old_t = ray.tfar; - ray.tfar = hit.t(i); - const bool found = runIntersectionFilter1(geometry,ray,context,h); - if (!found) ray.tfar = old_t; - foundhit |= found; - clear(valid,i); - valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value - continue; - } - } -#endif - break; - } -#endif - - vbool finalMask(((unsigned int)1 << i)); - ray.update(finalMask,hit.vt,hit.vu,hit.vv,hit.vNg.x,hit.vNg.y,hit.vNg.z,geomID,primIDs); - instance_id_stack::foreach([&](unsigned level) - { - ray.instID[level] = context->user->instID[level]; - return (context->user->instID[level] != RTC_INVALID_GEOMETRY_ID); - }); - return true; - - } - }; -#endif - - template - struct Occluded1EpilogM - { - Ray& ray; - IntersectContext* context; - const vuint& geomIDs; - const vuint& primIDs; - - __forceinline Occluded1EpilogM(Ray& ray, - IntersectContext* context, - const vuint& geomIDs, - const vuint& primIDs) - : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} - - template - __forceinline bool operator() (const vbool& valid_i, Hit& hit) const - { - Scene* scene MAYBE_UNUSED = context->scene; - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) - if (unlikely(filter)) - hit.finalize(); /* called only once */ - - vbool valid = valid_i; - if (Mx > M) valid &= (1<get(geomID); - -#if defined(EMBREE_RAY_MASK) - /* goto next hit if mask test fails */ - if ((geometry->mask & ray.mask) == 0) { - m=btc(m,i); - continue; - } -#endif - -#if defined(EMBREE_FILTER_FUNCTION) - /* if we have no filter then the test passed */ - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) - { - const Vec2f uv = hit.uv(i); - HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); - const float old_t = ray.tfar; - ray.tfar = hit.t(i); - if (runOcclusionFilter1(geometry,ray,context,h)) return true; - ray.tfar = old_t; - m=btc(m,i); - continue; - } - } -#endif - break; - } -#endif - - return true; - } - }; - - template - struct Intersect1EpilogMU - { - RayHit& ray; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline Intersect1EpilogMU(RayHit& ray, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : ray(ray), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline bool operator() (const vbool& valid_i, Hit& hit) const - { - /* ray mask test */ - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); -#if defined(EMBREE_RAY_MASK) - if ((geometry->mask & ray.mask) == 0) return false; -#endif - - vbool valid = valid_i; - hit.finalize(); - - size_t i = select_min(valid,hit.vt); - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) - { - bool foundhit = false; - while (true) - { - /* call intersection filter function */ - Vec2f uv = hit.uv(i); - const float old_t = ray.tfar; - ray.tfar = hit.t(i); - HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); - const bool found = runIntersectionFilter1(geometry,ray,context,h); - if (!found) ray.tfar = old_t; - foundhit |= found; - clear(valid,i); - valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value - if (unlikely(none(valid))) break; - i = select_min(valid,hit.vt); - } - return foundhit; - } -#endif - - /* update hit information */ - const Vec2f uv = hit.uv(i); - const Vec3fa Ng = hit.Ng(i); - ray.tfar = hit.t(i); - ray.Ng.x = Ng.x; - ray.Ng.y = Ng.y; - ray.Ng.z = Ng.z; - ray.u = uv.x; - ray.v = uv.y; - ray.primID = primID; - ray.geomID = geomID; - instance_id_stack::copy(context->user->instID, ray.instID); - return true; - } - }; - - template - struct Occluded1EpilogMU - { - Ray& ray; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline Occluded1EpilogMU(Ray& ray, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : ray(ray), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline bool operator() (const vbool& valid, Hit& hit) const - { - /* ray mask test */ - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); -#if defined(EMBREE_RAY_MASK) - if ((geometry->mask & ray.mask) == 0) return false; -#endif - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) - { - hit.finalize(); - for (size_t m=movemask(valid), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) - { - const Vec2f uv = hit.uv(i); - const float old_t = ray.tfar; - ray.tfar = hit.t(i); - HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); - if (runOcclusionFilter1(geometry,ray,context,h)) return true; - ray.tfar = old_t; - } - return false; - } -#endif - return true; - } - }; - - template - struct IntersectKEpilogM - { - RayHitK& ray; - IntersectContext* context; - const vuint& geomIDs; - const vuint& primIDs; - const size_t i; - - __forceinline IntersectKEpilogM(RayHitK& ray, - IntersectContext* context, - const vuint& geomIDs, - const vuint& primIDs, - size_t i) - : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {} - - template - __forceinline vbool operator() (const vbool& valid_i, const Hit& hit) const - { - Scene* scene MAYBE_UNUSED = context->scene; - - vfloat u, v, t; - Vec3vf Ng; - vbool valid = valid_i; - - std::tie(u,v,t,Ng) = hit(); - - const unsigned int geomID = geomIDs[i]; - const unsigned int primID = primIDs[i]; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); - - /* ray masking test */ -#if defined(EMBREE_RAY_MASK) - valid &= (geometry->mask & ray.mask) != 0; - if (unlikely(none(valid))) return false; -#endif - - /* occlusion filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { - HitK h(context->user,geomID,primID,u,v,Ng); - const vfloat old_t = ray.tfar; - ray.tfar = select(valid,t,ray.tfar); - const vbool m_accept = runIntersectionFilter(valid,geometry,ray,context,h); - ray.tfar = select(m_accept,ray.tfar,old_t); - return m_accept; - } - } -#endif - - /* update hit information */ - vfloat::store(valid,&ray.tfar,t); - vfloat::store(valid,&ray.Ng.x,Ng.x); - vfloat::store(valid,&ray.Ng.y,Ng.y); - vfloat::store(valid,&ray.Ng.z,Ng.z); - vfloat::store(valid,&ray.u,u); - vfloat::store(valid,&ray.v,v); - vuint::store(valid,&ray.primID,primID); - vuint::store(valid,&ray.geomID,geomID); - instance_id_stack::copy*, const vbool&>(context->user->instID, ray.instID, valid); - return valid; - } - }; - - template - struct OccludedKEpilogM - { - vbool& valid0; - RayK& ray; - IntersectContext* context; - const vuint& geomIDs; - const vuint& primIDs; - const size_t i; - - __forceinline OccludedKEpilogM(vbool& valid0, - RayK& ray, - IntersectContext* context, - const vuint& geomIDs, - const vuint& primIDs, - size_t i) - : valid0(valid0), ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {} - - template - __forceinline vbool operator() (const vbool& valid_i, const Hit& hit) const - { - vbool valid = valid_i; - - /* ray masking test */ - Scene* scene MAYBE_UNUSED = context->scene; - const unsigned int geomID = geomIDs[i]; - const unsigned int primID = primIDs[i]; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); -#if defined(EMBREE_RAY_MASK) - valid &= (geometry->mask & ray.mask) != 0; - if (unlikely(none(valid))) return valid; -#endif - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) - { - vfloat u, v, t; - Vec3vf Ng; - std::tie(u,v,t,Ng) = hit(); - HitK h(context->user,geomID,primID,u,v,Ng); - const vfloat old_t = ray.tfar; - ray.tfar = select(valid,t,ray.tfar); - valid = runOcclusionFilter(valid,geometry,ray,context,h); - ray.tfar = select(valid,ray.tfar,old_t); - } - } -#endif - - /* update occlusion */ - valid0 = valid0 & !valid; - return valid; - } - }; - - template - struct IntersectKEpilogMU - { - RayHitK& ray; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline IntersectKEpilogMU(RayHitK& ray, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : ray(ray), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline vbool operator() (const vbool& valid_org, const Hit& hit) const - { - vbool valid = valid_org; - vfloat u, v, t; - Vec3vf Ng; - std::tie(u,v,t,Ng) = hit(); - - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); - - /* ray masking test */ -#if defined(EMBREE_RAY_MASK) - valid &= (geometry->mask & ray.mask) != 0; - if (unlikely(none(valid))) return false; -#endif - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { - HitK h(context->user,geomID,primID,u,v,Ng); - const vfloat old_t = ray.tfar; - ray.tfar = select(valid,t,ray.tfar); - const vbool m_accept = runIntersectionFilter(valid,geometry,ray,context,h); - ray.tfar = select(m_accept,ray.tfar,old_t); - return m_accept; - } - } -#endif - - /* update hit information */ - vfloat::store(valid,&ray.tfar,t); - vfloat::store(valid,&ray.Ng.x,Ng.x); - vfloat::store(valid,&ray.Ng.y,Ng.y); - vfloat::store(valid,&ray.Ng.z,Ng.z); - vfloat::store(valid,&ray.u,u); - vfloat::store(valid,&ray.v,v); - vuint::store(valid,&ray.primID,primID); - vuint::store(valid,&ray.geomID,geomID); - instance_id_stack::copy*, const vbool&>(context->user->instID, ray.instID, valid); - - return valid; - } - }; - - template - struct OccludedKEpilogMU - { - vbool& valid0; - RayK& ray; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline OccludedKEpilogMU(vbool& valid0, - RayK& ray, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : valid0(valid0), ray(ray), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline vbool operator() (const vbool& valid_i, const Hit& hit) const - { - vbool valid = valid_i; - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); - -#if defined(EMBREE_RAY_MASK) - valid &= (geometry->mask & ray.mask) != 0; - if (unlikely(none(valid))) return false; -#endif - - /* occlusion filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) - { - vfloat u, v, t; - Vec3vf Ng; - std::tie(u,v,t,Ng) = hit(); - HitK h(context->user,geomID,primID,u,v,Ng); - const vfloat old_t = ray.tfar; - ray.tfar = select(valid,t,ray.tfar); - valid = runOcclusionFilter(valid,geometry,ray,context,h); - ray.tfar = select(valid,ray.tfar,old_t); - } - } -#endif - - /* update occlusion */ - valid0 = valid0 & !valid; - return valid; - } - }; - - template - struct Intersect1KEpilogM - { - RayHitK& ray; - size_t k; - IntersectContext* context; - const vuint& geomIDs; - const vuint& primIDs; - - __forceinline Intersect1KEpilogM(RayHitK& ray, size_t k, - IntersectContext* context, - const vuint& geomIDs, - const vuint& primIDs) - : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {} - - template - __forceinline bool operator() (const vbool& valid_i, Hit& hit) const - { - Scene* scene MAYBE_UNUSED = context->scene; - vbool valid = valid_i; - hit.finalize(); - if (Mx > M) valid &= (1<get(geomID); - -#if defined(EMBREE_RAY_MASK) - /* goto next hit if mask test fails */ - if ((geometry->mask & ray.mask[k]) == 0) { - clear(valid,i); - continue; - } -#endif - -#if defined(EMBREE_FILTER_FUNCTION) - /* call intersection filter function */ - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { - assert(i h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); - const float old_t = ray.tfar[k]; - ray.tfar[k] = hit.t(i); - const bool found = any(runIntersectionFilter(vbool(1<(hit.vNg.x),vfloat(hit.vNg.y),vfloat(hit.vNg.z),geomID,vuint(primIDs)); -#else - const Vec2f uv = hit.uv(i); - ray.tfar[k] = hit.t(i); - ray.Ng.x[k] = hit.vNg.x[i]; - ray.Ng.y[k] = hit.vNg.y[i]; - ray.Ng.z[k] = hit.vNg.z[i]; - ray.u[k] = uv.x; - ray.v[k] = uv.y; - ray.primID[k] = primIDs[i]; - ray.geomID[k] = geomID; - instance_id_stack::copy*, const size_t&>(context->user->instID, ray.instID, k); -#endif - return true; - } - }; - - template - struct Occluded1KEpilogM - { - RayK& ray; - size_t k; - IntersectContext* context; - const vuint& geomIDs; - const vuint& primIDs; - - __forceinline Occluded1KEpilogM(RayK& ray, size_t k, - IntersectContext* context, - const vuint& geomIDs, - const vuint& primIDs) - : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {} - - template - __forceinline bool operator() (const vbool& valid_i, Hit& hit) const - { - Scene* scene MAYBE_UNUSED = context->scene; - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) - if (unlikely(filter)) - hit.finalize(); /* called only once */ - - vbool valid = valid_i; - if (Mx > M) valid &= (1<get(geomID); - -#if defined(EMBREE_RAY_MASK) - /* goto next hit if mask test fails */ - if ((geometry->mask & ray.mask[k]) == 0) { - m=btc(m,i); - continue; - } -#endif - -#if defined(EMBREE_FILTER_FUNCTION) - /* execute occlusion filer */ - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) - { - const Vec2f uv = hit.uv(i); - const float old_t = ray.tfar[k]; - ray.tfar[k] = hit.t(i); - HitK h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); - if (any(runOcclusionFilter(vbool(1< - struct Intersect1KEpilogMU - { - RayHitK& ray; - size_t k; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline Intersect1KEpilogMU(RayHitK& ray, size_t k, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline bool operator() (const vbool& valid_i, Hit& hit) const - { - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); -#if defined(EMBREE_RAY_MASK) - /* ray mask test */ - if ((geometry->mask & ray.mask[k]) == 0) - return false; -#endif - - /* finalize hit calculation */ - vbool valid = valid_i; - hit.finalize(); - size_t i = select_min(valid,hit.vt); - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) - { - bool foundhit = false; - while (true) - { - const Vec2f uv = hit.uv(i); - const float old_t = ray.tfar[k]; - ray.tfar[k] = hit.t(i); - HitK h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); - const bool found = any(runIntersectionFilter(vbool(1<(Ng.x),vfloat(Ng.y),vfloat(Ng.z),geomID,vuint(primID)); -#else - const Vec2f uv = hit.uv(i); - const Vec3fa Ng = hit.Ng(i); - ray.tfar[k] = hit.t(i); - ray.Ng.x[k] = Ng.x; - ray.Ng.y[k] = Ng.y; - ray.Ng.z[k] = Ng.z; - ray.u[k] = uv.x; - ray.v[k] = uv.y; - ray.primID[k] = primID; - ray.geomID[k] = geomID; - instance_id_stack::copy*, const size_t&>(context->user->instID, ray.instID, k); -#endif - return true; - } - }; - - template - struct Occluded1KEpilogMU - { - RayK& ray; - size_t k; - IntersectContext* context; - const unsigned int geomID; - const unsigned int primID; - - __forceinline Occluded1KEpilogMU(RayK& ray, size_t k, - IntersectContext* context, - const unsigned int geomID, - const unsigned int primID) - : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} - - template - __forceinline bool operator() (const vbool& valid_i, Hit& hit) const - { - Scene* scene MAYBE_UNUSED = context->scene; - Geometry* geometry MAYBE_UNUSED = scene->get(geomID); -#if defined(EMBREE_RAY_MASK) - /* ray mask test */ - if ((geometry->mask & ray.mask[k]) == 0) - return false; -#endif - - /* intersection filter test */ -#if defined(EMBREE_FILTER_FUNCTION) - if (filter) { - if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) - { - hit.finalize(); - for (size_t m=movemask(valid_i), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) - { - const Vec2f uv = hit.uv(i); - const float old_t = ray.tfar[k]; - ray.tfar[k] = hit.t(i); - HitK h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); - if (any(runOcclusionFilter(vbool(1< - struct ArrayIntersector1 - { - typedef typename Intersector::Primitive Primitive; - typedef typename Intersector::Precalculations Precalculations; - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - for (size_t i=0; i - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - for (size_t i=0; i - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery &tquery, size_t& lazy_node) - { - bool changed = false; - for (size_t i=0; i - static __forceinline void intersectK(const vbool& valid, /* PrecalculationsK& pre, */ RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) - { - } - - template - static __forceinline vbool occludedK(const vbool& valid, /* PrecalculationsK& pre, */ RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) - { - return valid; - } - }; - - template - struct ArrayIntersectorK_1 - { - typedef typename Intersector::Primitive Primitive; - typedef typename Intersector::Precalculations Precalculations; - - template - static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - for (size_t i=0; i - static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - vbool valid0 = valid; - for (size_t i=0; i - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - for (size_t i=0; i - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - for (size_t i=0; i - struct ArrayIntersectorKStream - { - typedef typename IntersectorK::Primitive PrimitiveK; - typedef typename IntersectorK::Precalculations PrecalculationsK; - - static __forceinline void intersectK(const vbool& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayHitK& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) - { - PrecalculationsK pre(valid,ray); // FIXME: might cause trouble - - for (size_t i=0; i occludedK(const vbool& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) - { - PrecalculationsK pre(valid,ray); // FIXME: might cause trouble - vbool valid0 = valid; - for (size_t i=0; i& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) - { - PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble - for (size_t i=0; i& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) - { - PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble - for (size_t i=0; i** __restrict__ inputPackets, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) - { - size_t m_occluded = 0; - for (size_t i=0; i &ray = *inputPackets[rayID / K]; - const size_t k = rayID % K; - PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble - if (IntersectorK::occluded(pre,ray,k,context,prim[i])) - { - m_occluded |= (size_t)1 << rayID; - ray.tfar[k] = neg_inf; - } - } - } - return m_occluded; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h deleted file mode 100644 index eef5b0b1fd..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "curve_intersector_precalculations.h" - -namespace embree -{ - namespace isa - { - template - struct LineIntersectorHitM - { - __forceinline LineIntersectorHitM() {} - - __forceinline LineIntersectorHitM(const vfloat& u, const vfloat& v, const vfloat& t, const Vec3vf& Ng) - : vu(u), vv(v), vt(t), vNg(Ng) {} - - __forceinline void finalize() {} - - __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } - __forceinline float t (const size_t i) const { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } - - public: - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - }; - - template - struct FlatLinearCurveIntersector1 - { - typedef CurvePrecalculations1 Precalculations; - - template - static __forceinline bool intersect(const vbool& valid_i, - Ray& ray, - IntersectContext* context, - const LineSegments* geom, - const Precalculations& pre, - const Vec4vf& v0i, const Vec4vf& v1i, - const Epilog& epilog) - { - /* transform end points into ray space */ - vbool valid = valid_i; - vfloat depth_scale = pre.depth_scale; - LinearSpace3> ray_space = pre.ray_space; - - const Vec3vf ray_org ((Vec3fa)ray.org); - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); - - Vec4vf p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); - Vec4vf p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w); - - /* approximative intersection with cone */ - const Vec4vf v = p1-p0; - const Vec4vf w = -p0; - const vfloat d0 = madd(w.x,v.x,w.y*v.y); - const vfloat d1 = madd(v.x,v.x,v.y*v.y); - const vfloat u = clamp(d0*rcp(d1),vfloat(zero),vfloat(one)); - const Vec4vf p = madd(u,v,p0); - const vfloat t = p.z; - const vfloat d2 = madd(p.x,p.x,p.y*p.y); - const vfloat r = p.w; - const vfloat r2 = r*r; - valid &= (d2 <= r2) & (vfloat(ray.tnear()) <= t) & (t <= vfloat(ray.tfar)); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections - if (unlikely(none(valid))) return false; - - /* ignore denormalized segments */ - const Vec3vf T = v1.xyz()-v0.xyz(); - valid &= (T.x != vfloat(zero)) | (T.y != vfloat(zero)) | (T.z != vfloat(zero)); - if (unlikely(none(valid))) return false; - - /* update hit information */ - LineIntersectorHitM hit(u,zero,t,T); - return epilog(valid,hit); - } - }; - - template - struct FlatLinearCurveIntersectorK - { - typedef CurvePrecalculationsK Precalculations; - - template - static __forceinline bool intersect(const vbool& valid_i, - RayK& ray, size_t k, - IntersectContext* context, - const LineSegments* geom, - const Precalculations& pre, - const Vec4vf& v0i, const Vec4vf& v1i, - const Epilog& epilog) - { - /* transform end points into ray space */ - vbool valid = valid_i; - vfloat depth_scale = pre.depth_scale[k]; - LinearSpace3> ray_space = pre.ray_space[k]; - const Vec3vf ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); - const Vec3vf ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); - - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); - - Vec4vf p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); - Vec4vf p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w); - - /* approximative intersection with cone */ - const Vec4vf v = p1-p0; - const Vec4vf w = -p0; - const vfloat d0 = madd(w.x,v.x,w.y*v.y); - const vfloat d1 = madd(v.x,v.x,v.y*v.y); - const vfloat u = clamp(d0*rcp(d1),vfloat(zero),vfloat(one)); - const Vec4vf p = madd(u,v,p0); - const vfloat t = p.z; - const vfloat d2 = madd(p.x,p.x,p.y*p.y); - const vfloat r = p.w; - const vfloat r2 = r*r; - valid &= (d2 <= r2) & (vfloat(ray.tnear()[k]) <= t) & (t <= vfloat(ray.tfar[k])); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections - if (unlikely(none(valid))) return false; - - /* ignore denormalized segments */ - const Vec3vf T = v1.xyz()-v0.xyz(); - valid &= (T.x != vfloat(zero)) | (T.y != vfloat(zero)) | (T.z != vfloat(zero)); - if (unlikely(none(valid))) return false; - - /* update hit information */ - LineIntersectorHitM hit(u,zero,t,T); - return epilog(valid,hit); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei.h b/thirdparty/embree-aarch64/kernels/geometry/linei.h deleted file mode 100644 index a72029ca53..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/linei.h +++ /dev/null @@ -1,709 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" - -namespace embree -{ - template - struct LineMi - { - /* Virtual interface to query information about the line segment type */ - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* primitive supports multiple time segments */ - static const bool singleTimeSegment = false; - - /* Returns maximum number of stored line segments */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N line segments */ - static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } - - /* Returns required number of bytes for N line segments */ - static __forceinline size_t bytes(size_t N) { return blocks(N)*sizeof(LineMi); } - - public: - - /* Default constructor */ - __forceinline LineMi() { } - - /* Construction from vertices and IDs */ - __forceinline LineMi(const vuint& v0, unsigned short leftExists, unsigned short rightExists, const vuint& geomIDs, const vuint& primIDs, Geometry::GType gtype) - : gtype((unsigned char)gtype), m((unsigned char)popcnt(vuint(primIDs) != vuint(-1))), sharedGeomID(geomIDs[0]), leftExists (leftExists), rightExists(rightExists), v0(v0), primIDs(primIDs) - { - assert(all(vuint(geomID()) == geomIDs)); - } - - /* Returns a mask that tells which line segments are valid */ - __forceinline vbool valid() const { return primIDs != vuint(-1); } - - /* Returns a mask that tells which line segments are valid */ - template - __forceinline vbool valid() const { return vuint(primIDs) != vuint(-1); } - - /* Returns if the specified line segment is valid */ - __forceinline bool valid(const size_t i) const { assert(i - //static __forceinline T unmask(T &index) { return index & 0x3fffffff; } - - __forceinline unsigned int geomID(unsigned int i = 0) const { return sharedGeomID; } - //__forceinline vuint geomID() { return unmask(geomIDs); } - //__forceinline const vuint geomID() const { return unmask(geomIDs); } - //__forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } - __forceinline const vuint& primID() const { return primIDs; } - __forceinline unsigned int primID(const size_t i) const { assert(i& p0, - Vec4vf& p1, - const LineSegments* geom) const; - - __forceinline void gatheri(Vec4vf& p0, - Vec4vf& p1, - const LineSegments* geom, - const int itime) const; - - __forceinline void gather(Vec4vf& p0, - Vec4vf& p1, - const LineSegments* geom, - float time) const; - - /* gather the line segments with lateral info */ - __forceinline void gather(Vec4vf& p0, - Vec4vf& p1, - Vec4vf& pL, - Vec4vf& pR, - const LineSegments* geom) const; - - __forceinline void gatheri(Vec4vf& p0, - Vec4vf& p1, - Vec4vf& pL, - Vec4vf& pR, - const LineSegments* geom, - const int itime) const; - - __forceinline void gather(Vec4vf& p0, - Vec4vf& p1, - Vec4vf& pL, - Vec4vf& pR, - const LineSegments* geom, - float time) const; - - __forceinline void gather(Vec4vf& p0, - Vec4vf& p1, - vbool& cL, - vbool& cR, - const LineSegments* geom) const; - - __forceinline void gatheri(Vec4vf& p0, - Vec4vf& p1, - vbool& cL, - vbool& cR, - const LineSegments* geom, - const int itime) const; - - __forceinline void gather(Vec4vf& p0, - Vec4vf& p1, - vbool& cL, - vbool& cR, - const LineSegments* geom, - float time) const; - - /* Calculate the bounds of the line segments */ - __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const - { - BBox3fa bounds = empty; - for (size_t i=0; iget(geomID(i)); - const Vec3ff& p0 = geom->vertex(v0[i]+0,itime); - const Vec3ff& p1 = geom->vertex(v0[i]+1,itime); - BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1)); - b = enlarge(b,Vec3fa(max(p0.w,p1.w))); - bounds.extend(b); - } - return bounds; - } - - /* Calculate the linear bounds of the primitive */ - __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) { - return LBBox3fa(bounds(scene,itime+0), bounds(scene,itime+1)); - } - - __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) { - LBBox3fa allBounds = empty; - for (size_t i=0; iget(geomID(i)); - allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps)); - } - return allBounds; - } - - __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) - { - LBBox3fa allBounds = empty; - for (size_t i=0; iget(geomID((unsigned int)i)); - allBounds.extend(geom->linearBounds(primID(i), time_range)); - } - return allBounds; - } - - /* Fill line segment from line segment list */ - template - __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) - { - Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); - vuint geomID, primID; - vuint v0; - unsigned short leftExists = 0; - unsigned short rightExists = 0; - const PrimRefT* prim = &prims[begin]; - - for (size_t i=0; iget(prim->geomID()); - if (begingeomID(); - primID[i] = prim->primID(); - v0[i] = geom->segment(prim->primID()); - leftExists |= geom->segmentLeftExists(primID[i]) << i; - rightExists |= geom->segmentRightExists(primID[i]) << i; - begin++; - } else { - assert(i); - if (i>0) { - geomID[i] = geomID[i-1]; - primID[i] = -1; - v0[i] = v0[i-1]; - } - } - if (begin - __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range& set, const Allocator& alloc) - { - size_t start = set.begin(); - size_t items = LineMi::blocks(set.size()); - size_t numbytes = LineMi::bytes(set.size()); - LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float)); - for (size_t i=0; iscene); - } - return bvh->encodeLeaf((char*)accel,items); - }; - - __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) - { - fill(prims,begin,end,scene); - return linearBounds(scene,itime); - } - - __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) - { - fill(prims,begin,end,scene); - return linearBounds(scene,time_range); - } - - template - __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) - { - size_t start = prims.begin(); - size_t end = prims.end(); - size_t items = LineMi::blocks(prims.size()); - size_t numbytes = LineMi::bytes(prims.size()); - LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float)); - const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items); - - LBBox3fa bounds = empty; - for (size_t i=0; idata(),start,end,bvh->scene,prims.time_range)); - - return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range); - }; - - /* Updates the primitive */ - __forceinline BBox3fa update(LineSegments* geom) - { - BBox3fa bounds = empty; - for (size_t i=0; ivertex(v0[i]+0); - const Vec3ff& p1 = geom->vertex(v0[i]+1); - BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1)); - b = enlarge(b,Vec3fa(max(p0.w,p1.w))); - bounds.extend(b); - } - return bounds; - } - - /*! output operator */ - friend __forceinline embree_ostream operator<<(embree_ostream cout, const LineMi& line) { - return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}"; - } - - public: - unsigned char gtype; - unsigned char m; - unsigned int sharedGeomID; - unsigned short leftExists, rightExists; - vuint v0; // index of start vertex - private: - vuint primIDs; // primitive ID - }; - - template<> - __forceinline void LineMi<4>::gather(Vec4vf4& p0, - Vec4vf4& p1, - const LineSegments* geom) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); - transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); - - const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); - const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); - const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); - const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); - transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); - } - - template<> - __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, - Vec4vf4& p1, - const LineSegments* geom, - const int itime) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); - transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); - - const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); - const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); - const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); - const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); - transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); - } - - template<> - __forceinline void LineMi<4>::gather(Vec4vf4& p0, - Vec4vf4& p1, - const LineSegments* geom, - float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf4 a0,a1; - gatheri(a0,a1,geom,itime); - Vec4vf4 b0,b1; - gatheri(b0,b1,geom,itime+1); - p0 = lerp(a0,b0,vfloat4(ftime)); - p1 = lerp(a1,b1,vfloat4(ftime)); - } - - template<> - __forceinline void LineMi<4>::gather(Vec4vf4& p0, - Vec4vf4& p1, - vbool4& cL, - vbool4& cR, - const LineSegments* geom) const - { - gather(p0,p1,geom); - cL = !vbool4(leftExists); - cR = !vbool4(rightExists); - } - - template<> - __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, - Vec4vf4& p1, - vbool4& cL, - vbool4& cR, - const LineSegments* geom, - const int itime) const - { - gatheri(p0,p1,geom,itime); - cL = !vbool4(leftExists); - cR = !vbool4(rightExists); - } - - template<> - __forceinline void LineMi<4>::gather(Vec4vf4& p0, - Vec4vf4& p1, - vbool4& cL, - vbool4& cR, - const LineSegments* geom, - float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf4 a0,a1; - gatheri(a0,a1,geom,itime); - Vec4vf4 b0,b1; - gatheri(b0,b1,geom,itime+1); - p0 = lerp(a0,b0,vfloat4(ftime)); - p1 = lerp(a1,b1,vfloat4(ftime)); - cL = !vbool4(leftExists); - cR = !vbool4(rightExists); - } - - template<> - __forceinline void LineMi<4>::gather(Vec4vf4& p0, - Vec4vf4& p1, - Vec4vf4& pL, - Vec4vf4& pR, - const LineSegments* geom) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); - transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); - - const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); - const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); - const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); - const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); - transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); - - const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); - const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf); - const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf); - const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf); - transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w); - - const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); - const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf); - const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf); - const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf); - transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w); - } - - template<> - __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, - Vec4vf4& p1, - Vec4vf4& pL, - Vec4vf4& pR, - const LineSegments* geom, - const int itime) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); - transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); - - const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); - const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); - const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); - const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); - transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); - - const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); - const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf); - const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf); - const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf); - transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w); - - const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); - const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf); - const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf); - const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf); - transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w); - } - - template<> - __forceinline void LineMi<4>::gather(Vec4vf4& p0, - Vec4vf4& p1, - Vec4vf4& pL, - Vec4vf4& pR, - const LineSegments* geom, - float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf4 a0,a1,aL,aR; - gatheri(a0,a1,aL,aR,geom,itime); - Vec4vf4 b0,b1,bL,bR; - gatheri(b0,b1,bL,bR,geom,itime+1); - p0 = lerp(a0,b0,vfloat4(ftime)); - p1 = lerp(a1,b1,vfloat4(ftime)); - pL = lerp(aL,bL,vfloat4(ftime)); - pR = lerp(aR,bR,vfloat4(ftime)); - } - -#if defined(__AVX__) - - template<> - __forceinline void LineMi<8>::gather(Vec4vf8& p0, - Vec4vf8& p1, - const LineSegments* geom) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); - const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4])); - const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5])); - const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6])); - const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7])); - transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); - - const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); - const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); - const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); - const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); - const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1)); - const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1)); - const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1)); - const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1)); - transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); - } - - template<> - __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, - Vec4vf8& p1, - const LineSegments* geom, - const int itime) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); - const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime)); - const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime)); - const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime)); - const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime)); - transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); - - const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); - const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); - const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); - const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); - const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime)); - const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime)); - const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime)); - const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime)); - transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); - } - - template<> - __forceinline void LineMi<8>::gather(Vec4vf8& p0, - Vec4vf8& p1, - const LineSegments* geom, - float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf8 a0,a1; - gatheri(a0,a1,geom,itime); - Vec4vf8 b0,b1; - gatheri(b0,b1,geom,itime+1); - p0 = lerp(a0,b0,vfloat8(ftime)); - p1 = lerp(a1,b1,vfloat8(ftime)); - } - - template<> - __forceinline void LineMi<8>::gather(Vec4vf8& p0, - Vec4vf8& p1, - Vec4vf8& pL, - Vec4vf8& pR, - const LineSegments* geom) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); - const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4])); - const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5])); - const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6])); - const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7])); - transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); - - const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); - const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); - const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); - const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); - const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1)); - const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1)); - const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1)); - const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1)); - transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); - - const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); - const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf); - const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf); - const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf); - const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1)) : vfloat4(inf); - const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1)) : vfloat4(inf); - const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1)) : vfloat4(inf); - const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1)) : vfloat4(inf); - transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w); - - const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); - const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf); - const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf); - const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf); - const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2)) : vfloat4(inf); - const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2)) : vfloat4(inf); - const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2)) : vfloat4(inf); - const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2)) : vfloat4(inf); - transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w); - } - - template<> - __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, - Vec4vf8& p1, - Vec4vf8& pL, - Vec4vf8& pR, - const LineSegments* geom, - const int itime) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); - const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime)); - const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime)); - const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime)); - const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime)); - transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); - - const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); - const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); - const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); - const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); - const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime)); - const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime)); - const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime)); - const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime)); - transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); - - const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); - const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf); - const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf); - const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf); - const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1,itime)) : vfloat4(inf); - const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1,itime)) : vfloat4(inf); - const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1,itime)) : vfloat4(inf); - const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1,itime)) : vfloat4(inf); - transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w); - - const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); - const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf); - const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf); - const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf); - const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2,itime)) : vfloat4(inf); - const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2,itime)) : vfloat4(inf); - const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2,itime)) : vfloat4(inf); - const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2,itime)) : vfloat4(inf); - transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w); - } - - template<> - __forceinline void LineMi<8>::gather(Vec4vf8& p0, - Vec4vf8& p1, - Vec4vf8& pL, - Vec4vf8& pR, - const LineSegments* geom, - float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf8 a0,a1,aL,aR; - gatheri(a0,a1,aL,aR,geom,itime); - Vec4vf8 b0,b1,bL,bR; - gatheri(b0,b1,bL,bR,geom,itime+1); - p0 = lerp(a0,b0,vfloat8(ftime)); - p1 = lerp(a1,b1,vfloat8(ftime)); - pL = lerp(aL,bL,vfloat8(ftime)); - pR = lerp(aR,bR,vfloat8(ftime)); - } - - template<> - __forceinline void LineMi<8>::gather(Vec4vf8& p0, - Vec4vf8& p1, - vbool8& cL, - vbool8& cR, - const LineSegments* geom) const - { - gather(p0,p1,geom); - cL = !vbool8(leftExists); - cR = !vbool8(rightExists); - } - - template<> - __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, - Vec4vf8& p1, - vbool8& cL, - vbool8& cR, - const LineSegments* geom, - const int itime) const - { - gatheri(p0,p1,geom,itime); - cL = !vbool8(leftExists); - cR = !vbool8(rightExists); - } - - template<> - __forceinline void LineMi<8>::gather(Vec4vf8& p0, - Vec4vf8& p1, - vbool8& cL, - vbool8& cR, - const LineSegments* geom, - float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf8 a0,a1; - gatheri(a0,a1,geom,itime); - Vec4vf8 b0,b1; - gatheri(b0,b1,geom,itime+1); - p0 = lerp(a0,b0,vfloat8(ftime)); - p1 = lerp(a1,b1,vfloat8(ftime)); - cL = !vbool8(leftExists); - cR = !vbool8(rightExists); - } - -#endif - - template - typename LineMi::Type LineMi::type; - - typedef LineMi<4> Line4i; - typedef LineMi<8> Line8i; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h deleted file mode 100644 index a431796a88..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "linei.h" -#include "line_intersector.h" -#include "intersector_epilog.h" - -namespace embree -{ - namespace isa - { - template - struct FlatLinearCurveMiIntersector1 - { - typedef LineMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; line.gather(v0,v1,geom); - const vbool valid = line.template valid(); - FlatLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; line.gather(v0,v1,geom); - const vbool valid = line.template valid(); - return FlatLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) - { - return PrimitivePointQuery1::pointQuery(query, context, line); - } - }; - - template - struct FlatLinearCurveMiMBIntersector1 - { - typedef LineMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; line.gather(v0,v1,geom,ray.time()); - const vbool valid = line.template valid(); - FlatLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; line.gather(v0,v1,geom,ray.time()); - const vbool valid = line.template valid(); - return FlatLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) - { - return PrimitivePointQuery1::pointQuery(query, context, line); - } - }; - - template - struct FlatLinearCurveMiIntersectorK - { - typedef LineMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; line.gather(v0,v1,geom); - const vbool valid = line.template valid(); - FlatLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; line.gather(v0,v1,geom); - const vbool valid = line.template valid(); - return FlatLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - }; - - template - struct FlatLinearCurveMiMBIntersectorK - { - typedef LineMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; line.gather(v0,v1,geom,ray.time()[k]); - const vbool valid = line.template valid(); - FlatLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1; line.gather(v0,v1,geom,ray.time()[k]); - const vbool valid = line.template valid(); - return FlatLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/object.h b/thirdparty/embree-aarch64/kernels/geometry/object.h deleted file mode 100644 index f26391de52..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/object.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" - -namespace embree -{ - struct Object - { - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* primitive supports multiple time segments */ - static const bool singleTimeSegment = false; - - /* Returns maximum number of stored primitives */ - static __forceinline size_t max_size() { return 1; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return N; } - - public: - - /*! constructs a virtual object */ - Object (unsigned geomID, unsigned primID) - : _geomID(geomID), _primID(primID) {} - - __forceinline unsigned geomID() const { - return _geomID; - } - - __forceinline unsigned primID() const { - return _primID; - } - - /*! fill triangle from triangle list */ - __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene) - { - const PrimRef& prim = prims[i]; i++; - new (this) Object(prim.geomID(), prim.primID()); - } - - /*! fill triangle from triangle list */ - __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime) - { - const PrimRef& prim = prims[i]; i++; - const unsigned geomID = prim.geomID(); - const unsigned primID = prim.primID(); - new (this) Object(geomID, primID); - AccelSet* accel = (AccelSet*) scene->get(geomID); - return accel->linearBounds(primID,itime); - } - - /*! fill triangle from triangle list */ - __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range) - { - const PrimRefMB& prim = prims[i]; i++; - const unsigned geomID = prim.geomID(); - const unsigned primID = prim.primID(); - new (this) Object(geomID, primID); - AccelSet* accel = (AccelSet*) scene->get(geomID); - return accel->linearBounds(primID,time_range); - } - - /* Updates the primitive */ - __forceinline BBox3fa update(AccelSet* mesh) { - return mesh->bounds(primID()); - } - - private: - unsigned int _geomID; //!< geometry ID - unsigned int _primID; //!< primitive ID - }; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h deleted file mode 100644 index 97882e0e59..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "object.h" -#include "../common/ray.h" - -namespace embree -{ - namespace isa - { - template - struct ObjectIntersector1 - { - typedef Object Primitive; - - static const bool validIntersectorK = false; - - struct Precalculations { - __forceinline Precalculations() {} - __forceinline Precalculations (const Ray& ray, const void *ptr) {} - }; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) - { - AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); - - /* perform ray mask test */ -#if defined(EMBREE_RAY_MASK) - if ((ray.mask & accel->mask) == 0) - return; -#endif - - accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) - { - AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); - /* perform ray mask test */ -#if defined(EMBREE_RAY_MASK) - if ((ray.mask & accel->mask) == 0) - return false; -#endif - - accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); - return ray.tfar < 0.0f; - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim) - { - AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID()); - context->geomID = prim.geomID(); - context->primID = prim.primID(); - return accel->pointQuery(query, context); - } - - template - static __forceinline void intersectK(const vbool& valid, /* PrecalculationsK& pre, */ RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) - { - assert(false); - } - - template - static __forceinline vbool occludedK(const vbool& valid, /* PrecalculationsK& pre, */ RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) - { - assert(false); - return valid; - } - }; - - template - struct ObjectIntersectorK - { - typedef Object Primitive; - - struct Precalculations { - __forceinline Precalculations (const vbool& valid, const RayK& ray) {} - }; - - static __forceinline void intersect(const vbool& valid_i, const Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& prim) - { - vbool valid = valid_i; - AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); - - /* perform ray mask test */ -#if defined(EMBREE_RAY_MASK) - valid &= (ray.mask & accel->mask) != 0; - if (none(valid)) return; -#endif - accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1); - } - - static __forceinline vbool occluded(const vbool& valid_i, const Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& prim) - { - vbool valid = valid_i; - AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); - - /* perform ray mask test */ -#if defined(EMBREE_RAY_MASK) - valid &= (ray.mask & accel->mask) != 0; - if (none(valid)) return false; -#endif - accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); - return ray.tfar < 0.0f; - } - - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& prim) { - intersect(vbool(1<& ray, size_t k, IntersectContext* context, const Primitive& prim) { - occluded(vbool(1< ObjectIntersector4; - typedef ObjectIntersectorK<8,false> ObjectIntersector8; - typedef ObjectIntersectorK<16,false> ObjectIntersector16; - - typedef ObjectIntersectorK<4,true> ObjectIntersector4MB; - typedef ObjectIntersectorK<8,true> ObjectIntersector8MB; - typedef ObjectIntersectorK<16,true> ObjectIntersector16MB; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/plane.h b/thirdparty/embree-aarch64/kernels/geometry/plane.h deleted file mode 100644 index ebe45db558..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/plane.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" - -namespace embree -{ - namespace isa - { - struct HalfPlane - { - const Vec3fa P; //!< plane origin - const Vec3fa N; //!< plane normal - - __forceinline HalfPlane(const Vec3fa& P, const Vec3fa& N) - : P(P), N(N) {} - - __forceinline BBox1f intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const - { - Vec3fa O = Vec3fa(ray_org) - P; - Vec3fa D = Vec3fa(ray_dir); - float ON = dot(O,N); - float DN = dot(D,N); - bool eps = abs(DN) < min_rcp_input; - float t = -ON*rcp(DN); - float lower = select(eps || DN < 0.0f, float(neg_inf), t); - float upper = select(eps || DN > 0.0f, float(pos_inf), t); - return BBox1f(lower,upper); - } - }; - - template - struct HalfPlaneN - { - const Vec3vf P; //!< plane origin - const Vec3vf N; //!< plane normal - - __forceinline HalfPlaneN(const Vec3vf& P, const Vec3vf& N) - : P(P), N(N) {} - - __forceinline BBox> intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const - { - Vec3vf O = Vec3vf((Vec3fa)ray_org) - P; - Vec3vf D = Vec3vf((Vec3fa)ray_dir); - vfloat ON = dot(O,N); - vfloat DN = dot(D,N); - vbool eps = abs(DN) < min_rcp_input; - vfloat t = -ON*rcp(DN); - vfloat lower = select(eps | DN < 0.0f, vfloat(neg_inf), t); - vfloat upper = select(eps | DN > 0.0f, vfloat(pos_inf), t); - return BBox>(lower,upper); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/pointi.h b/thirdparty/embree-aarch64/kernels/geometry/pointi.h deleted file mode 100644 index 4ba298e86b..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/pointi.h +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" - -namespace embree -{ - template - struct PointMi - { - /* Virtual interface to query information about the line segment type */ - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - /* primitive supports multiple time segments */ - static const bool singleTimeSegment = false; - - /* Returns maximum number of stored line segments */ - static __forceinline size_t max_size() - { - return M; - } - - /* Returns required number of primitive blocks for N line segments */ - static __forceinline size_t blocks(size_t N) - { - return (N + max_size() - 1) / max_size(); - } - - /* Returns required number of bytes for N line segments */ - static __forceinline size_t bytes(size_t N) - { - return blocks(N) * sizeof(PointMi); - } - - public: - /* Default constructor */ - __forceinline PointMi() {} - - /* Construction from vertices and IDs */ - __forceinline PointMi(const vuint& geomIDs, const vuint& primIDs, Geometry::GType gtype, uint32_t numPrimitives) - : gtype((unsigned char)gtype), - numPrimitives(numPrimitives), - sharedGeomID(geomIDs[0]), - primIDs(primIDs) - { - assert(all(vuint(geomID()) == geomIDs)); - } - - /* Returns a mask that tells which line segments are valid */ - __forceinline vbool valid() const { - return vint(step) < vint(numPrimitives); - } - - /* Returns a mask that tells which line segments are valid */ - template __forceinline vbool valid() const { - return vint(step) < vint(numPrimitives); - } - - /* Returns if the specified line segment is valid */ - __forceinline bool valid(const size_t i) const - { - assert(i < M); - return i < numPrimitives; - } - - /* Returns the number of stored line segments */ - __forceinline size_t size() const { - return numPrimitives; - } - - __forceinline unsigned int geomID(unsigned int i = 0) const { - return sharedGeomID; - } - - __forceinline vuint& primID() { - return primIDs; - } - __forceinline const vuint& primID() const { - return primIDs; - } - __forceinline unsigned int primID(const size_t i) const { - assert(i < M); - return primIDs[i]; - } - - /* gather the line segments */ - __forceinline void gather(Vec4vf& p0, const Points* geom) const; - __forceinline void gather(Vec4vf& p0, Vec3vf& n0, const Points* geom) const; - - __forceinline void gatheri(Vec4vf& p0, const Points* geom, const int itime) const; - __forceinline void gatheri(Vec4vf& p0, Vec3vf& n0, const Points* geom, const int itime) const; - - __forceinline void gather(Vec4vf& p0, const Points* geom, float time) const; - __forceinline void gather(Vec4vf& p0, Vec3vf& n0, const Points* geom, float time) const; - - /* Calculate the bounds of the line segments */ - __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const - { - BBox3fa bounds = empty; - for (size_t i = 0; i < M && valid(i); i++) { - const Points* geom = scene->get(geomID(i)); - bounds.extend(geom->bounds(primID(i),itime)); - } - return bounds; - } - - /* Calculate the linear bounds of the primitive */ - __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) { - return LBBox3fa(bounds(scene, itime + 0), bounds(scene, itime + 1)); - } - - __forceinline LBBox3fa linearBounds(const Scene* const scene, size_t itime, size_t numTimeSteps) - { - LBBox3fa allBounds = empty; - for (size_t i = 0; i < M && valid(i); i++) { - const Points* geom = scene->get(geomID(i)); - allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps)); - } - return allBounds; - } - - __forceinline LBBox3fa linearBounds(const Scene* const scene, const BBox1f time_range) - { - LBBox3fa allBounds = empty; - for (size_t i = 0; i < M && valid(i); i++) { - const Points* geom = scene->get(geomID((unsigned int)i)); - allBounds.extend(geom->linearBounds(primID(i), time_range)); - } - return allBounds; - } - - /* Fill line segment from line segment list */ - template - __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) - { - Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); - vuint geomID, primID; - vuint v0; - const PrimRefT* prim = &prims[begin]; - - int numPrimitives = 0; - for (size_t i = 0; i < M; i++) { - if (begin < end) { - geomID[i] = prim->geomID(); - primID[i] = prim->primID(); - begin++; - numPrimitives++; - } else { - assert(i); - if (i > 0) { - geomID[i] = geomID[i - 1]; - primID[i] = primID[i - 1]; - } - } - if (begin < end) - prim = &prims[begin]; // FIXME: remove this line - } - new (this) PointMi(geomID, primID, gty, numPrimitives); // FIXME: use non temporal store - } - - template - __forceinline static typename BVH::NodeRef createLeaf(BVH* bvh, - const PrimRef* prims, - const range& set, - const Allocator& alloc) - { - size_t start = set.begin(); - size_t items = PointMi::blocks(set.size()); - size_t numbytes = PointMi::bytes(set.size()); - PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float)); - for (size_t i = 0; i < items; i++) { - accel[i].fill(prims, start, set.end(), bvh->scene); - } - return bvh->encodeLeaf((char*)accel, items); - }; - - __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) - { - fill(prims, begin, end, scene); - return linearBounds(scene, itime); - } - - __forceinline LBBox3fa fillMB( - const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) - { - fill(prims, begin, end, scene); - return linearBounds(scene, time_range); - } - - template - __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) - { - size_t start = prims.object_range.begin(); - size_t end = prims.object_range.end(); - size_t items = PointMi::blocks(prims.object_range.size()); - size_t numbytes = PointMi::bytes(prims.object_range.size()); - PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float)); - const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel, items); - - LBBox3fa bounds = empty; - for (size_t i = 0; i < items; i++) - bounds.extend(accel[i].fillMB(prims.prims->data(), start, end, bvh->scene, prims.time_range)); - - return typename BVH::NodeRecordMB4D(node, bounds, prims.time_range); - }; - - /*! output operator */ - friend __forceinline embree_ostream operator<<(embree_ostream cout, const PointMi& line) - { - return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}"; - } - - public: - unsigned char gtype; - unsigned char numPrimitives; - unsigned int sharedGeomID; - - private: - vuint primIDs; // primitive ID - }; - - template<> - __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); - transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); - } - - template<> - __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); - transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); - const vfloat4 b0 = vfloat4(geom->normal(primID(0))); - const vfloat4 b1 = vfloat4(geom->normal(primID(1))); - const vfloat4 b2 = vfloat4(geom->normal(primID(2))); - const vfloat4 b3 = vfloat4(geom->normal(primID(3))); - transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z); - } - - template<> - __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, const Points* geom, const int itime) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); - transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); - } - - template<> - __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, const int itime) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); - transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); - const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime)); - const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime)); - const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime)); - const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime)); - transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z); - } - - template<> - __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom, float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf4 a0; gatheri(a0, geom, itime); - Vec4vf4 b0; gatheri(b0, geom, itime + 1); - p0 = lerp(a0, b0, vfloat4(ftime)); - } - - template<> - __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf4 a0, b0; - Vec3vf4 norm0, norm1; - gatheri(a0, norm0, geom, itime); - gatheri(b0, norm1, geom, itime + 1); - p0 = lerp(a0, b0, vfloat4(ftime)); - n0 = lerp(norm0, norm1, vfloat4(ftime)); - } - -#if defined(__AVX__) - - template<> - __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); - const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4))); - const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5))); - const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6))); - const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7))); - transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); - } - - template<> - __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); - const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4))); - const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5))); - const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6))); - const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7))); - transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); - const vfloat4 b0 = vfloat4(geom->normal(primID(0))); - const vfloat4 b1 = vfloat4(geom->normal(primID(1))); - const vfloat4 b2 = vfloat4(geom->normal(primID(2))); - const vfloat4 b3 = vfloat4(geom->normal(primID(3))); - const vfloat4 b4 = vfloat4(geom->normal(primID(4))); - const vfloat4 b5 = vfloat4(geom->normal(primID(5))); - const vfloat4 b6 = vfloat4(geom->normal(primID(6))); - const vfloat4 b7 = vfloat4(geom->normal(primID(7))); - transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z); - } - - template<> - __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, const Points* geom, const int itime) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); - const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime)); - const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime)); - const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime)); - const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime)); - transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); - } - - template<> - __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, const int itime) const - { - const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); - const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); - const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); - const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); - const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime)); - const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime)); - const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime)); - const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime)); - transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); - const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime)); - const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime)); - const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime)); - const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime)); - const vfloat4 b4 = vfloat4(geom->normal(primID(4), itime)); - const vfloat4 b5 = vfloat4(geom->normal(primID(5), itime)); - const vfloat4 b6 = vfloat4(geom->normal(primID(6), itime)); - const vfloat4 b7 = vfloat4(geom->normal(primID(7), itime)); - transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z); - } - - template<> - __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom, float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf8 a0; - gatheri(a0, geom, itime); - Vec4vf8 b0; - gatheri(b0, geom, itime + 1); - p0 = lerp(a0, b0, vfloat8(ftime)); - } - - template<> - __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, float time) const - { - float ftime; - const int itime = geom->timeSegment(time, ftime); - - Vec4vf8 a0, b0; - Vec3vf8 norm0, norm1; - gatheri(a0, norm0, geom, itime); - gatheri(b0, norm1, geom, itime + 1); - p0 = lerp(a0, b0, vfloat8(ftime)); - n0 = lerp(norm0, norm1, vfloat8(ftime)); - } -#endif - - template - typename PointMi::Type PointMi::type; - - typedef PointMi<4> Point4i; - typedef PointMi<8> Point8i; - -} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive.h b/thirdparty/embree-aarch64/kernels/geometry/primitive.h deleted file mode 100644 index 41e5b2b304..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/primitive.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/default.h" -#include "../common/scene.h" -#include "../../common/simd/simd.h" -#include "../common/primref.h" -#include "../common/primref_mb.h" - -namespace embree -{ - struct PrimitiveType - { - /*! returns name of this primitive type */ - virtual const char* name() const = 0; - - /*! Returns the number of stored active primitives in a block. */ - virtual size_t sizeActive(const char* This) const = 0; - - /*! Returns the number of stored active and inactive primitives in a block. */ - virtual size_t sizeTotal(const char* This) const = 0; - - /*! Returns the number of bytes of block. */ - virtual size_t getBytes(const char* This) const = 0; - }; - - template - struct PrimitivePointQuery1 - { - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim) - { - bool changed = false; - for (size_t i = 0; i < Primitive::max_size(); i++) - { - if (!prim.valid(i)) break; - STAT3(point_query.trav_prims,1,1,1); - AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID(i)); - context->geomID = prim.geomID(i); - context->primID = prim.primID(i); - changed |= accel->pointQuery(query, context); - } - return changed; - } - - static __forceinline void pointQueryNoop(PointQuery* query, PointQueryContext* context, const Primitive& prim) { } - }; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp b/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp deleted file mode 100644 index f93574c9c8..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp +++ /dev/null @@ -1,379 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "primitive.h" -#include "curveNv.h" -#include "curveNi.h" -#include "curveNi_mb.h" -#include "linei.h" -#include "triangle.h" -#include "trianglev.h" -#include "trianglev_mb.h" -#include "trianglei.h" -#include "quadv.h" -#include "quadi.h" -#include "subdivpatch1.h" -#include "object.h" -#include "instance.h" -#include "subgrid.h" - -namespace embree -{ - /********************** Curve4v **************************/ - - template<> - const char* Curve4v::Type::name () const { - return "curve4v"; - } - - template<> - size_t Curve4v::Type::sizeActive(const char* This) const - { - if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) - return ((Line4i*)This)->size(); - else - return ((Curve4v*)This)->N; - } - - template<> - size_t Curve4v::Type::sizeTotal(const char* This) const - { - if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) - return 4; - else - return ((Curve4v*)This)->N; - } - - template<> - size_t Curve4v::Type::getBytes(const char* This) const - { - if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) - return Line4i::bytes(sizeActive(This)); - else - return Curve4v::bytes(sizeActive(This)); - } - - /********************** Curve4i **************************/ - - template<> - const char* Curve4i::Type::name () const { - return "curve4i"; - } - - template<> - size_t Curve4i::Type::sizeActive(const char* This) const - { - if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) - return ((Line4i*)This)->size(); - else - return ((Curve4i*)This)->N; - } - - template<> - size_t Curve4i::Type::sizeTotal(const char* This) const - { - if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) - return 4; - else - return ((Curve4i*)This)->N; - } - - template<> - size_t Curve4i::Type::getBytes(const char* This) const - { - if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) - return Line4i::bytes(sizeActive(This)); - else - return Curve4i::bytes(sizeActive(This)); - } - - /********************** Curve4iMB **************************/ - - template<> - const char* Curve4iMB::Type::name () const { - return "curve4imb"; - } - - template<> - size_t Curve4iMB::Type::sizeActive(const char* This) const - { - if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) - return ((Line4i*)This)->size(); - else - return ((Curve4iMB*)This)->N; - } - - template<> - size_t Curve4iMB::Type::sizeTotal(const char* This) const - { - if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) - return 4; - else - return ((Curve4iMB*)This)->N; - } - - template<> - size_t Curve4iMB::Type::getBytes(const char* This) const - { - if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) - return Line4i::bytes(sizeActive(This)); - else - return Curve4iMB::bytes(sizeActive(This)); - } - - /********************** Line4i **************************/ - - template<> - const char* Line4i::Type::name () const { - return "line4i"; - } - - template<> - size_t Line4i::Type::sizeActive(const char* This) const { - return ((Line4i*)This)->size(); - } - - template<> - size_t Line4i::Type::sizeTotal(const char* This) const { - return 4; - } - - template<> - size_t Line4i::Type::getBytes(const char* This) const { - return sizeof(Line4i); - } - - /********************** Triangle4 **************************/ - - template<> - const char* Triangle4::Type::name () const { - return "triangle4"; - } - - template<> - size_t Triangle4::Type::sizeActive(const char* This) const { - return ((Triangle4*)This)->size(); - } - - template<> - size_t Triangle4::Type::sizeTotal(const char* This) const { - return 4; - } - - template<> - size_t Triangle4::Type::getBytes(const char* This) const { - return sizeof(Triangle4); - } - - /********************** Triangle4v **************************/ - - template<> - const char* Triangle4v::Type::name () const { - return "triangle4v"; - } - - template<> - size_t Triangle4v::Type::sizeActive(const char* This) const { - return ((Triangle4v*)This)->size(); - } - - template<> - size_t Triangle4v::Type::sizeTotal(const char* This) const { - return 4; - } - - template<> - size_t Triangle4v::Type::getBytes(const char* This) const { - return sizeof(Triangle4v); - } - - /********************** Triangle4i **************************/ - - template<> - const char* Triangle4i::Type::name () const { - return "triangle4i"; - } - - template<> - size_t Triangle4i::Type::sizeActive(const char* This) const { - return ((Triangle4i*)This)->size(); - } - - template<> - size_t Triangle4i::Type::sizeTotal(const char* This) const { - return 4; - } - - template<> - size_t Triangle4i::Type::getBytes(const char* This) const { - return sizeof(Triangle4i); - } - - /********************** Triangle4vMB **************************/ - - template<> - const char* Triangle4vMB::Type::name () const { - return "triangle4vmb"; - } - - template<> - size_t Triangle4vMB::Type::sizeActive(const char* This) const { - return ((Triangle4vMB*)This)->size(); - } - - template<> - size_t Triangle4vMB::Type::sizeTotal(const char* This) const { - return 4; - } - - template<> - size_t Triangle4vMB::Type::getBytes(const char* This) const { - return sizeof(Triangle4vMB); - } - - /********************** Quad4v **************************/ - - template<> - const char* Quad4v::Type::name () const { - return "quad4v"; - } - - template<> - size_t Quad4v::Type::sizeActive(const char* This) const { - return ((Quad4v*)This)->size(); - } - - template<> - size_t Quad4v::Type::sizeTotal(const char* This) const { - return 4; - } - - template<> - size_t Quad4v::Type::getBytes(const char* This) const { - return sizeof(Quad4v); - } - - /********************** Quad4i **************************/ - - template<> - const char* Quad4i::Type::name () const { - return "quad4i"; - } - - template<> - size_t Quad4i::Type::sizeActive(const char* This) const { - return ((Quad4i*)This)->size(); - } - - template<> - size_t Quad4i::Type::sizeTotal(const char* This) const { - return 4; - } - - template<> - size_t Quad4i::Type::getBytes(const char* This) const { - return sizeof(Quad4i); - } - - /********************** SubdivPatch1 **************************/ - - const char* SubdivPatch1::Type::name () const { - return "subdivpatch1"; - } - - size_t SubdivPatch1::Type::sizeActive(const char* This) const { - return 1; - } - - size_t SubdivPatch1::Type::sizeTotal(const char* This) const { - return 1; - } - - size_t SubdivPatch1::Type::getBytes(const char* This) const { - return sizeof(SubdivPatch1); - } - - SubdivPatch1::Type SubdivPatch1::type; - - /********************** Virtual Object **************************/ - - const char* Object::Type::name () const { - return "object"; - } - - size_t Object::Type::sizeActive(const char* This) const { - return 1; - } - - size_t Object::Type::sizeTotal(const char* This) const { - return 1; - } - - size_t Object::Type::getBytes(const char* This) const { - return sizeof(Object); - } - - Object::Type Object::type; - - /********************** Instance **************************/ - - const char* InstancePrimitive::Type::name () const { - return "instance"; - } - - size_t InstancePrimitive::Type::sizeActive(const char* This) const { - return 1; - } - - size_t InstancePrimitive::Type::sizeTotal(const char* This) const { - return 1; - } - - size_t InstancePrimitive::Type::getBytes(const char* This) const { - return sizeof(InstancePrimitive); - } - - InstancePrimitive::Type InstancePrimitive::type; - - /********************** SubGrid **************************/ - - const char* SubGrid::Type::name () const { - return "subgrid"; - } - - size_t SubGrid::Type::sizeActive(const char* This) const { - return 1; - } - - size_t SubGrid::Type::sizeTotal(const char* This) const { - return 1; - } - - size_t SubGrid::Type::getBytes(const char* This) const { - return sizeof(SubGrid); - } - - SubGrid::Type SubGrid::type; - - /********************** SubGridQBVH4 **************************/ - - template<> - const char* SubGridQBVH4::Type::name () const { - return "SubGridQBVH4"; - } - - template<> - size_t SubGridQBVH4::Type::sizeActive(const char* This) const { - return 1; - } - - template<> - size_t SubGridQBVH4::Type::sizeTotal(const char* This) const { - return 1; - } - - template<> - size_t SubGridQBVH4::Type::getBytes(const char* This) const { - return sizeof(SubGridQBVH4); - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h deleted file mode 100644 index 57ff4e60e5..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - namespace isa - { - /*! Intersects a ray with a quad with backface culling - * enabled. The quad v0,v1,v2,v3 is split into two triangles - * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two - * triangles gets intersected. */ - template - __forceinline vbool intersect_quad_backface_culling(const vbool& valid0, - const Vec3fa& ray_org, - const Vec3fa& ray_dir, - const float ray_tnear, - const float ray_tfar, - const Vec3vf& quad_v0, - const Vec3vf& quad_v1, - const Vec3vf& quad_v2, - const Vec3vf& quad_v3, - vfloat& u_o, - vfloat& v_o, - vfloat& t_o) - { - /* calculate vertices relative to ray origin */ - vbool valid = valid0; - const Vec3vf O = Vec3vf(ray_org); - const Vec3vf D = Vec3vf(ray_dir); - const Vec3vf va = quad_v0-O; - const Vec3vf vb = quad_v1-O; - const Vec3vf vc = quad_v2-O; - const Vec3vf vd = quad_v3-O; - - const Vec3vf edb = vb-vd; - const vfloat WW = dot(cross(vd,edb),D); - const Vec3vf v0 = select(WW <= 0.0f,va,vc); - const Vec3vf v1 = select(WW <= 0.0f,vb,vd); - const Vec3vf v2 = select(WW <= 0.0f,vd,vb); - - /* calculate edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - - /* perform edge tests */ - const vfloat U = dot(cross(v0,e0),D); - const vfloat V = dot(cross(v1,e1),D); - valid &= max(U,V) <= 0.0f; - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = cross(e1,e0); - const vfloat den = dot(Ng,D); - const vfloat rcpDen = rcp(den); - - /* perform depth test */ - const vfloat t = rcpDen*dot(v0,Ng); - valid &= vfloat(ray_tnear) <= t & t <= vfloat(ray_tfar); - if (unlikely(none(valid))) return false; - - /* avoid division by 0 */ - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* update hit information */ - t_o = t; - u_o = U * rcpDen; - v_o = V * rcpDen; - u_o = select(WW <= 0.0f,u_o,1.0f-u_o); - v_o = select(WW <= 0.0f,v_o,1.0f-v_o); - return valid; - } - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h deleted file mode 100644 index 74e8c7720c..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h +++ /dev/null @@ -1,566 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "quadv.h" -#include "triangle_intersector_moeller.h" - -namespace embree -{ - namespace isa - { - template - struct QuadHitM - { - __forceinline QuadHitM() {} - - __forceinline QuadHitM(const vbool& valid, - const vfloat& U, - const vfloat& V, - const vfloat& T, - const vfloat& absDen, - const Vec3vf& Ng, - const vbool& flags) - : U(U), V(V), T(T), absDen(absDen), tri_Ng(Ng), valid(valid), flags(flags) {} - - __forceinline void finalize() - { - const vfloat rcpAbsDen = rcp(absDen); - vt = T * rcpAbsDen; - const vfloat u = min(U * rcpAbsDen,1.0f); - const vfloat v = min(V * rcpAbsDen,1.0f); - const vfloat u1 = vfloat(1.0f) - u; - const vfloat v1 = vfloat(1.0f) - v; -#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) - vu = select(flags,u1,u); - vv = select(flags,v1,v); - vNg = Vec3vf(tri_Ng.x,tri_Ng.y,tri_Ng.z); -#else - const vfloat flip = select(flags,vfloat(-1.0f),vfloat(1.0f)); - vv = select(flags,u1,v); - vu = select(flags,v1,u); - vNg = Vec3vf(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z); -#endif - } - - __forceinline Vec2f uv(const size_t i) - { - const float u = vu[i]; - const float v = vv[i]; - return Vec2f(u,v); - } - - __forceinline float t(const size_t i) { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } - - private: - vfloat U; - vfloat V; - vfloat T; - vfloat absDen; - Vec3vf tri_Ng; - - public: - vbool valid; - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - - public: - const vbool flags; - }; - - template - struct QuadHitK - { - __forceinline QuadHitK(const vfloat& U, - const vfloat& V, - const vfloat& T, - const vfloat& absDen, - const Vec3vf& Ng, - const vbool& flags) - : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng) {} - - __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const - { - const vfloat rcpAbsDen = rcp(absDen); - const vfloat t = T * rcpAbsDen; - const vfloat u0 = min(U * rcpAbsDen,1.0f); - const vfloat v0 = min(V * rcpAbsDen,1.0f); - const vfloat u1 = vfloat(1.0f) - u0; - const vfloat v1 = vfloat(1.0f) - v0; - const vfloat u = select(flags,u1,u0); - const vfloat v = select(flags,v1,v0); - const Vec3vf Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); - return std::make_tuple(u,v,t,Ng); - } - - private: - const vfloat U; - const vfloat V; - const vfloat T; - const vfloat absDen; - const vbool flags; - const Vec3vf tri_Ng; - }; - - /* ----------------------------- */ - /* -- single ray intersectors -- */ - /* ----------------------------- */ - - - template - struct QuadMIntersector1MoellerTrumbore; - - /*! Intersects M quads with 1 ray */ - template - struct QuadMIntersector1MoellerTrumbore - { - __forceinline QuadMIntersector1MoellerTrumbore() {} - - __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} - - __forceinline void intersect(RayHit& ray, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const vuint& geomID, const vuint& primID) const - { - MoellerTrumboreHitM hit; - MoellerTrumboreIntersector1 intersector(ray,nullptr); - Intersect1EpilogM epilog(ray,context,geomID,primID); - - /* intersect first triangle */ - if (intersector.intersect(ray,v0,v1,v3,hit)) - epilog(hit.valid,hit); - - /* intersect second triangle */ - if (intersector.intersect(ray,v2,v3,v1,hit)) - { - hit.U = hit.absDen - hit.U; - hit.V = hit.absDen - hit.V; - epilog(hit.valid,hit); - } - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const vuint& geomID, const vuint& primID) const - { - MoellerTrumboreHitM hit; - MoellerTrumboreIntersector1 intersector(ray,nullptr); - Occluded1EpilogM epilog(ray,context,geomID,primID); - - /* intersect first triangle */ - if (intersector.intersect(ray,v0,v1,v3,hit)) - { - if (epilog(hit.valid,hit)) - return true; - } - - /* intersect second triangle */ - if (intersector.intersect(ray,v2,v3,v1,hit)) - { - hit.U = hit.absDen - hit.U; - hit.V = hit.absDen - hit.V; - if (epilog(hit.valid,hit)) - return true; - } - return false; - } - }; - -#if defined(__AVX512ER__) // KNL - - /*! Intersects 4 quads with 1 ray using AVX512 */ - template - struct QuadMIntersector1MoellerTrumbore<4,filter> - { - __forceinline QuadMIntersector1MoellerTrumbore() {} - - __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} - - template - __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const - { - const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), - select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), - select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z))); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); - const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); -#else - const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), - select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)), - select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z))); - const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)), - select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)), - select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z))); -#endif - const vbool16 flags(0xf0f0); - - MoellerTrumboreHitM<16> hit; - MoellerTrumboreIntersector1<16> intersector(ray,nullptr); - if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit))) - { - vfloat16 U = hit.U, V = hit.V, absDen = hit.absDen; -#if !defined(EMBREE_BACKFACE_CULLING) - hit.U = select(flags,absDen-V,U); - hit.V = select(flags,absDen-U,V); - hit.vNg *= select(flags,vfloat16(-1.0f),vfloat16(1.0f)); // FIXME: use XOR -#else - hit.U = select(flags,absDen-U,U); - hit.V = select(flags,absDen-V,V); -#endif - if (likely(epilog(hit.valid,hit))) - return true; - } - return false; - } - - __forceinline bool intersect(RayHit& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID))); - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID))); - } - }; - -#elif defined(__AVX__) - - /*! Intersects 4 quads with 1 ray using AVX */ - template - struct QuadMIntersector1MoellerTrumbore<4,filter> - { - __forceinline QuadMIntersector1MoellerTrumbore() {} - - __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} - - template - __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const - { - const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); - const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); -#else - const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); - const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); -#endif - MoellerTrumboreHitM<8> hit; - MoellerTrumboreIntersector1<8> intersector(ray,nullptr); - const vbool8 flags(0,0,0,0,1,1,1,1); - if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit))) - { - vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen; - -#if !defined(EMBREE_BACKFACE_CULLING) - hit.U = select(flags,absDen-V,U); - hit.V = select(flags,absDen-U,V); - hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // FIXME: use XOR -#else - hit.U = select(flags,absDen-U,U); - hit.V = select(flags,absDen-V,V); -#endif - if (unlikely(epilog(hit.valid,hit))) - return true; - } - return false; - } - - __forceinline bool intersect(RayHit& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID))); - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID))); - } - }; - -#endif - - /* ----------------------------- */ - /* -- ray packet intersectors -- */ - /* ----------------------------- */ - - - struct MoellerTrumboreIntersector1KTriangleM - { - /*! Intersect k'th ray from ray packet of size K with M triangles. */ - template - static __forceinline bool intersect(RayK& ray, - size_t k, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Vec3vf& tri_Ng, - const vbool& flags, - const Epilog& epilog) - { - /* calculate denominator */ - const Vec3vf O = broadcast>(ray.org,k); - const Vec3vf D = broadcast>(ray.dir,k); - const Vec3vf C = Vec3vf(tri_v0) - O; - const Vec3vf R = cross(C,D); - const vfloat den = dot(Vec3vf(tri_Ng),D); - const vfloat absDen = abs(den); - const vfloat sgnDen = signmsk(den); - - /* perform edge tests */ - const vfloat U = dot(R,Vec3vf(tri_e2)) ^ sgnDen; - const vfloat V = dot(R,Vec3vf(tri_e1)) ^ sgnDen; - - /* perform backface culling */ -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#else - vbool valid = (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#endif - if (likely(none(valid))) return false; - - /* perform depth test */ - const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; - valid &= (absDen*vfloat(ray.tnear()[k]) < T) & (T <= absDen*vfloat(ray.tfar[k])); - if (likely(none(valid))) return false; - - /* calculate hit information */ - QuadHitM hit(valid,U,V,T,absDen,tri_Ng,flags); - return epilog(valid,hit); - } - - template - static __forceinline bool intersect1(RayK& ray, - size_t k, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const vbool& flags, - const Epilog& epilog) - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - const Vec3vf Ng = cross(e2,e1); - return intersect(ray,k,v0,e1,e2,Ng,flags,epilog); - } - }; - - template - struct QuadMIntersectorKMoellerTrumboreBase - { - __forceinline QuadMIntersectorKMoellerTrumboreBase(const vbool& valid, const RayK& ray) {} - - /*! Intersects K rays with one of M triangles. */ - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Vec3vf& tri_Ng, - const vbool& flags, - const Epilog& epilog) const - { - /* calculate denominator */ - vbool valid = valid0; - const Vec3vf C = tri_v0 - ray.org; - const Vec3vf R = cross(C,ray.dir); - const vfloat den = dot(tri_Ng,ray.dir); - const vfloat absDen = abs(den); - const vfloat sgnDen = signmsk(den); - - /* test against edge p2 p0 */ - const vfloat U = dot(R,tri_e2) ^ sgnDen; - valid &= U >= 0.0f; - if (likely(none(valid))) return false; - - /* test against edge p0 p1 */ - const vfloat V = dot(R,tri_e1) ^ sgnDen; - valid &= V >= 0.0f; - if (likely(none(valid))) return false; - - /* test against edge p1 p2 */ - const vfloat W = absDen-U-V; - valid &= W >= 0.0f; - if (likely(none(valid))) return false; - - /* perform depth test */ - const vfloat T = dot(tri_Ng,C) ^ sgnDen; - valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar); - if (unlikely(none(valid))) return false; - - /* perform backface culling */ -#if defined(EMBREE_BACKFACE_CULLING) - valid &= den < vfloat(zero); - if (unlikely(none(valid))) return false; -#else - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; -#endif - - /* calculate hit information */ - QuadHitK hit(U,V,T,absDen,tri_Ng,flags); - return epilog(valid,hit); - } - - /*! Intersects K rays with one of M quads. */ - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const vbool& flags, - const Epilog& epilog) const - { - const Vec3vf e1 = tri_v0-tri_v1; - const Vec3vf e2 = tri_v2-tri_v0; - const Vec3vf Ng = cross(e2,e1); - return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,epilog); - } - - /*! Intersects K rays with one of M quads. */ - template - __forceinline bool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Vec3vf& v3, - const Epilog& epilog) const - { - intersectK(valid0,ray,v0,v1,v3,vbool(false),epilog); - if (none(valid0)) return true; - intersectK(valid0,ray,v2,v3,v1,vbool(true ),epilog); - return none(valid0); - } - }; - - template - struct QuadMIntersectorKMoellerTrumbore : public QuadMIntersectorKMoellerTrumboreBase - { - __forceinline QuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) - : QuadMIntersectorKMoellerTrumboreBase(valid,ray) {} - - __forceinline void intersect1(RayHitK& ray, size_t k, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const vuint& geomID, const vuint& primID) const - { - Intersect1KEpilogM epilog(ray,k,context,geomID,primID); - MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool(false),epilog); - MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool(true ),epilog); - } - - __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const vuint& geomID, const vuint& primID) const - { - Occluded1KEpilogM epilog(ray,k,context,geomID,primID); - if (MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool(false),epilog)) return true; - if (MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool(true ),epilog)) return true; - return false; - } - }; - - -#if defined(__AVX512ER__) // KNL - - /*! Intersects 4 quads with 1 ray using AVX512 */ - template - struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter> - { - __forceinline QuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) - : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} - - template - __forceinline bool intersect1(RayK& ray, size_t k, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const - { - const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), - select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), - select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z))); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); - const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); -#else - const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), - select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)), - select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z))); - const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)), - select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)), - select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z))); -#endif - const vbool16 flags(0xf0f0); - return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); - } - - __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); - } - - __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); - } - }; - -#elif defined(__AVX__) - - /*! Intersects 4 quads with 1 ray using AVX */ - template - struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter> - { - __forceinline QuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) - : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} - - template - __forceinline bool intersect1(RayK& ray, size_t k, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const - { - const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); - const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); -#else - const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); - const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); -#endif - const vbool8 flags(0,0,0,0,1,1,1,1); - return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); - } - - __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); - } - - __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); - } - }; - -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h deleted file mode 100644 index 7ca3aed0a0..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h +++ /dev/null @@ -1,529 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "quad_intersector_moeller.h" - -/*! Modified Pluecker ray/triangle intersector. The test first shifts - * the ray origin into the origin of the coordinate system and then - * uses Pluecker coordinates for the intersection. Due to the shift, - * the Pluecker coordinate calculation simplifies and the tests get - * numerically stable. The edge equations are watertight along the - * edge for neighboring triangles. */ - -namespace embree -{ - namespace isa - { - template - struct QuadHitPlueckerM - { - __forceinline QuadHitPlueckerM() {} - - __forceinline QuadHitPlueckerM(const vbool& valid, - const vfloat& U, - const vfloat& V, - const vfloat& UVW, - const vfloat& t, - const Vec3vf& Ng, - const vbool& flags) - : U(U), V(V), UVW(UVW), tri_Ng(Ng), valid(valid), vt(t), flags(flags) {} - - __forceinline void finalize() - { - const vbool invalid = abs(UVW) < min_rcp_input; - const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); - const vfloat u = min(U * rcpUVW,1.0f); - const vfloat v = min(V * rcpUVW,1.0f); - const vfloat u1 = vfloat(1.0f) - u; - const vfloat v1 = vfloat(1.0f) - v; -#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) - vu = select(flags,u1,u); - vv = select(flags,v1,v); - vNg = Vec3vf(tri_Ng.x,tri_Ng.y,tri_Ng.z); -#else - const vfloat flip = select(flags,vfloat(-1.0f),vfloat(1.0f)); - vv = select(flags,u1,v); - vu = select(flags,v1,u); - vNg = Vec3vf(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z); -#endif - } - - __forceinline Vec2f uv(const size_t i) - { - const float u = vu[i]; - const float v = vv[i]; - return Vec2f(u,v); - } - - __forceinline float t(const size_t i) { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } - - private: - vfloat U; - vfloat V; - vfloat UVW; - Vec3vf tri_Ng; - - public: - vbool valid; - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - - public: - const vbool flags; - }; - - template - struct QuadHitPlueckerK - { - __forceinline QuadHitPlueckerK(const vfloat& U, - const vfloat& V, - const vfloat& UVW, - const vfloat& t, - const Vec3vf& Ng, - const vbool& flags) - : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng) {} - - __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const - { - const vbool invalid = abs(UVW) < min_rcp_input; - const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); - const vfloat u0 = min(U * rcpUVW,1.0f); - const vfloat v0 = min(V * rcpUVW,1.0f); - const vfloat u1 = vfloat(1.0f) - u0; - const vfloat v1 = vfloat(1.0f) - v0; - const vfloat u = select(flags,u1,u0); - const vfloat v = select(flags,v1,v0); - const Vec3vf Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); - return std::make_tuple(u,v,t,Ng); - } - - private: - const vfloat U; - const vfloat V; - const vfloat UVW; - const vfloat t; - const vbool flags; - const Vec3vf tri_Ng; - }; - - struct PlueckerIntersectorTriangle1 - { - template - static __forceinline bool intersect(Ray& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const vbool& flags, - const Epilog& epilog) - { - /* calculate vertices relative to ray origin */ - const Vec3vf O = Vec3vf((Vec3fa)ray.org); - const Vec3vf D = Vec3vf((Vec3fa)ray.dir); - const Vec3vf v0 = tri_v0-O; - const Vec3vf v1 = tri_v1-O; - const Vec3vf v2 = tri_v2-O; - - /* calculate triangle edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v1-v2; - - /* perform edge tests */ - const vfloat U = dot(cross(e0,v2+v0),D); - const vfloat V = dot(cross(e1,v0+v1),D); - const vfloat W = dot(cross(e2,v1+v2),D); - const vfloat UVW = U+V+W; - const vfloat eps = float(ulp)*abs(UVW); -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = max(U,V,W) <= eps; -#else - vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); -#endif - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); - const vfloat den = twice(dot(Ng,D)); - - /* perform depth test */ - const vfloat T = twice(dot(v0,Ng)); - const vfloat t = rcp(den)*T; - valid &= vfloat(ray.tnear()) <= t & t <= vfloat(ray.tfar); - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* update hit information */ - QuadHitPlueckerM hit(valid,U,V,UVW,t,Ng,flags); - return epilog(valid,hit); - } - }; - - /*! Intersects M quads with 1 ray */ - template - struct QuadMIntersector1Pluecker - { - __forceinline QuadMIntersector1Pluecker() {} - - __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} - - __forceinline void intersect(RayHit& ray, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const vuint& geomID, const vuint& primID) const - { - Intersect1EpilogM epilog(ray,context,geomID,primID); - PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool(false),epilog); - PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool(true),epilog); - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const vuint& geomID, const vuint& primID) const - { - Occluded1EpilogM epilog(ray,context,geomID,primID); - if (PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool(false),epilog)) return true; - if (PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool(true ),epilog)) return true; - return false; - } - }; - -#if defined(__AVX512ER__) // KNL - - /*! Intersects 4 quads with 1 ray using AVX512 */ - template - struct QuadMIntersector1Pluecker<4,filter> - { - __forceinline QuadMIntersector1Pluecker() {} - - __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} - - template - __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const - { - const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), - select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), - select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z))); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); - const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); -#else - const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), - select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)), - select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z))); - const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)), - select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)), - select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z))); -#endif - const vbool16 flags(0xf0f0); - return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog); - } - - __forceinline bool intersect(RayHit& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID))); - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID))); - } - }; - -#elif defined(__AVX__) - - /*! Intersects 4 quads with 1 ray using AVX */ - template - struct QuadMIntersector1Pluecker<4,filter> - { - __forceinline QuadMIntersector1Pluecker() {} - - __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} - - template - __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const - { - const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); - const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); -#else - const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); - const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); -#endif - const vbool8 flags(0,0,0,0,1,1,1,1); - return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog); - } - - __forceinline bool intersect(RayHit& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID))); - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID))); - } - }; - -#endif - - - /* ----------------------------- */ - /* -- ray packet intersectors -- */ - /* ----------------------------- */ - - struct PlueckerIntersector1KTriangleM - { - /*! Intersect k'th ray from ray packet of size K with M triangles. */ - template - static __forceinline bool intersect1(RayK& ray, - size_t k, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const vbool& flags, - const Epilog& epilog) - { - /* calculate vertices relative to ray origin */ - const Vec3vf O = broadcast>(ray.org,k); - const Vec3vf D = broadcast>(ray.dir,k); - const Vec3vf v0 = tri_v0-O; - const Vec3vf v1 = tri_v1-O; - const Vec3vf v2 = tri_v2-O; - - /* calculate triangle edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v1-v2; - - /* perform edge tests */ - const vfloat U = dot(cross(e0,v2+v0),D); - const vfloat V = dot(cross(e1,v0+v1),D); - const vfloat W = dot(cross(e2,v1+v2),D); - const vfloat UVW = U+V+W; - const vfloat eps = float(ulp)*abs(UVW); -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = max(U,V,W) <= eps; -#else - vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); -#endif - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); - const vfloat den = twice(dot(Ng,D)); - - /* perform depth test */ - const vfloat T = twice(dot(v0,Ng)); - const vfloat t = rcp(den)*T; - valid &= vfloat(ray.tnear()[k]) <= t & t <= vfloat(ray.tfar[k]); - if (unlikely(none(valid))) return false; - - /* avoid division by 0 */ - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* update hit information */ - QuadHitPlueckerM hit(valid,U,V,UVW,t,Ng,flags); - return epilog(valid,hit); - } - }; - - template - struct QuadMIntersectorKPlueckerBase - { - __forceinline QuadMIntersectorKPlueckerBase(const vbool& valid, const RayK& ray) {} - - /*! Intersects K rays with one of M triangles. */ - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const vbool& flags, - const Epilog& epilog) const - { - /* calculate vertices relative to ray origin */ - vbool valid = valid0; - const Vec3vf O = ray.org; - const Vec3vf D = ray.dir; - const Vec3vf v0 = tri_v0-O; - const Vec3vf v1 = tri_v1-O; - const Vec3vf v2 = tri_v2-O; - - /* calculate triangle edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v1-v2; - - /* perform edge tests */ - const vfloat U = dot(Vec3vf(cross(e0,v2+v0)),D); - const vfloat V = dot(Vec3vf(cross(e1,v0+v1)),D); - const vfloat W = dot(Vec3vf(cross(e2,v1+v2)),D); - const vfloat UVW = U+V+W; - const vfloat eps = float(ulp)*abs(UVW); -#if defined(EMBREE_BACKFACE_CULLING) - valid &= max(U,V,W) <= eps; -#else - valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); -#endif - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); - const vfloat den = twice(dot(Vec3vf(Ng),D)); - - /* perform depth test */ - const vfloat T = twice(dot(v0,Vec3vf(Ng))); - const vfloat t = rcp(den)*T; - valid &= ray.tnear() <= t & t <= ray.tfar; - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* calculate hit information */ - QuadHitPlueckerK hit(U,V,UVW,t,Ng,flags); - return epilog(valid,hit); - } - - /*! Intersects K rays with one of M quads. */ - template - __forceinline bool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Vec3vf& v3, - const Epilog& epilog) const - { - intersectK(valid0,ray,v0,v1,v3,vbool(false),epilog); - if (none(valid0)) return true; - intersectK(valid0,ray,v2,v3,v1,vbool(true ),epilog); - return none(valid0); - } - }; - - template - struct QuadMIntersectorKPluecker : public QuadMIntersectorKPlueckerBase - { - __forceinline QuadMIntersectorKPluecker(const vbool& valid, const RayK& ray) - : QuadMIntersectorKPlueckerBase(valid,ray) {} - - __forceinline void intersect1(RayHitK& ray, size_t k, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const vuint& geomID, const vuint& primID) const - { - Intersect1KEpilogM epilog(ray,k,context,geomID,primID); - PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool(false),epilog); - PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool(true ),epilog); - } - - __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const vuint& geomID, const vuint& primID) const - { - Occluded1KEpilogM epilog(ray,k,context,geomID,primID); - if (PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool(false),epilog)) return true; - if (PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool(true ),epilog)) return true; - return false; - } - }; - -#if defined(__AVX512ER__) // KNL - - /*! Intersects 4 quads with 1 ray using AVX512 */ - template - struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter> - { - __forceinline QuadMIntersectorKPluecker(const vbool& valid, const RayK& ray) - : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {} - - template - __forceinline bool intersect1(RayK& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const - { - const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), - select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), - select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z))); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); - const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); -#else - const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), - select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)), - select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z))); - const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)), - select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)), - select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z))); -#endif - - const vbool16 flags(0xf0f0); - return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); - } - - __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); - } - - __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); - } - }; - -#elif defined(__AVX__) - - /*! Intersects 4 quads with 1 ray using AVX */ - template - struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter> - { - __forceinline QuadMIntersectorKPluecker(const vbool& valid, const RayK& ray) - : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {} - - template - __forceinline bool intersect1(RayK& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const - { - const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); - const vbool8 flags(0,0,0,0,1,1,1,1); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); - const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); -#else - const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); - const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); -#endif - return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); - } - - __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); - } - - __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const vuint4& geomID, const vuint4& primID) const - { - return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); - } - }; - -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi.h b/thirdparty/embree-aarch64/kernels/geometry/quadi.h deleted file mode 100644 index 741ec519ab..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/quadi.h +++ /dev/null @@ -1,483 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" -#include "../common/scene.h" - -namespace embree -{ - /* Stores M quads from an indexed face set */ - template - struct QuadMi - { - /* Virtual interface to query information about the quad type */ - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* primitive supports multiple time segments */ - static const bool singleTimeSegment = false; - - /* Returns maximum number of stored quads */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } - - public: - - /* Default constructor */ - __forceinline QuadMi() { } - - /* Construction from vertices and IDs */ - __forceinline QuadMi(const vuint& v0, - const vuint& v1, - const vuint& v2, - const vuint& v3, - const vuint& geomIDs, - const vuint& primIDs) -#if defined(EMBREE_COMPACT_POLYS) - : geomIDs(geomIDs), primIDs(primIDs) {} -#else - : v0_(v0),v1_(v1), v2_(v2), v3_(v3), geomIDs(geomIDs), primIDs(primIDs) {} -#endif - - /* Returns a mask that tells which quads are valid */ - __forceinline vbool valid() const { return primIDs != vuint(-1); } - - /* Returns if the specified quad is valid */ - __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } - __forceinline const vuint& geomID() const { return geomIDs; } - __forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } - __forceinline const vuint& primID() const { return primIDs; } - __forceinline unsigned int primID(const size_t i) const { assert(iget(geomID(i)); - bounds.extend(mesh->bounds(primID(i),itime)); - } - return bounds; - } - - /* Calculate the linear bounds of the primitive */ - __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) { - return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); - } - - __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) - { - LBBox3fa allBounds = empty; - for (size_t i=0; iget(geomID(i)); - allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps)); - } - return allBounds; - } - - __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) - { - LBBox3fa allBounds = empty; - for (size_t i=0; iget(geomID(i)); - allBounds.extend(mesh->linearBounds(primID(i), time_range)); - } - return allBounds; - } - - /* Fill quad from quad list */ - template - __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) - { - vuint geomID = -1, primID = -1; - const PrimRefT* prim = &prims[begin]; - vuint v0 = zero, v1 = zero, v2 = zero, v3 = zero; - - for (size_t i=0; igeomID(); - primID[i] = prim->primID(); -#if !defined(EMBREE_COMPACT_POLYS) - const QuadMesh* mesh = scene->get(prim->geomID()); - const QuadMesh::Quad& q = mesh->quad(prim->primID()); - unsigned int_stride = mesh->vertices0.getStride()/4; - v0[i] = q.v[0] * int_stride; - v1[i] = q.v[1] * int_stride; - v2[i] = q.v[2] * int_stride; - v3[i] = q.v[3] * int_stride; -#endif - begin++; - } else { - assert(i); - if (likely(i > 0)) { - geomID[i] = geomID[0]; // always valid geomIDs - primID[i] = -1; // indicates invalid data - v0[i] = v0[0]; - v1[i] = v0[0]; - v2[i] = v0[0]; - v3[i] = v0[0]; - } - } - if (begin( " -#if !defined(EMBREE_COMPACT_POLYS) - << "v0 = " << quad.v0_ << ", v1 = " << quad.v1_ << ", v2 = " << quad.v2_ << ", v3 = " << quad.v3_ << ", " -#endif - << "geomID = " << quad.geomIDs << ", primID = " << quad.primIDs << " )"; - } - - protected: -#if !defined(EMBREE_COMPACT_POLYS) - vuint v0_; // 4 byte offset of 1st vertex - vuint v1_; // 4 byte offset of 2nd vertex - vuint v2_; // 4 byte offset of 3rd vertex - vuint v3_; // 4 byte offset of 4th vertex -#endif - vuint geomIDs; // geometry ID of mesh - vuint primIDs; // primitive ID of primitive inside mesh - }; - - namespace isa - { - - template - struct QuadMi : public embree::QuadMi - { -#if !defined(EMBREE_COMPACT_POLYS) - using embree::QuadMi::v0_; - using embree::QuadMi::v1_; - using embree::QuadMi::v2_; - using embree::QuadMi::v3_; -#endif - using embree::QuadMi::geomIDs; - using embree::QuadMi::primIDs; - using embree::QuadMi::geomID; - using embree::QuadMi::primID; - using embree::QuadMi::valid; - - template - __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const - { -#if defined(EMBREE_COMPACT_POLYS) - const QuadMesh* mesh = scene->get(geomID(index)); - const QuadMesh::Quad& quad = mesh->quad(primID(index)); - return (Vec3f) mesh->vertices[0][quad.v[vid]]; -#else - const vuint& v = getVertexOffset(); - const float* vertices = scene->vertices[geomID(index)]; - return (Vec3f&) vertices[v[index]]; -#endif - } - - template - __forceinline Vec3 getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const - { -#if defined(EMBREE_COMPACT_POLYS) - const QuadMesh* mesh = scene->get(geomID(index)); - const QuadMesh::Quad& quad = mesh->quad(primID(index)); - const Vec3fa v0 = mesh->vertices[itime+0][quad.v[vid]]; - const Vec3fa v1 = mesh->vertices[itime+1][quad.v[vid]]; -#else - const vuint& v = getVertexOffset(); - const QuadMesh* mesh = scene->get(geomID(index)); - const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0); - const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1); - const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); - const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); -#endif - const Vec3 p0(v0.x,v0.y,v0.z); - const Vec3 p1(v1.x,v1.y,v1.z); - return lerp(p0,p1,ftime); - } - - template - __forceinline Vec3 getVertex(const vbool& valid, const size_t index, const Scene *const scene, const vint& itime, const T& ftime) const - { - Vec3 p0, p1; - const QuadMesh* mesh = scene->get(geomID(index)); - - for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) - { -#if defined(EMBREE_COMPACT_POLYS) - const QuadMesh::Quad& quad = mesh->quad(primID(index)); - const Vec3fa v0 = mesh->vertices[itime[i]+0][quad.v[vid]]; - const Vec3fa v1 = mesh->vertices[itime[i]+1][quad.v[vid]]; -#else - const vuint& v = getVertexOffset(); - const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0); - const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1); - const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); - const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); -#endif - p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; - p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; - } - return (T(one)-ftime)*p0 + ftime*p1; - } - - struct Quad { - vfloat4 v0,v1,v2,v3; - }; - -#if defined(EMBREE_COMPACT_POLYS) - - __forceinline Quad loadQuad(const int i, const Scene* const scene) const - { - const unsigned int geomID = geomIDs[i]; - const unsigned int primID = primIDs[i]; - if (unlikely(primID == -1)) return { zero, zero, zero, zero }; - const QuadMesh* mesh = scene->get(geomID); - const QuadMesh::Quad& quad = mesh->quad(primID); - const vfloat4 v0 = (vfloat4) mesh->vertices0[quad.v[0]]; - const vfloat4 v1 = (vfloat4) mesh->vertices0[quad.v[1]]; - const vfloat4 v2 = (vfloat4) mesh->vertices0[quad.v[2]]; - const vfloat4 v3 = (vfloat4) mesh->vertices0[quad.v[3]]; - return { v0, v1, v2, v3 }; - } - - __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const - { - const unsigned int geomID = geomIDs[i]; - const unsigned int primID = primIDs[i]; - if (unlikely(primID == -1)) return { zero, zero, zero, zero }; - const QuadMesh* mesh = scene->get(geomID); - const QuadMesh::Quad& quad = mesh->quad(primID); - const vfloat4 v0 = (vfloat4) mesh->vertices[itime][quad.v[0]]; - const vfloat4 v1 = (vfloat4) mesh->vertices[itime][quad.v[1]]; - const vfloat4 v2 = (vfloat4) mesh->vertices[itime][quad.v[2]]; - const vfloat4 v3 = (vfloat4) mesh->vertices[itime][quad.v[3]]; - return { v0, v1, v2, v3 }; - } - -#else - - __forceinline Quad loadQuad(const int i, const Scene* const scene) const - { - const float* vertices = scene->vertices[geomID(i)]; - const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); - const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); - const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); - const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]); - return { v0, v1, v2, v3 }; - } - - __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const - { - const unsigned int geomID = geomIDs[i]; - const QuadMesh* mesh = scene->get(geomID); - const float* vertices = (const float*) mesh->vertexPtr(0,itime); - const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); - const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); - const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); - const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]); - return { v0, v1, v2, v3 }; - } - -#endif - - /* Gather the quads */ - __forceinline void gather(Vec3vf& p0, - Vec3vf& p1, - Vec3vf& p2, - Vec3vf& p3, - const Scene *const scene) const; - -#if defined(__AVX512F__) - __forceinline void gather(Vec3vf16& p0, - Vec3vf16& p1, - Vec3vf16& p2, - Vec3vf16& p3, - const Scene *const scene) const; -#endif - - template -#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019 - __noinline -#else - __forceinline -#endif - void gather(const vbool& valid, - Vec3vf& p0, - Vec3vf& p1, - Vec3vf& p2, - Vec3vf& p3, - const size_t index, - const Scene* const scene, - const vfloat& time) const - { - const QuadMesh* mesh = scene->get(geomID(index)); - - vfloat ftime; - const vint itime = mesh->timeSegment(time, ftime); - - const size_t first = bsf(movemask(valid)); - if (likely(all(valid,itime[first] == itime))) - { - p0 = getVertex<0>(index, scene, itime[first], ftime); - p1 = getVertex<1>(index, scene, itime[first], ftime); - p2 = getVertex<2>(index, scene, itime[first], ftime); - p3 = getVertex<3>(index, scene, itime[first], ftime); - } - else - { - p0 = getVertex<0>(valid, index, scene, itime, ftime); - p1 = getVertex<1>(valid, index, scene, itime, ftime); - p2 = getVertex<2>(valid, index, scene, itime, ftime); - p3 = getVertex<3>(valid, index, scene, itime, ftime); - } - } - - __forceinline void gather(Vec3vf& p0, - Vec3vf& p1, - Vec3vf& p2, - Vec3vf& p3, - const QuadMesh* mesh, - const Scene *const scene, - const int itime) const; - - __forceinline void gather(Vec3vf& p0, - Vec3vf& p1, - Vec3vf& p2, - Vec3vf& p3, - const Scene *const scene, - const float time) const; - - /* Updates the primitive */ - __forceinline BBox3fa update(QuadMesh* mesh) - { - BBox3fa bounds = empty; - for (size_t i=0; iquad(primId); - const Vec3fa p0 = mesh->vertex(q.v[0]); - const Vec3fa p1 = mesh->vertex(q.v[1]); - const Vec3fa p2 = mesh->vertex(q.v[2]); - const Vec3fa p3 = mesh->vertex(q.v[3]); - bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3))); - } - return bounds; - } - - private: -#if !defined(EMBREE_COMPACT_POLYS) - template const vuint& getVertexOffset() const; -#endif - }; - -#if !defined(EMBREE_COMPACT_POLYS) - template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<0>() const { return v0_; } - template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<1>() const { return v1_; } - template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<2>() const { return v2_; } - template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<3>() const { return v3_; } -#endif - - template<> - __forceinline void QuadMi<4>::gather(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - Vec3vf4& p3, - const Scene *const scene) const - { - prefetchL1(((char*)this)+0*64); - prefetchL1(((char*)this)+1*64); - const Quad tri0 = loadQuad(0,scene); - const Quad tri1 = loadQuad(1,scene); - const Quad tri2 = loadQuad(2,scene); - const Quad tri3 = loadQuad(3,scene); - transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); - transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); - transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); - transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z); - } - - template<> - __forceinline void QuadMi<4>::gather(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - Vec3vf4& p3, - const QuadMesh* mesh, - const Scene *const scene, - const int itime) const - { - // FIXME: for trianglei there all geometries are identical, is this the case here too? - - const Quad tri0 = loadQuad(0,itime,scene); - const Quad tri1 = loadQuad(1,itime,scene); - const Quad tri2 = loadQuad(2,itime,scene); - const Quad tri3 = loadQuad(3,itime,scene); - transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); - transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); - transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); - transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z); - } - - template<> - __forceinline void QuadMi<4>::gather(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - Vec3vf4& p3, - const Scene *const scene, - const float time) const - { - const QuadMesh* mesh = scene->get(geomID(0)); // in mblur mode all geometries are identical - - float ftime; - const int itime = mesh->timeSegment(time, ftime); - - Vec3vf4 a0,a1,a2,a3; gather(a0,a1,a2,a3,mesh,scene,itime); - Vec3vf4 b0,b1,b2,b3; gather(b0,b1,b2,b3,mesh,scene,itime+1); - p0 = lerp(a0,b0,vfloat4(ftime)); - p1 = lerp(a1,b1,vfloat4(ftime)); - p2 = lerp(a2,b2,vfloat4(ftime)); - p3 = lerp(a3,b3,vfloat4(ftime)); - } - } - - template - typename QuadMi::Type QuadMi::type; - - typedef QuadMi<4> Quad4i; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h deleted file mode 100644 index 96cf7f1ca2..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h +++ /dev/null @@ -1,350 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "quadi.h" -#include "quad_intersector_moeller.h" -#include "quad_intersector_pluecker.h" - -namespace embree -{ - namespace isa - { - /*! Intersects M quads with 1 ray */ - template - struct QuadMiIntersector1Moeller - { - typedef QuadMi Primitive; - typedef QuadMIntersector1MoellerTrumbore Precalculations; - - /*! Intersect a ray with the M quads and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); - pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of M quads. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); - return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) - { - return PrimitivePointQuery1::pointQuery(query, context, quad); - } - }; - - /*! Intersects M triangles with K rays. */ - template - struct QuadMiIntersectorKMoeller - { - typedef QuadMi Primitive; - typedef QuadMIntersectorKMoellerTrumbore Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMi& quad) - { - Scene* scene = context->scene; - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - const Vec3vf p0 = quad.template getVertex<0>(i,scene); - const Vec3vf p1 = quad.template getVertex<1>(i,scene); - const Vec3vf p2 = quad.template getVertex<2>(i,scene); - const Vec3vf p3 = quad.template getVertex<3>(i,scene); - pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMi& quad) - { - Scene* scene = context->scene; - vbool valid0 = valid_i; - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - const Vec3vf p0 = quad.template getVertex<0>(i,scene); - const Vec3vf p1 = quad.template getVertex<1>(i,scene); - const Vec3vf p2 = quad.template getVertex<2>(i,scene); - const Vec3vf p3 = quad.template getVertex<3>(i,scene); - if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) - break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMi& quad) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); - pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMi& quad) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); - return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - }; - - /*! Intersects M quads with 1 ray */ - template - struct QuadMiIntersector1Pluecker - { - typedef QuadMi Primitive; - typedef QuadMIntersector1Pluecker Precalculations; - - /*! Intersect a ray with the M quads and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); - pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of M quads. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); - return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) - { - return PrimitivePointQuery1::pointQuery(query, context, quad); - } - }; - - /*! Intersects M triangles with K rays. */ - template - struct QuadMiIntersectorKPluecker - { - typedef QuadMi Primitive; - typedef QuadMIntersectorKPluecker Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMi& quad) - { - Scene* scene = context->scene; - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - const Vec3vf p0 = quad.template getVertex<0>(i,scene); - const Vec3vf p1 = quad.template getVertex<1>(i,scene); - const Vec3vf p2 = quad.template getVertex<2>(i,scene); - const Vec3vf p3 = quad.template getVertex<3>(i,scene); - pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMi& quad) - { - Scene* scene = context->scene; - vbool valid0 = valid_i; - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - const Vec3vf p0 = quad.template getVertex<0>(i,scene); - const Vec3vf p1 = quad.template getVertex<1>(i,scene); - const Vec3vf p2 = quad.template getVertex<2>(i,scene); - const Vec3vf p3 = quad.template getVertex<3>(i,scene); - if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) - break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMi& quad) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); - pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMi& quad) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); - return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - }; - - /*! Intersects M motion blur quads with 1 ray */ - template - struct QuadMiMBIntersector1Moeller - { - typedef QuadMi Primitive; - typedef QuadMIntersector1MoellerTrumbore Precalculations; - - /*! Intersect a ray with the M quads and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); - pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of M quads. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); - return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) - { - return PrimitivePointQuery1::pointQuery(query, context, quad); - } - }; - - /*! Intersects M motion blur quads with K rays. */ - template - struct QuadMiMBIntersectorKMoeller - { - typedef QuadMi Primitive; - typedef QuadMIntersectorKMoellerTrumbore Precalculations; - - /*! Intersects K rays with M quads. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMi& quad) - { - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - Vec3vf v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); - pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M quads. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMi& quad) - { - vbool valid0 = valid_i; - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - Vec3vf v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); - if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) - break; - } - return !valid0; - } - - /*! Intersect a ray with M quads and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMi& quad) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); - pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of the M quads. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMi& quad) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); - return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - }; - - /*! Intersects M motion blur quads with 1 ray */ - template - struct QuadMiMBIntersector1Pluecker - { - typedef QuadMi Primitive; - typedef QuadMIntersector1Pluecker Precalculations; - - /*! Intersect a ray with the M quads and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); - pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of M quads. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); - return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) - { - return PrimitivePointQuery1::pointQuery(query, context, quad); - } - }; - - /*! Intersects M motion blur quads with K rays. */ - template - struct QuadMiMBIntersectorKPluecker - { - typedef QuadMi Primitive; - typedef QuadMIntersectorKPluecker Precalculations; - - /*! Intersects K rays with M quads. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMi& quad) - { - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - Vec3vf v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); - pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M quads. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMi& quad) - { - vbool valid0 = valid_i; - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - Vec3vf v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); - if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) - break; - } - return !valid0; - } - - /*! Intersect a ray with M quads and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMi& quad) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); - pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of the M quads. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMi& quad) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); - return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv.h b/thirdparty/embree-aarch64/kernels/geometry/quadv.h deleted file mode 100644 index 0a1fe4d128..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/quadv.h +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" - -namespace embree -{ - /* Stores the vertices of M quads in struct of array layout */ - template - struct QuadMv - { - public: - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* Returns maximum number of stored quads */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } - - public: - - /* Default constructor */ - __forceinline QuadMv() {} - - /* Construction from vertices and IDs */ - __forceinline QuadMv(const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const vuint& geomIDs, const vuint& primIDs) - : v0(v0), v1(v1), v2(v2), v3(v3), geomIDs(geomIDs), primIDs(primIDs) {} - - /* Returns a mask that tells which quads are valid */ - __forceinline vbool valid() const { return geomIDs != vuint(-1); } - - /* Returns true if the specified quad is valid */ - __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } - __forceinline const vuint& geomID() const { return geomIDs; } - __forceinline unsigned int geomID(const size_t i) const { assert(i primID() { return primIDs; } - __forceinline const vuint primID() const { return primIDs; } - __forceinline unsigned int primID(const size_t i) const { assert(i lower = min(v0,v1,v2,v3); - Vec3vf upper = max(v0,v1,v2,v3); - vbool mask = valid(); - lower.x = select(mask,lower.x,vfloat(pos_inf)); - lower.y = select(mask,lower.y,vfloat(pos_inf)); - lower.z = select(mask,lower.z,vfloat(pos_inf)); - upper.x = select(mask,upper.x,vfloat(neg_inf)); - upper.y = select(mask,upper.y,vfloat(neg_inf)); - upper.z = select(mask,upper.z,vfloat(neg_inf)); - return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), - Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); - } - - /* Non temporal store */ - __forceinline static void store_nt(QuadMv* dst, const QuadMv& src) - { - vfloat::store_nt(&dst->v0.x,src.v0.x); - vfloat::store_nt(&dst->v0.y,src.v0.y); - vfloat::store_nt(&dst->v0.z,src.v0.z); - vfloat::store_nt(&dst->v1.x,src.v1.x); - vfloat::store_nt(&dst->v1.y,src.v1.y); - vfloat::store_nt(&dst->v1.z,src.v1.z); - vfloat::store_nt(&dst->v2.x,src.v2.x); - vfloat::store_nt(&dst->v2.y,src.v2.y); - vfloat::store_nt(&dst->v2.z,src.v2.z); - vfloat::store_nt(&dst->v3.x,src.v3.x); - vfloat::store_nt(&dst->v3.y,src.v3.y); - vfloat::store_nt(&dst->v3.z,src.v3.z); - vuint::store_nt(&dst->geomIDs,src.geomIDs); - vuint::store_nt(&dst->primIDs,src.primIDs); - } - - /* Fill quad from quad list */ - __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) - { - vuint vgeomID = -1, vprimID = -1; - Vec3vf v0 = zero, v1 = zero, v2 = zero, v3 = zero; - - for (size_t i=0; iget(geomID); - const QuadMesh::Quad& quad = mesh->quad(primID); - const Vec3fa& p0 = mesh->vertex(quad.v[0]); - const Vec3fa& p1 = mesh->vertex(quad.v[1]); - const Vec3fa& p2 = mesh->vertex(quad.v[2]); - const Vec3fa& p3 = mesh->vertex(quad.v[3]); - vgeomID [i] = geomID; - vprimID [i] = primID; - v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; - v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; - v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; - v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; - } - QuadMv::store_nt(this,QuadMv(v0,v1,v2,v3,vgeomID,vprimID)); - } - - /* Updates the primitive */ - __forceinline BBox3fa update(QuadMesh* mesh) - { - BBox3fa bounds = empty; - vuint vgeomID = -1, vprimID = -1; - Vec3vf v0 = zero, v1 = zero, v2 = zero; - - for (size_t i=0; iquad(primId); - const Vec3fa p0 = mesh->vertex(quad.v[0]); - const Vec3fa p1 = mesh->vertex(quad.v[1]); - const Vec3fa p2 = mesh->vertex(quad.v[2]); - const Vec3fa p3 = mesh->vertex(quad.v[3]); - bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3))); - vgeomID [i] = geomId; - vprimID [i] = primId; - v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; - v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; - v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; - v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; - } - new (this) QuadMv(v0,v1,v2,v3,vgeomID,vprimID); - return bounds; - } - - public: - Vec3vf v0; // 1st vertex of the quads - Vec3vf v1; // 2nd vertex of the quads - Vec3vf v2; // 3rd vertex of the quads - Vec3vf v3; // 4rd vertex of the quads - private: - vuint geomIDs; // geometry ID - vuint primIDs; // primitive ID - }; - - template - typename QuadMv::Type QuadMv::type; - - typedef QuadMv<4> Quad4v; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h deleted file mode 100644 index 30a24b291a..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "quadv.h" -#include "quad_intersector_moeller.h" -#include "quad_intersector_pluecker.h" - -namespace embree -{ - namespace isa - { - /*! Intersects M quads with 1 ray */ - template - struct QuadMvIntersector1Moeller - { - typedef QuadMv Primitive; - typedef QuadMIntersector1MoellerTrumbore Precalculations; - - /*! Intersect a ray with the M quads and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of M quads. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) - { - return PrimitivePointQuery1::pointQuery(query, context, quad); - } - }; - - /*! Intersects M triangles with K rays. */ - template - struct QuadMvIntersectorKMoeller - { - typedef QuadMv Primitive; - typedef QuadMIntersectorKMoellerTrumbore Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMv& quad) - { - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - const Vec3vf p0 = broadcast>(quad.v0,i); - const Vec3vf p1 = broadcast>(quad.v1,i); - const Vec3vf p2 = broadcast>(quad.v2,i); - const Vec3vf p3 = broadcast>(quad.v3,i); - pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMv& quad) - { - vbool valid0 = valid_i; - - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - const Vec3vf p0 = broadcast>(quad.v0,i); - const Vec3vf p1 = broadcast>(quad.v1,i); - const Vec3vf p2 = broadcast>(quad.v2,i); - const Vec3vf p3 = broadcast>(quad.v3,i); - if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) - break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMv& quad) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMv& quad) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); - } - }; - - /*! Intersects M quads with 1 ray */ - template - struct QuadMvIntersector1Pluecker - { - typedef QuadMv Primitive; - typedef QuadMIntersector1Pluecker Precalculations; - - /*! Intersect a ray with the M quads and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of M quads. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) - { - return PrimitivePointQuery1::pointQuery(query, context, quad); - } - }; - - /*! Intersects M triangles with K rays. */ - template - struct QuadMvIntersectorKPluecker - { - typedef QuadMv Primitive; - typedef QuadMIntersectorKPluecker Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMv& quad) - { - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - const Vec3vf p0 = broadcast>(quad.v0,i); - const Vec3vf p1 = broadcast>(quad.v1,i); - const Vec3vf p2 = broadcast>(quad.v2,i); - const Vec3vf p3 = broadcast>(quad.v3,i); - pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMv& quad) - { - vbool valid0 = valid_i; - - for (size_t i=0; i::max_size(); i++) - { - if (!quad.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - const Vec3vf p0 = broadcast>(quad.v0,i); - const Vec3vf p1 = broadcast>(quad.v1,i); - const Vec3vf p2 = broadcast>(quad.v2,i); - const Vec3vf p3 = broadcast>(quad.v3,i); - if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) - break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMv& quad) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMv& quad) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); - } - }; - } -} - diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h deleted file mode 100644 index cdf68f486b..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h +++ /dev/null @@ -1,710 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "curve_intersector_precalculations.h" - - -/* - - This file implements the intersection of a ray with a round linear - curve segment. We define the geometry of such a round linear curve - segment from point p0 with radius r0 to point p1 with radius r1 - using the cone that touches spheres p0/r0 and p1/r1 tangentially - plus the sphere p1/r1. We denote the tangentially touching cone from - p0/r0 to p1/r1 with cone(p0,r0,p1,r1) and the cone plus the ending - sphere with cone_sphere(p0,r0,p1,r1). - - For multiple connected round linear curve segments this construction - yield a proper shape when viewed from the outside. Using the - following CSG we can also handle the interiour in most common cases: - - round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) = - cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr) - - Thus by subtracting the neighboring cone geometries, we cut away - parts of the center cone_sphere surface which lie inside the - combined curve. This approach works as long as geometry of the - current cone_sphere penetrates into direct neighbor segments only, - and not into segments further away. - - To construct a cone that touches two spheres at p0 and p1 with r0 - and r1, one has to increase the cone radius at r0 and r1 to obtain - larger radii w0 and w1, such that the infinite cone properly touches - the spheres. From the paper "Ray Tracing Generalized Tube - Primitives: Method and Applications" - (https://www.researchgate.net/publication/334378683_Ray_Tracing_Generalized_Tube_Primitives_Method_and_Applications) - one can derive the following equations for these increased - radii: - - sr = 1.0f / sqrt(1-sqr(dr)/sqr(p1-p0)) - w0 = sr*r0 - w1 = sr*r1 - - Further, we want the cone to start where it touches the sphere at p0 - and to end where it touches sphere at p1. Therefore, we need to - construct clipping locations y0 and y1 for the start and end of the - cone. These start and end clipping location of the cone can get - calculated as: - - Y0 = - r0 * (r1-r0) / length(p1-p0) - Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0) - - Where the cone starts a distance Y0 and ends a distance Y1 away of - point p0 along the cone center. The distance between Y1-Y0 can get - calculated as: - - dY = length(p1-p0) - (r1-r0)^2 / length(p1-p0) - - In the code below, Y will always be scaled by length(p1-p0) to - obtain y and you will find the terms r0*(r1-r0) and - (p1-p0)^2-(r1-r0)^2. - - */ - -namespace embree -{ - namespace isa - { - template - struct RoundLineIntersectorHitM - { - __forceinline RoundLineIntersectorHitM() {} - - __forceinline RoundLineIntersectorHitM(const vfloat& u, const vfloat& v, const vfloat& t, const Vec3vf& Ng) - : vu(u), vv(v), vt(t), vNg(Ng) {} - - __forceinline void finalize() {} - - __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } - __forceinline float t (const size_t i) const { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } - - public: - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - }; - - namespace __roundline_internal - { - template - struct ConeGeometry - { - ConeGeometry (const Vec4vf& a, const Vec4vf& b) - : p0(a.xyz()), p1(b.xyz()), dP(p1-p0), dPdP(dot(dP,dP)), r0(a.w), sqr_r0(sqr(r0)), r1(b.w), dr(r1-r0), drdr(dr*dr), r0dr (r0*dr), g(dPdP - drdr) {} - - /* - - This function tests if a point is accepted by first cone - clipping plane. - - First, we need to project the point onto the line p0->p1: - - Y = (p-p0)*(p1-p0)/length(p1-p0) - - This value y is the distance to the projection point from - p0. The clip distances are calculated as: - - Y0 = - r0 * (r1-r0) / length(p1-p0) - Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0) - - Thus to test if the point p is accepted by the first - clipping plane we need to test Y > Y0 and to test if it - is accepted by the second clipping plane we need to test - Y < Y1. - - By multiplying the calculations with length(p1-p0) these - calculation can get simplied to: - - y = (p-p0)*(p1-p0) - y0 = - r0 * (r1-r0) - y1 = (p1-p0)^2 - r1 * (r1-r0) - - and the test y > y0 and y < y1. - - */ - - __forceinline vbool isClippedByPlane (const vbool& valid_i, const Vec3vf& p) const - { - const Vec3vf p0p = p - p0; - const vfloat y = dot(p0p,dP); - const vfloat cap0 = -r0dr; - const vbool inside_cone = y > cap0; - return valid_i & (p0.x != vfloat(inf)) & (p1.x != vfloat(inf)) & inside_cone; - } - - /* - - This function tests whether a point lies inside the capped cone - tangential to its ending spheres. - - Therefore one has to check if the point is inside the - region defined by the cone clipping planes, which is - performed similar as in the previous function. - - To perform the inside cone test we need to project the - point onto the line p0->p1: - - dP = p1-p0 - Y = (p-p0)*dP/length(dP) - - This value Y is the distance to the projection point from - p0. To obtain a parameter value u going from 0 to 1 along - the line p0->p1 we calculate: - - U = Y/length(dP) - - The radii to use at points p0 and p1 are: - - w0 = sr * r0 - w1 = sr * r1 - dw = w1-w0 - - Using these radii and u one can directly test if the point - lies inside the cone using the formula dP*dP < wy*wy with: - - wy = w0 + u*dw - py = p0 + u*dP - p - - By multiplying the calculations with length(p1-p0) and - inserting the definition of w can obtain simpler equations: - - y = (p-p0)*dP - ry = r0 + y/dP^2 * dr - wy = sr*ry - py = p0 + y/dP^2*dP - p - y0 = - r0 * dr - y1 = dP^2 - r1 * dr - - Thus for the in-cone test we get: - - py^2 < wy^2 - <=> py^2 < sr^2 * ry^2 - <=> py^2 * ( dP^2 - dr^2 ) < dP^2 * ry^2 - - This can further get simplified to: - - (p0-p)^2 * (dP^2 - dr^2) - y^2 < dP^2 * r0^2 + 2.0f*r0*dr*y; - - */ - - __forceinline vbool isInsideCappedCone (const vbool& valid_i, const Vec3vf& p) const - { - const Vec3vf p0p = p - p0; - const vfloat y = dot(p0p,dP); - const vfloat cap0 = -r0dr+vfloat(ulp); - const vfloat cap1 = -r1*dr + dPdP; - - vbool inside_cone = valid_i & (p0.x != vfloat(inf)) & (p1.x != vfloat(inf)); - inside_cone &= y > cap0; // start clipping plane - inside_cone &= y < cap1; // end clipping plane - inside_cone &= sqr(p0p)*g - sqr(y) < dPdP * sqr_r0 + 2.0f*r0dr*y; // in cone test - return inside_cone; - } - - protected: - Vec3vf p0; - Vec3vf p1; - Vec3vf dP; - vfloat dPdP; - vfloat r0; - vfloat sqr_r0; - vfloat r1; - vfloat dr; - vfloat drdr; - vfloat r0dr; - vfloat g; - }; - - template - struct ConeGeometryIntersector : public ConeGeometry - { - using ConeGeometry::p0; - using ConeGeometry::p1; - using ConeGeometry::dP; - using ConeGeometry::dPdP; - using ConeGeometry::r0; - using ConeGeometry::sqr_r0; - using ConeGeometry::r1; - using ConeGeometry::dr; - using ConeGeometry::r0dr; - using ConeGeometry::g; - - ConeGeometryIntersector (const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& dOdO, const vfloat& rcp_dOdO, const Vec4vf& a, const Vec4vf& b) - : ConeGeometry(a,b), org(ray_org), O(ray_org-p0), dO(ray_dir), dOdO(dOdO), rcp_dOdO(rcp_dOdO), OdP(dot(dP,O)), dOdP(dot(dP,dO)), yp(OdP + r0dr) {} - - /* - - This function intersects a ray with a cone that touches a - start sphere p0/r0 and end sphere p1/r1. - - To find this ray/cone intersections one could just - calculate radii w0 and w1 as described above and use a - standard ray/cone intersection routine with these - radii. However, it turns out that calculations can get - simplified when deriving a specialized ray/cone - intersection for this special case. We perform - calculations relative to the cone origin p0 and define: - - O = ray_org - p0 - dO = ray_dir - dP = p1-p0 - dr = r1-r0 - dw = w1-w0 - - For some t we can compute the potential hit point h = O + t*dO and - project it onto the cone vector dP to obtain u = (h*dP)/(dP*dP). In - case of an intersection, the squared distance from the hit point - projected onto the cone center line to the hit point should be equal - to the squared cone radius at u: - - (u*dP - h)^2 = (w0 + u*dw)^2 - - Inserting the definition of h, u, w0, and dw into this formula, then - factoring out all terms, and sorting by t^2, t^1, and t^0 terms - yields a quadratic equation to solve. - - Inserting u: - ( (h*dP)*dP/dP^2 - h )^2 = ( w0 + (h*dP)*dw/dP^2 )^2 - - Multiplying by dP^4: - ( (h*dP)*dP - h*dP^2 )^2 = ( w0*dP^2 + (h*dP)*dw )^2 - - Inserting w0 and dw: - ( (h*dP)*dP - h*dP^2 )^2 = ( r0*dP^2 + (h*dP)*dr )^2 / (1-dr^2/dP^2) - ( (h*dP)*dP - h*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (h*dP)*dr )^2 - - Now one can insert the definition of h, factor out, and presort by t: - ( ((O + t*dO)*dP)*dP - (O + t*dO)*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + ((O + t*dO)*dP)*dr )^2 - ( (O*dP)*dP-O*dP^2 + t*( (dO*dP)*dP - dO*dP^2 ) )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (O*dP)*dr + t*(dO*dP)*dr )^2 - - Factoring out further and sorting by t^2, t^1 and t^0 yields: - - 0 = t^2 * [ ((dO*dP)*dP - dO-dP^2)^2 * (dP^2 - dr^2) - dP^2*(dO*dP)^2*dr^2 ] - + 2*t^1 * [ ((O*dP)*dP - O*dP^2) * ((dO*dP)*dP - dO*dP^2) * (dP^2 - dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)*(dO*dP)*dr ] - + t^0 * [ ( (O*dP)*dP - O*dP^2)^2 * (dP^2-dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)^2 ] - - This can be simplified to: - - 0 = t^2 * [ (dP^2 - dr^2)*dO^2 - (dO*dP)^2 ] - + 2*t^1 * [ (dP^2 - dr^2)*(O*dO) - (dO*dP)*(O*dP + r0*dr) ] - + t^0 * [ (dP^2 - dr^2)*O^2 - (O*dP)^2 - r0^2*dP^2 - 2.0f*r0*dr*(O*dP) ] - - Solving this quadratic equation yields the values for t at which the - ray intersects the cone. - - */ - - __forceinline bool intersectCone(vbool& valid, vfloat& lower, vfloat& upper) - { - /* return no hit by default */ - lower = pos_inf; - upper = neg_inf; - - /* compute quadratic equation A*t^2 + B*t + C = 0 */ - const vfloat OO = dot(O,O); - const vfloat OdO = dot(dO,O); - const vfloat A = g * dOdO - sqr(dOdP); - const vfloat B = 2.0f * (g*OdO - dOdP*yp); - const vfloat C = g*OO - sqr(OdP) - sqr_r0*dPdP - 2.0f*r0dr*OdP; - - /* we miss the cone if determinant is smaller than zero */ - const vfloat D = B*B - 4.0f*A*C; - valid &= (D >= 0.0f & g > 0.0f); // if g <= 0 then the cone is inside a sphere end - - /* When rays are parallel to the cone surface, then the - * ray may be inside or outside the cone. We just assume a - * miss in that case, which is fine as rays inside the - * cone would anyway hit the ending spheres in that - * case. */ - valid &= abs(A) > min_rcp_input; - if (unlikely(none(valid))) { - return false; - } - - /* compute distance to front and back hit */ - const vfloat Q = sqrt(D); - const vfloat rcp_2A = rcp(2.0f*A); - t_cone_front = (-B-Q)*rcp_2A; - y_cone_front = yp + t_cone_front*dOdP; - lower = select( (y_cone_front > -(float)ulp) & (y_cone_front <= g) & (g > 0.0f), t_cone_front, vfloat(pos_inf)); -#if !defined (EMBREE_BACKFACE_CULLING_CURVES) - t_cone_back = (-B+Q)*rcp_2A; - y_cone_back = yp + t_cone_back *dOdP; - upper = select( (y_cone_back > -(float)ulp) & (y_cone_back <= g) & (g > 0.0f), t_cone_back , vfloat(neg_inf)); -#endif - return true; - } - - /* - This function intersects the ray with the end sphere at - p1. We already clip away hits that are inside the - neighboring cone segment. - - */ - - __forceinline void intersectEndSphere(vbool& valid, - const ConeGeometry& coneR, - vfloat& lower, vfloat& upper) - { - /* calculate front and back hit with end sphere */ - const Vec3vf O1 = org - p1; - const vfloat O1dO = dot(O1,dO); - const vfloat h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r1)); - const vfloat rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat(neg_inf) ); - - /* clip away front hit if it is inside next cone segment */ - t_sph1_front = (-O1dO - rhs1)*rcp_dOdO; - const Vec3vf hit_front = org + t_sph1_front*dO; - vbool valid_sph1_front = h2 >= 0.0f & yp + t_sph1_front*dOdP > g & !coneR.isClippedByPlane (valid, hit_front); - lower = select(valid_sph1_front, t_sph1_front, vfloat(pos_inf)); - -#if !defined(EMBREE_BACKFACE_CULLING_CURVES) - /* clip away back hit if it is inside next cone segment */ - t_sph1_back = (-O1dO + rhs1)*rcp_dOdO; - const Vec3vf hit_back = org + t_sph1_back*dO; - vbool valid_sph1_back = h2 >= 0.0f & yp + t_sph1_back*dOdP > g & !coneR.isClippedByPlane (valid, hit_back); - upper = select(valid_sph1_back, t_sph1_back, vfloat(neg_inf)); -#else - upper = vfloat(neg_inf); -#endif - } - - __forceinline void intersectBeginSphere(const vbool& valid, - vfloat& lower, vfloat& upper) - { - /* calculate front and back hit with end sphere */ - const Vec3vf O1 = org - p0; - const vfloat O1dO = dot(O1,dO); - const vfloat h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r0)); - const vfloat rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat(neg_inf) ); - - /* clip away front hit if it is inside next cone segment */ - t_sph0_front = (-O1dO - rhs1)*rcp_dOdO; - vbool valid_sph1_front = valid & h2 >= 0.0f & yp + t_sph0_front*dOdP < 0; - lower = select(valid_sph1_front, t_sph0_front, vfloat(pos_inf)); - -#if !defined(EMBREE_BACKFACE_CULLING_CURVES) - /* clip away back hit if it is inside next cone segment */ - t_sph0_back = (-O1dO + rhs1)*rcp_dOdO; - vbool valid_sph1_back = valid & h2 >= 0.0f & yp + t_sph0_back*dOdP < 0; - upper = select(valid_sph1_back, t_sph0_back, vfloat(neg_inf)); -#else - upper = vfloat(neg_inf); -#endif - } - - /* - - This function calculates the geometry normal of some cone hit. - - For a given hit point h (relative to p0) with a cone - starting at p0 with radius w0 and ending at p1 with - radius w1 one normally calculates the geometry normal by - first calculating the parmetric u hit location along the - cone: - - u = dot(h,dP)/dP^2 - - Using this value one can now directly calculate the - geometry normal by bending the connection vector (h-u*dP) - from hit to projected hit with some cone dependent value - dw/sqrt(dP^2) * normalize(dP): - - Ng = normalize(h-u*dP) - dw/length(dP) * normalize(dP) - - The length of the vector (h-u*dP) can also get calculated - by interpolating the radii as w0+u*dw which yields: - - Ng = (h-u*dP)/(w0+u*dw) - dw/dP^2 * dP - - Multiplying with (w0+u*dw) yield a scaled Ng': - - Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP - - Inserting the definition of w0 and dw and refactoring - yield a furhter scaled Ng'': - - Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP - - Now inserting the definition of u gives and multiplying - with the denominator yields: - - Ng''' = (dP^2-dr^2)*(dP^2*h-dot(h,dP)*dP) - (dP^2*r0+dot(h,dP)*dr)*dr*dP - - Factoring out, cancelling terms, dividing by dP^2, and - factoring again yields finally: - - Ng'''' = (dP^2-dr^2)*h - dP*(dot(h,dP) + r0*dr) - - */ - - __forceinline Vec3vf Ng_cone(const vbool& front_hit) const - { -#if !defined(EMBREE_BACKFACE_CULLING_CURVES) - const vfloat y = select(front_hit, y_cone_front, y_cone_back); - const vfloat t = select(front_hit, t_cone_front, t_cone_back); - const Vec3vf h = O + t*dO; - return g*h-dP*y; -#else - const Vec3vf h = O + t_cone_front*dO; - return g*h-dP*y_cone_front; -#endif - } - - /* compute geometry normal of sphere hit as the difference - * vector from hit point to sphere center */ - - __forceinline Vec3vf Ng_sphere1(const vbool& front_hit) const - { -#if !defined(EMBREE_BACKFACE_CULLING_CURVES) - const vfloat t_sph1 = select(front_hit, t_sph1_front, t_sph1_back); - return org+t_sph1*dO-p1; -#else - return org+t_sph1_front*dO-p1; -#endif - } - - __forceinline Vec3vf Ng_sphere0(const vbool& front_hit) const - { -#if !defined(EMBREE_BACKFACE_CULLING_CURVES) - const vfloat t_sph0 = select(front_hit, t_sph0_front, t_sph0_back); - return org+t_sph0*dO-p0; -#else - return org+t_sph0_front*dO-p0; -#endif - } - - /* - This function calculates the u coordinate of a - hit. Therefore we use the hit distance y (which is zero - at the first cone clipping plane) and divide by distance - g between the clipping planes. - - */ - - __forceinline vfloat u_cone(const vbool& front_hit) const - { -#if !defined(EMBREE_BACKFACE_CULLING_CURVES) - const vfloat y = select(front_hit, y_cone_front, y_cone_back); - return clamp(y*rcp(g)); -#else - return clamp(y_cone_front*rcp(g)); -#endif - } - - private: - Vec3vf org; - Vec3vf O; - Vec3vf dO; - vfloat dOdO; - vfloat rcp_dOdO; - vfloat OdP; - vfloat dOdP; - - /* for ray/cone intersection */ - private: - vfloat yp; - vfloat y_cone_front; - vfloat t_cone_front; -#if !defined (EMBREE_BACKFACE_CULLING_CURVES) - vfloat y_cone_back; - vfloat t_cone_back; -#endif - - /* for ray/sphere intersection */ - private: - vfloat t_sph1_front; - vfloat t_sph0_front; -#if !defined (EMBREE_BACKFACE_CULLING_CURVES) - vfloat t_sph1_back; - vfloat t_sph0_back; -#endif - }; - - - template - static __forceinline bool intersectConeSphere(const vbool& valid_i, - const Vec3vf& ray_org_in, const Vec3vf& ray_dir, - const vfloat& ray_tnear, const ray_tfar_func& ray_tfar, - const Vec4vf& v0, const Vec4vf& v1, - const Vec4vf& vL, const Vec4vf& vR, - const Epilog& epilog) - { - vbool valid = valid_i; - - /* move ray origin closer to make calculations numerically stable */ - const vfloat dOdO = sqr(ray_dir); - const vfloat rcp_dOdO = rcp(dOdO); - const Vec3vf center = vfloat(0.5f)*(v0.xyz()+v1.xyz()); - const vfloat dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO; - const Vec3vf ray_org = ray_org_in + dt*ray_dir; - - /* intersect with cone from v0 to v1 */ - vfloat t_cone_lower, t_cone_upper; - ConeGeometryIntersector cone (ray_org, ray_dir, dOdO, rcp_dOdO, v0, v1); - vbool validCone = valid; - cone.intersectCone(validCone, t_cone_lower, t_cone_upper); - - valid &= (validCone | (cone.g <= 0.0f)); // if cone is entirely in sphere end - check sphere - if (unlikely(none(valid))) - return false; - - /* cone hits inside the neighboring capped cones are inside the geometry and thus ignored */ - const ConeGeometry coneL (v0, vL); - const ConeGeometry coneR (v1, vR); -#if !defined(EMBREE_BACKFACE_CULLING_CURVES) - const Vec3vf hit_lower = ray_org + t_cone_lower*ray_dir; - const Vec3vf hit_upper = ray_org + t_cone_upper*ray_dir; - t_cone_lower = select (!coneL.isInsideCappedCone (validCone, hit_lower) & !coneR.isInsideCappedCone (validCone, hit_lower), t_cone_lower, vfloat(pos_inf)); - t_cone_upper = select (!coneL.isInsideCappedCone (validCone, hit_upper) & !coneR.isInsideCappedCone (validCone, hit_upper), t_cone_upper, vfloat(neg_inf)); -#endif - - /* intersect ending sphere */ - vfloat t_sph1_lower, t_sph1_upper; - vfloat t_sph0_lower = vfloat(pos_inf); - vfloat t_sph0_upper = vfloat(neg_inf); - cone.intersectEndSphere(valid, coneR, t_sph1_lower, t_sph1_upper); - - const vbool isBeginPoint = valid & (vL[0] == vfloat(pos_inf)); - if (unlikely(any(isBeginPoint))) { - cone.intersectBeginSphere (isBeginPoint, t_sph0_lower, t_sph0_upper); - } - - /* CSG union of cone and end sphere */ - vfloat t_sph_lower = min(t_sph0_lower, t_sph1_lower); - vfloat t_cone_sphere_lower = min(t_cone_lower, t_sph_lower); -#if !defined (EMBREE_BACKFACE_CULLING_CURVES) - vfloat t_sph_upper = max(t_sph0_upper, t_sph1_upper); - vfloat t_cone_sphere_upper = max(t_cone_upper, t_sph_upper); - - /* filter out hits that are not in tnear/tfar range */ - const vbool valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat(pos_inf); - const vbool valid_upper = valid & ray_tnear <= dt+t_cone_sphere_upper & dt+t_cone_sphere_upper <= ray_tfar() & t_cone_sphere_upper != vfloat(neg_inf); - - /* check if there is a first hit */ - const vbool valid_first = valid_lower | valid_upper; - if (unlikely(none(valid_first))) - return false; - - /* construct first hit */ - const vfloat t_first = select(valid_lower, t_cone_sphere_lower, t_cone_sphere_upper); - const vbool cone_hit_first = t_first == t_cone_lower | t_first == t_cone_upper; - const vbool sph0_hit_first = t_first == t_sph0_lower | t_first == t_sph0_upper; - const Vec3vf Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower))); - const vfloat u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat(zero), vfloat(one))); - - /* invoke intersection filter for first hit */ - RoundLineIntersectorHitM hit(u_first,zero,dt+t_first,Ng_first); - const bool is_hit_first = epilog(valid_first, hit); - - /* check for possible second hits before potentially accepted hit */ - const vfloat t_second = t_cone_sphere_upper; - const vbool valid_second = valid_lower & valid_upper & (dt+t_cone_sphere_upper <= ray_tfar()); - if (unlikely(none(valid_second))) - return is_hit_first; - - /* invoke intersection filter for second hit */ - const vbool cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper; - const vbool sph0_hit_second = t_second == t_sph0_lower | t_second == t_sph0_upper; - const Vec3vf Ng_second = select(cone_hit_second, cone.Ng_cone(false), select (sph0_hit_second, cone.Ng_sphere0(false), cone.Ng_sphere1(false))); - const vfloat u_second = select(cone_hit_second, cone.u_cone(false), select (sph0_hit_second, vfloat(zero), vfloat(one))); - - hit = RoundLineIntersectorHitM(u_second,zero,dt+t_second,Ng_second); - const bool is_hit_second = epilog(valid_second, hit); - - return is_hit_first | is_hit_second; -#else - /* filter out hits that are not in tnear/tfar range */ - const vbool valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat(pos_inf); - - /* check if there is a valid hit */ - if (unlikely(none(valid_lower))) - return false; - - /* construct first hit */ - const vbool cone_hit_first = t_cone_sphere_lower == t_cone_lower | t_cone_sphere_lower == t_cone_upper; - const vbool sph0_hit_first = t_cone_sphere_lower == t_sph0_lower | t_cone_sphere_lower == t_sph0_upper; - const Vec3vf Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower))); - const vfloat u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat(zero), vfloat(one))); - - /* invoke intersection filter for first hit */ - RoundLineIntersectorHitM hit(u_first,zero,dt+t_cone_sphere_lower,Ng_first); - const bool is_hit_first = epilog(valid_lower, hit); - - return is_hit_first; -#endif - } - - } // end namespace __roundline_internal - - template - struct RoundLinearCurveIntersector1 - { - typedef CurvePrecalculations1 Precalculations; - - struct ray_tfar { - Ray& ray; - __forceinline ray_tfar(Ray& ray) : ray(ray) {} - __forceinline vfloat operator() () const { return ray.tfar; }; - }; - - template - static __forceinline bool intersect(const vbool& valid_i, - Ray& ray, - IntersectContext* context, - const LineSegments* geom, - const Precalculations& pre, - const Vec4vf& v0i, const Vec4vf& v1i, - const Vec4vf& vLi, const Vec4vf& vRi, - const Epilog& epilog) - { - const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); - const Vec3vf ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); - const vfloat ray_tnear(ray.tnear()); - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); - const Vec4vf vL = enlargeRadiusToMinWidth(context,geom,ray_org,vLi); - const Vec4vf vR = enlargeRadiusToMinWidth(context,geom,ray_org,vRi); - return __roundline_internal::intersectConeSphere(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,vL,vR,epilog); - } - }; - - template - struct RoundLinearCurveIntersectorK - { - typedef CurvePrecalculationsK Precalculations; - - struct ray_tfar { - RayK& ray; - size_t k; - __forceinline ray_tfar(RayK& ray, size_t k) : ray(ray), k(k) {} - __forceinline vfloat operator() () const { return ray.tfar[k]; }; - }; - - template - static __forceinline bool intersect(const vbool& valid_i, - RayK& ray, size_t k, - IntersectContext* context, - const LineSegments* geom, - const Precalculations& pre, - const Vec4vf& v0i, const Vec4vf& v1i, - const Vec4vf& vLi, const Vec4vf& vRi, - const Epilog& epilog) - { - const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); - const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); - const vfloat ray_tnear = ray.tnear()[k]; - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); - const Vec4vf vL = enlargeRadiusToMinWidth(context,geom,ray_org,vLi); - const Vec4vf vR = enlargeRadiusToMinWidth(context,geom,ray_org,vRi); - return __roundline_internal::intersectConeSphere(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,vL,vR,epilog); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h deleted file mode 100644 index 079817335e..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h +++ /dev/null @@ -1,136 +0,0 @@ -// ======================================================================== // -// Copyright 2009-2020 Intel Corporation // -// // -// Licensed under the Apache License, Version 2.0 (the "License"); // -// you may not use this file except in compliance with the License. // -// You may obtain a copy of the License at // -// // -// http://www.apache.org/licenses/LICENSE-2.0 // -// // -// Unless required by applicable law or agreed to in writing, software // -// distributed under the License is distributed on an "AS IS" BASIS, // -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // -// See the License for the specific language governing permissions and // -// limitations under the License. // -// ======================================================================== // - -#pragma once - -#include "roundline_intersector.h" -#include "intersector_epilog.h" - -namespace embree -{ - namespace isa - { - template - struct RoundLinearCurveMiIntersector1 - { - typedef LineMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); - const vbool valid = line.template valid(); - RoundLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); - const vbool valid = line.template valid(); - return RoundLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) - { - return PrimitivePointQuery1::pointQuery(query, context, line); - } - }; - - template - struct RoundLinearCurveMiMBIntersector1 - { - typedef LineMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()); - const vbool valid = line.template valid(); - RoundLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()); - const vbool valid = line.template valid(); - return RoundLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) - { - return PrimitivePointQuery1::pointQuery(query, context, line); - } - }; - - template - struct RoundLinearCurveMiIntersectorK - { - typedef LineMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); - const vbool valid = line.template valid(); - RoundLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); - const vbool valid = line.template valid(); - return RoundLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - }; - - template - struct RoundLinearCurveMiMBIntersectorK - { - typedef LineMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(normal.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]); - const vbool valid = line.template valid(); - RoundLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) - { - STAT3(shadow.trav_prims,1,1,1); - const LineSegments* geom = context->scene->get(line.geomID()); - Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]); - const vbool valid = line.template valid(); - return RoundLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h deleted file mode 100644 index 3ab90c29ef..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "../common/scene_points.h" -#include "curve_intersector_precalculations.h" - -namespace embree -{ - namespace isa - { - template - struct SphereIntersectorHitM - { - __forceinline SphereIntersectorHitM() {} - - __forceinline SphereIntersectorHitM(const vfloat& t, const Vec3vf& Ng) - : vt(t), vNg(Ng) {} - - __forceinline void finalize() {} - - __forceinline Vec2f uv(const size_t i) const { - return Vec2f(0.0f, 0.0f); - } - __forceinline float t(const size_t i) const { - return vt[i]; - } - __forceinline Vec3fa Ng(const size_t i) const { - return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); - } - - public: - vfloat vt; - Vec3vf vNg; - }; - - template - struct SphereIntersector1 - { - typedef CurvePrecalculations1 Precalculations; - - template - static __forceinline bool intersect( - const vbool& valid_i, Ray& ray, - const Precalculations& pre, const Vec4vf& v0, const Epilog& epilog) - { - vbool valid = valid_i; - - const vfloat rd2 = rcp(dot(ray.dir, ray.dir)); - const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); - const Vec3vf ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); - const Vec3vf center = v0.xyz(); - const vfloat radius = v0.w; - - const Vec3vf c0 = center - ray_org; - const vfloat projC0 = dot(c0, ray_dir) * rd2; - const Vec3vf perp = c0 - projC0 * ray_dir; - const vfloat l2 = dot(perp, perp); - const vfloat r2 = radius * radius; - valid &= (l2 <= r2); - if (unlikely(none(valid))) - return false; - - const vfloat td = sqrt((r2 - l2) * rd2); - const vfloat t_front = projC0 - td; - const vfloat t_back = projC0 + td; - - const vbool valid_front = valid & (ray.tnear() <= t_front) & (t_front <= ray.tfar); - const vbool valid_back = valid & (ray.tnear() <= t_back ) & (t_back <= ray.tfar); - - /* check if there is a first hit */ - const vbool valid_first = valid_front | valid_back; - if (unlikely(none(valid_first))) - return false; - - /* construct first hit */ - const vfloat td_front = -td; - const vfloat td_back = +td; - const vfloat t_first = select(valid_front, t_front, t_back); - const Vec3vf Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp; - SphereIntersectorHitM hit(t_first, Ng_first); - - /* invoke intersection filter for first hit */ - const bool is_hit_first = epilog(valid_first, hit); - - /* check for possible second hits before potentially accepted hit */ - const vfloat t_second = t_back; - const vbool valid_second = valid_front & valid_back & (t_second <= ray.tfar); - if (unlikely(none(valid_second))) - return is_hit_first; - - /* invoke intersection filter for second hit */ - const Vec3vf Ng_second = td_back * ray_dir - perp; - hit = SphereIntersectorHitM (t_second, Ng_second); - const bool is_hit_second = epilog(valid_second, hit); - - return is_hit_first | is_hit_second; - } - - template - static __forceinline bool intersect( - const vbool& valid_i, Ray& ray, IntersectContext* context, const Points* geom, - const Precalculations& pre, const Vec4vf& v0i, const Epilog& epilog) - { - const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - return intersect(valid_i,ray,pre,v0,epilog); - } - }; - - template - struct SphereIntersectorK - { - typedef CurvePrecalculationsK Precalculations; - - template - static __forceinline bool intersect(const vbool& valid_i, - RayK& ray, size_t k, - IntersectContext* context, - const Points* geom, - const Precalculations& pre, - const Vec4vf& v0i, - const Epilog& epilog) - { - vbool valid = valid_i; - - const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); - const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); - const vfloat rd2 = rcp(dot(ray_dir, ray_dir)); - - const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); - const Vec3vf center = v0.xyz(); - const vfloat radius = v0.w; - - const Vec3vf c0 = center - ray_org; - const vfloat projC0 = dot(c0, ray_dir) * rd2; - const Vec3vf perp = c0 - projC0 * ray_dir; - const vfloat l2 = dot(perp, perp); - const vfloat r2 = radius * radius; - valid &= (l2 <= r2); - if (unlikely(none(valid))) - return false; - - const vfloat td = sqrt((r2 - l2) * rd2); - const vfloat t_front = projC0 - td; - const vfloat t_back = projC0 + td; - - const vbool valid_front = valid & (ray.tnear()[k] <= t_front) & (t_front <= ray.tfar[k]); - const vbool valid_back = valid & (ray.tnear()[k] <= t_back ) & (t_back <= ray.tfar[k]); - - /* check if there is a first hit */ - const vbool valid_first = valid_front | valid_back; - if (unlikely(none(valid_first))) - return false; - - /* construct first hit */ - const vfloat td_front = -td; - const vfloat td_back = +td; - const vfloat t_first = select(valid_front, t_front, t_back); - const Vec3vf Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp; - SphereIntersectorHitM hit(t_first, Ng_first); - - /* invoke intersection filter for first hit */ - const bool is_hit_first = epilog(valid_first, hit); - - /* check for possible second hits before potentially accepted hit */ - const vfloat t_second = t_back; - const vbool valid_second = valid_front & valid_back & (t_second <= ray.tfar[k]); - if (unlikely(none(valid_second))) - return is_hit_first; - - /* invoke intersection filter for second hit */ - const Vec3vf Ng_second = td_back * ray_dir - perp; - hit = SphereIntersectorHitM (t_second, Ng_second); - const bool is_hit_second = epilog(valid_second, hit); - - return is_hit_first | is_hit_second; - } - }; - } // namespace isa -} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h deleted file mode 100644 index 1146847602..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "intersector_epilog.h" -#include "pointi.h" -#include "sphere_intersector.h" - -namespace embree -{ - namespace isa - { - template - struct SphereMiIntersector1 - { - typedef PointMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, - RayHit& ray, - IntersectContext* context, - const Primitive& sphere) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(sphere.geomID()); - Vec4vf v0; sphere.gather(v0, geom); - const vbool valid = sphere.template valid(); - SphereIntersector1::intersect( - valid, ray, context, geom, pre, v0, Intersect1EpilogM(ray, context, sphere.geomID(), sphere.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, - Ray& ray, - IntersectContext* context, - const Primitive& sphere) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(sphere.geomID()); - Vec4vf v0; sphere.gather(v0, geom); - const vbool valid = sphere.template valid(); - return SphereIntersector1::intersect( - valid, ray, context, geom, pre, v0, Occluded1EpilogM(ray, context, sphere.geomID(), sphere.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, - PointQueryContext* context, - const Primitive& sphere) - { - return PrimitivePointQuery1::pointQuery(query, context, sphere); - } - }; - - template - struct SphereMiMBIntersector1 - { - typedef PointMi Primitive; - typedef CurvePrecalculations1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, - RayHit& ray, - IntersectContext* context, - const Primitive& sphere) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(sphere.geomID()); - Vec4vf v0; sphere.gather(v0, geom, ray.time()); - const vbool valid = sphere.template valid(); - SphereIntersector1::intersect( - valid, ray, context, geom, pre, v0, Intersect1EpilogM(ray, context, sphere.geomID(), sphere.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, - Ray& ray, - IntersectContext* context, - const Primitive& sphere) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(sphere.geomID()); - Vec4vf v0; sphere.gather(v0, geom, ray.time()); - const vbool valid = sphere.template valid(); - return SphereIntersector1::intersect( - valid, ray, context, geom, pre, v0, Occluded1EpilogM(ray, context, sphere.geomID(), sphere.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, - PointQueryContext* context, - const Primitive& sphere) - { - return PrimitivePointQuery1::pointQuery(query, context, sphere); - } - }; - - template - struct SphereMiIntersectorK - { - typedef PointMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect( - const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& sphere) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(sphere.geomID()); - Vec4vf v0; sphere.gather(v0, geom); - const vbool valid = sphere.template valid(); - SphereIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, - Intersect1KEpilogM(ray, k, context, sphere.geomID(), sphere.primID())); - } - - static __forceinline bool occluded( - const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& sphere) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(sphere.geomID()); - Vec4vf v0; sphere.gather(v0, geom); - const vbool valid = sphere.template valid(); - return SphereIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, - Occluded1KEpilogM(ray, k, context, sphere.geomID(), sphere.primID())); - } - }; - - template - struct SphereMiMBIntersectorK - { - typedef PointMi Primitive; - typedef CurvePrecalculationsK Precalculations; - - static __forceinline void intersect( - const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& sphere) - { - STAT3(normal.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(sphere.geomID()); - Vec4vf v0; sphere.gather(v0, geom, ray.time()[k]); - const vbool valid = sphere.template valid(); - SphereIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, - Intersect1KEpilogM(ray, k, context, sphere.geomID(), sphere.primID())); - } - - static __forceinline bool occluded( - const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& sphere) - { - STAT3(shadow.trav_prims, 1, 1, 1); - const Points* geom = context->scene->get(sphere.geomID()); - Vec4vf v0; sphere.gather(v0, geom, ray.time()[k]); - const vbool valid = sphere.template valid(); - return SphereIntersectorK::intersect( - valid, ray, k, context, geom, pre, v0, - Occluded1KEpilogM(ray, k, context, sphere.geomID(), sphere.primID())); - } - }; - } // namespace isa -} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h deleted file mode 100644 index 94ad46ad87..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../geometry/primitive.h" -#include "../subdiv/subdivpatch1base.h" - -namespace embree -{ - - struct __aligned(64) SubdivPatch1 : public SubdivPatch1Base - { - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - - static Type type; - - public: - - /*! constructor for cached subdiv patch */ - SubdivPatch1 (const unsigned int gID, - const unsigned int pID, - const unsigned int subPatch, - const SubdivMesh *const mesh, - const size_t time, - const Vec2f uv[4], - const float edge_level[4], - const int subdiv[4], - const int simd_width) - : SubdivPatch1Base(gID,pID,subPatch,mesh,time,uv,edge_level,subdiv,simd_width) {} - }; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h deleted file mode 100644 index 74ec1de258..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "subdivpatch1.h" -#include "grid_soa.h" -#include "grid_soa_intersector1.h" -#include "grid_soa_intersector_packet.h" -#include "../common/ray.h" - -namespace embree -{ - namespace isa - { - template - class SubdivPatch1Precalculations : public T - { - public: - __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) - : T(ray,ptr) {} - }; - - template - class SubdivPatch1PrecalculationsK : public T - { - public: - __forceinline SubdivPatch1PrecalculationsK (const vbool& valid, RayK& ray) - : T(valid,ray) {} - }; - - class SubdivPatch1Intersector1 - { - public: - typedef GridSOA Primitive; - typedef SubdivPatch1Precalculations Precalculations; - - static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - lazy_node = prim->root(0); - pre.grid = (Primitive*)prim; - return false; - } - - /*! Intersect a ray with the primitive. */ - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) - { - if (likely(ty == 0)) GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node); - else processLazyNode(pre,context,prim,lazy_node); - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) { - intersect(This,pre,ray,context,prim,ty,tray,lazy_node); - } - - /*! Test if the ray is occluded by the primitive */ - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) - { - if (likely(ty == 0)) return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node); - else return processLazyNode(pre,context,prim,lazy_node); - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) { - return occluded(This,pre,ray,context,prim,ty,tray,lazy_node); - } - - template - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery &tquery, size_t& lazy_node) - { - // TODO: PointQuery implement - assert(false && "not implemented"); - return false; - } - - template - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery &tquery, size_t& lazy_node) { - return pointQuery(This,query,context,prim,ty,tquery,lazy_node); - } - }; - - class SubdivPatch1MBIntersector1 - { - public: - typedef SubdivPatch1 Primitive; - typedef GridSOAMBIntersector1::Precalculations Precalculations; - - static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) - { - Primitive* prim = (Primitive*) prim_i; - GridSOA* grid = nullptr; - grid = (GridSOA*) prim->root_ref.get(); - pre.itime = getTimeSegment(ray.time(), float(grid->time_steps-1), pre.ftime); - lazy_node = grid->root(pre.itime); - pre.grid = grid; - return false; - } - - /*! Intersect a ray with the primitive. */ - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) - { - if (likely(ty == 0)) GridSOAMBIntersector1::intersect(pre,ray,context,prim,lazy_node); - else processLazyNode(pre,ray,context,prim,lazy_node); - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) { - intersect(This,pre,ray,context,prim,ty,tray,lazy_node); - } - - /*! Test if the ray is occluded by the primitive */ - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) - { - if (likely(ty == 0)) return GridSOAMBIntersector1::occluded(pre,ray,context,prim,lazy_node); - else return processLazyNode(pre,ray,context,prim,lazy_node); - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) { - return occluded(This,pre,ray,context,prim,ty,tray,lazy_node); - } - - template - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery &tquery, size_t& lazy_node) - { - // TODO: PointQuery implement - assert(false && "not implemented"); - return false; - } - - template - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery &tquery, size_t& lazy_node) { - return pointQuery(This,query,context,prim,ty,tquery,lazy_node); - } - }; - - template - struct SubdivPatch1IntersectorK - { - typedef GridSOA Primitive; - typedef SubdivPatch1PrecalculationsK::Precalculations> Precalculations; - - static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) - { - lazy_node = prim->root(0); - pre.grid = (Primitive*)prim; - return false; - } - - template - static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK &tray, size_t& lazy_node) - { - if (likely(ty == 0)) GridSOAIntersectorK::intersect(valid,pre,ray,context,prim,lazy_node); - else processLazyNode(pre,context,prim,lazy_node); - } - - template - static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK &tray, size_t& lazy_node) - { - if (likely(ty == 0)) return GridSOAIntersectorK::occluded(valid,pre,ray,context,prim,lazy_node); - else return processLazyNode(pre,context,prim,lazy_node); - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) - { - if (likely(ty == 0)) GridSOAIntersectorK::intersect(pre,ray,k,context,prim,lazy_node); - else processLazyNode(pre,context,prim,lazy_node); - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) - { - if (likely(ty == 0)) return GridSOAIntersectorK::occluded(pre,ray,k,context,prim,lazy_node); - else return processLazyNode(pre,context,prim,lazy_node); - } - }; - - typedef SubdivPatch1IntersectorK<4> SubdivPatch1Intersector4; - typedef SubdivPatch1IntersectorK<8> SubdivPatch1Intersector8; - typedef SubdivPatch1IntersectorK<16> SubdivPatch1Intersector16; - - template - struct SubdivPatch1MBIntersectorK - { - typedef SubdivPatch1 Primitive; - //typedef GridSOAMBIntersectorK::Precalculations Precalculations; - typedef SubdivPatch1PrecalculationsK::Precalculations> Precalculations; - - static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) - { - Primitive* prim = (Primitive*) prim_i; - GridSOA* grid = (GridSOA*) prim->root_ref.get(); - lazy_node = grid->troot; - pre.grid = grid; - return false; - } - - template - static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK &tray, size_t& lazy_node) - { - if (likely(ty == 0)) GridSOAMBIntersectorK::intersect(valid,pre,ray,context,prim,lazy_node); - else processLazyNode(pre,context,prim,lazy_node); - } - - template - static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK &tray, size_t& lazy_node) - { - if (likely(ty == 0)) return GridSOAMBIntersectorK::occluded(valid,pre,ray,context,prim,lazy_node); - else return processLazyNode(pre,context,prim,lazy_node); - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) - { - if (likely(ty == 0)) GridSOAMBIntersectorK::intersect(pre,ray,k,context,prim,lazy_node); - else processLazyNode(pre,context,prim,lazy_node); - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) - { - if (likely(ty == 0)) return GridSOAMBIntersectorK::occluded(pre,ray,k,context,prim,lazy_node); - else return processLazyNode(pre,context,prim,lazy_node); - } - }; - - typedef SubdivPatch1MBIntersectorK<4> SubdivPatch1MBIntersector4; - typedef SubdivPatch1MBIntersectorK<8> SubdivPatch1MBIntersector8; - typedef SubdivPatch1MBIntersectorK<16> SubdivPatch1MBIntersector16; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid.h deleted file mode 100644 index 39fa6fb0f0..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/subgrid.h +++ /dev/null @@ -1,517 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/ray.h" -#include "../common/scene_grid_mesh.h" -#include "../bvh/bvh.h" - -namespace embree -{ - /* Stores M quads from an indexed face set */ - struct SubGrid - { - /* Virtual interface to query information about the quad type */ - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* primitive supports multiple time segments */ - static const bool singleTimeSegment = false; - - /* Returns maximum number of stored quads */ - static __forceinline size_t max_size() { return 1; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } - - public: - - /* Default constructor */ - __forceinline SubGrid() { } - - /* Construction from vertices and IDs */ - __forceinline SubGrid(const unsigned int x, - const unsigned int y, - const unsigned int geomID, - const unsigned int primID) - : _x(x), _y(y), _geomID(geomID), _primID(primID) - { - } - - __forceinline bool invalid3x3X() const { return (unsigned int)_x & (1<<15); } - __forceinline bool invalid3x3Y() const { return (unsigned int)_y & (1<<15); } - - /* Gather the quads */ - __forceinline void gather(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - Vec3vf4& p3, - const GridMesh* const mesh, - const GridMesh::Grid &g) const - { - /* first quad always valid */ - const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; - const size_t vtxID01 = vtxID00 + 1; - const vfloat4 vtx00 = vfloat4::loadu(mesh->vertexPtr(vtxID00)); - const vfloat4 vtx01 = vfloat4::loadu(mesh->vertexPtr(vtxID01)); - const size_t vtxID10 = vtxID00 + g.lineVtxOffset; - const size_t vtxID11 = vtxID01 + g.lineVtxOffset; - const vfloat4 vtx10 = vfloat4::loadu(mesh->vertexPtr(vtxID10)); - const vfloat4 vtx11 = vfloat4::loadu(mesh->vertexPtr(vtxID11)); - - /* deltaX => vtx02, vtx12 */ - const size_t deltaX = invalid3x3X() ? 0 : 1; - const size_t vtxID02 = vtxID01 + deltaX; - const vfloat4 vtx02 = vfloat4::loadu(mesh->vertexPtr(vtxID02)); - const size_t vtxID12 = vtxID11 + deltaX; - const vfloat4 vtx12 = vfloat4::loadu(mesh->vertexPtr(vtxID12)); - - /* deltaY => vtx20, vtx21 */ - const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; - const size_t vtxID20 = vtxID10 + deltaY; - const size_t vtxID21 = vtxID11 + deltaY; - const vfloat4 vtx20 = vfloat4::loadu(mesh->vertexPtr(vtxID20)); - const vfloat4 vtx21 = vfloat4::loadu(mesh->vertexPtr(vtxID21)); - - /* deltaX/deltaY => vtx22 */ - const size_t vtxID22 = vtxID11 + deltaX + deltaY; - const vfloat4 vtx22 = vfloat4::loadu(mesh->vertexPtr(vtxID22)); - - transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z); - transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z); - transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z); - transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z); - } - - template - __forceinline vfloat4 getVertexMB(const GridMesh* const mesh, const size_t offset, const size_t itime, const float ftime) const - { - const T v0 = T::loadu(mesh->vertexPtr(offset,itime+0)); - const T v1 = T::loadu(mesh->vertexPtr(offset,itime+1)); - return lerp(v0,v1,ftime); - } - - /* Gather the quads */ - __forceinline void gatherMB(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - Vec3vf4& p3, - const GridMesh* const mesh, - const GridMesh::Grid &g, - const size_t itime, - const float ftime) const - { - /* first quad always valid */ - const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; - const size_t vtxID01 = vtxID00 + 1; - const vfloat4 vtx00 = getVertexMB(mesh,vtxID00,itime,ftime); - const vfloat4 vtx01 = getVertexMB(mesh,vtxID01,itime,ftime); - const size_t vtxID10 = vtxID00 + g.lineVtxOffset; - const size_t vtxID11 = vtxID01 + g.lineVtxOffset; - const vfloat4 vtx10 = getVertexMB(mesh,vtxID10,itime,ftime); - const vfloat4 vtx11 = getVertexMB(mesh,vtxID11,itime,ftime); - - /* deltaX => vtx02, vtx12 */ - const size_t deltaX = invalid3x3X() ? 0 : 1; - const size_t vtxID02 = vtxID01 + deltaX; - const vfloat4 vtx02 = getVertexMB(mesh,vtxID02,itime,ftime); - const size_t vtxID12 = vtxID11 + deltaX; - const vfloat4 vtx12 = getVertexMB(mesh,vtxID12,itime,ftime); - - /* deltaY => vtx20, vtx21 */ - const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; - const size_t vtxID20 = vtxID10 + deltaY; - const size_t vtxID21 = vtxID11 + deltaY; - const vfloat4 vtx20 = getVertexMB(mesh,vtxID20,itime,ftime); - const vfloat4 vtx21 = getVertexMB(mesh,vtxID21,itime,ftime); - - /* deltaX/deltaY => vtx22 */ - const size_t vtxID22 = vtxID11 + deltaX + deltaY; - const vfloat4 vtx22 = getVertexMB(mesh,vtxID22,itime,ftime); - - transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z); - transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z); - transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z); - transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z); - } - - - - /* Gather the quads */ - __forceinline void gather(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - Vec3vf4& p3, - const Scene *const scene) const - { - const GridMesh* const mesh = scene->get(geomID()); - const GridMesh::Grid &g = mesh->grid(primID()); - gather(p0,p1,p2,p3,mesh,g); - } - - /* Gather the quads in the motion blur case */ - __forceinline void gatherMB(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - Vec3vf4& p3, - const Scene *const scene, - const size_t itime, - const float ftime) const - { - const GridMesh* const mesh = scene->get(geomID()); - const GridMesh::Grid &g = mesh->grid(primID()); - gatherMB(p0,p1,p2,p3,mesh,g,itime,ftime); - } - - /* Gather the quads */ - __forceinline void gather(Vec3fa vtx[16], const Scene *const scene) const - { - const GridMesh* mesh = scene->get(geomID()); - const GridMesh::Grid &g = mesh->grid(primID()); - - /* first quad always valid */ - const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; - const size_t vtxID01 = vtxID00 + 1; - const Vec3fa vtx00 = Vec3fa::loadu(mesh->vertexPtr(vtxID00)); - const Vec3fa vtx01 = Vec3fa::loadu(mesh->vertexPtr(vtxID01)); - const size_t vtxID10 = vtxID00 + g.lineVtxOffset; - const size_t vtxID11 = vtxID01 + g.lineVtxOffset; - const Vec3fa vtx10 = Vec3fa::loadu(mesh->vertexPtr(vtxID10)); - const Vec3fa vtx11 = Vec3fa::loadu(mesh->vertexPtr(vtxID11)); - - /* deltaX => vtx02, vtx12 */ - const size_t deltaX = invalid3x3X() ? 0 : 1; - const size_t vtxID02 = vtxID01 + deltaX; - const Vec3fa vtx02 = Vec3fa::loadu(mesh->vertexPtr(vtxID02)); - const size_t vtxID12 = vtxID11 + deltaX; - const Vec3fa vtx12 = Vec3fa::loadu(mesh->vertexPtr(vtxID12)); - - /* deltaY => vtx20, vtx21 */ - const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; - const size_t vtxID20 = vtxID10 + deltaY; - const size_t vtxID21 = vtxID11 + deltaY; - const Vec3fa vtx20 = Vec3fa::loadu(mesh->vertexPtr(vtxID20)); - const Vec3fa vtx21 = Vec3fa::loadu(mesh->vertexPtr(vtxID21)); - - /* deltaX/deltaY => vtx22 */ - const size_t vtxID22 = vtxID11 + deltaX + deltaY; - const Vec3fa vtx22 = Vec3fa::loadu(mesh->vertexPtr(vtxID22)); - - vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10; - vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11; - vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20; - vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21; - } - - /* Gather the quads */ - __forceinline void gatherMB(vfloat4 vtx[16], const Scene *const scene, const size_t itime, const float ftime) const - { - const GridMesh* mesh = scene->get(geomID()); - const GridMesh::Grid &g = mesh->grid(primID()); - - /* first quad always valid */ - const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; - const size_t vtxID01 = vtxID00 + 1; - const vfloat4 vtx00 = getVertexMB(mesh,vtxID00,itime,ftime); - const vfloat4 vtx01 = getVertexMB(mesh,vtxID01,itime,ftime); - const size_t vtxID10 = vtxID00 + g.lineVtxOffset; - const size_t vtxID11 = vtxID01 + g.lineVtxOffset; - const vfloat4 vtx10 = getVertexMB(mesh,vtxID10,itime,ftime); - const vfloat4 vtx11 = getVertexMB(mesh,vtxID11,itime,ftime); - - /* deltaX => vtx02, vtx12 */ - const size_t deltaX = invalid3x3X() ? 0 : 1; - const size_t vtxID02 = vtxID01 + deltaX; - const vfloat4 vtx02 = getVertexMB(mesh,vtxID02,itime,ftime); - const size_t vtxID12 = vtxID11 + deltaX; - const vfloat4 vtx12 = getVertexMB(mesh,vtxID12,itime,ftime); - - /* deltaY => vtx20, vtx21 */ - const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; - const size_t vtxID20 = vtxID10 + deltaY; - const size_t vtxID21 = vtxID11 + deltaY; - const vfloat4 vtx20 = getVertexMB(mesh,vtxID20,itime,ftime); - const vfloat4 vtx21 = getVertexMB(mesh,vtxID21,itime,ftime); - - /* deltaX/deltaY => vtx22 */ - const size_t vtxID22 = vtxID11 + deltaX + deltaY; - const vfloat4 vtx22 = getVertexMB(mesh,vtxID22,itime,ftime); - - vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10; - vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11; - vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20; - vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21; - } - - - /* Calculate the bounds of the subgrid */ - __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const - { - BBox3fa bounds = empty; - FATAL("not implemented yet"); - return bounds; - } - - /* Calculate the linear bounds of the primitive */ - __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) - { - return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); - } - - __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) - { - LBBox3fa allBounds = empty; - FATAL("not implemented yet"); - return allBounds; - } - - __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) - { - LBBox3fa allBounds = empty; - FATAL("not implemented yet"); - return allBounds; - } - - - friend embree_ostream operator<<(embree_ostream cout, const SubGrid& sg) { - return cout << "SubGrid " << " ( x " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << " )"; - } - - __forceinline unsigned int geomID() const { return _geomID; } - __forceinline unsigned int primID() const { return _primID; } - __forceinline unsigned int x() const { return (unsigned int)_x & 0x7fff; } - __forceinline unsigned int y() const { return (unsigned int)_y & 0x7fff; } - - private: - unsigned short _x; - unsigned short _y; - unsigned int _geomID; // geometry ID of mesh - unsigned int _primID; // primitive ID of primitive inside mesh - }; - - struct SubGridID { - unsigned short x; - unsigned short y; - unsigned int primID; - - __forceinline SubGridID() {} - __forceinline SubGridID(const unsigned int x, const unsigned int y, const unsigned int primID) : - x(x), y(y), primID(primID) {} - }; - - /* QuantizedBaseNode as large subgrid leaf */ - template - struct SubGridQBVHN - { - /* Virtual interface to query information about the quad type */ - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - __forceinline size_t size() const - { - for (size_t i=0;i::AABBNode node; - node.clear(); - for (size_t i=0;i::QuantizedBaseNode qnode; - - unsigned int _geomID; // geometry ID of mesh - - - friend embree_ostream operator<<(embree_ostream cout, const SubGridQBVHN& sg) { - cout << "SubGridQBVHN " << embree_endl; - for (size_t i=0;i - typename SubGridQBVHN::Type SubGridQBVHN::type; - - typedef SubGridQBVHN<4> SubGridQBVH4; - typedef SubGridQBVHN<8> SubGridQBVH8; - - - /* QuantizedBaseNode as large subgrid leaf */ - template - struct SubGridMBQBVHN - { - /* Virtual interface to query information about the quad type */ - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - __forceinline size_t size() const - { - for (size_t i=0;i::AABBNode node0,node1; - node0.clear(); - node1.clear(); - for (size_t i=0;i - __forceinline vfloat adjustTime(const vfloat &t) const { return time_scale * (t-time_offset); } - - public: - SubGridID subgridIDs[N]; - - typename BVHN::QuantizedBaseNodeMB qnode; - - float time_offset; - float time_scale; - unsigned int _geomID; // geometry ID of mesh - - - friend embree_ostream operator<<(embree_ostream cout, const SubGridMBQBVHN& sg) { - cout << "SubGridMBQBVHN " << embree_endl; - for (size_t i=0;i - struct SubGridIntersector1Moeller - { - typedef SubGridQBVHN Primitive; - typedef SubGridQuadMIntersector1MoellerTrumbore<4,filter> Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(normal.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); - pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(shadow.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); - return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) - { - STAT3(point_query.trav_prims,1,1,1); - AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID()); - assert(accel); - context->geomID = subgrid.geomID(); - context->primID = subgrid.primID(); - return accel->pointQuery(query, context); - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - - for (size_t i=0;i dist; - size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); -#if defined(__AVX__) - STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); -#endif - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - - if (unlikely(dist[ID] > ray.tfar)) continue; - intersect(pre,ray,context,prim[i].subgrid(ID)); - } - } - } - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - - { - BVHNQuantizedBaseNodeIntersector1 isec1; - - for (size_t i=0;i dist; - size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - - if (occluded(pre,ray,context,prim[i].subgrid(ID))) - return true; - } - } - return false; - } - - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery &tquery, size_t& lazy_node) - { - bool changed = false; - for (size_t i=0;i dist; - size_t mask; - if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { - mask = BVHNQuantizedBaseNodePointQuerySphere1::pointQuery(&prim[i].qnode,tquery,dist); - } else { - mask = BVHNQuantizedBaseNodePointQueryAABB1::pointQuery(&prim[i].qnode,tquery,dist); - } - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - changed |= pointQuery(query, context, prim[i].subgrid(ID)); - } - } - return changed; - } - }; - - template - struct SubGridIntersector1Pluecker - { - typedef SubGridQBVHN Primitive; - typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(normal.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); - pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(shadow.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); - return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) - { - STAT3(point_query.trav_prims,1,1,1); - AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID()); - context->geomID = subgrid.geomID(); - context->primID = subgrid.primID(); - return accel->pointQuery(query, context); - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - - for (size_t i=0;i dist; - size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); -#if defined(__AVX__) - STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); -#endif - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - - if (unlikely(dist[ID] > ray.tfar)) continue; - intersect(pre,ray,context,prim[i].subgrid(ID)); - } - } - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - - for (size_t i=0;i dist; - size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - - if (occluded(pre,ray,context,prim[i].subgrid(ID))) - return true; - } - } - return false; - } - - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery &tquery, size_t& lazy_node) - { - bool changed = false; - for (size_t i=0;i dist; - size_t mask; - if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { - mask = BVHNQuantizedBaseNodePointQuerySphere1::pointQuery(&prim[i].qnode,tquery,dist); - } else { - mask = BVHNQuantizedBaseNodePointQueryAABB1::pointQuery(&prim[i].qnode,tquery,dist); - } -#if defined(__AVX__) - STAT3(point_query.trav_hit_boxes[popcnt(mask)],1,1,1); -#endif - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - changed |= pointQuery(query, context, prim[i].subgrid(ID)); - } - } - return changed; - } - }; - - template - struct SubGridIntersectorKMoeller - { - typedef SubGridQBVHN Primitive; - typedef SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> Precalculations; - - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const SubGrid& subgrid) - { - Vec3fa vtx[16]; - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - subgrid.gather(vtx,context->scene); - for (unsigned int i=0; i<4; i++) - { - const Vec3vf p0 = vtx[i*4+0]; - const Vec3vf p1 = vtx[i*4+1]; - const Vec3vf p2 = vtx[i*4+2]; - const Vec3vf p3 = vtx[i*4+3]; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i)); - } - } - - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const SubGrid& subgrid) - { - vbool valid0 = valid_i; - Vec3fa vtx[16]; - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - subgrid.gather(vtx,context->scene); - for (unsigned int i=0; i<4; i++) - { - const Vec3vf p0 = vtx[i*4+0]; - const Vec3vf p1 = vtx[i*4+1]; - const Vec3vf p2 = vtx[i*4+2]; - const Vec3vf p3 = vtx[i*4+3]; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) - break; - } - return !valid0; - } - - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(normal.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); - pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); - } - - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(shadow.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); - return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); - } - - template - static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersectorK isecK; - for (size_t j=0;j dist; - while(m_valid) - { - const size_t i = bscf(m_valid); - if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; - intersect(valid,pre,ray,context,prim[j].subgrid(i)); - } - } - } - - template - static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersectorK isecK; - vbool valid0 = valid; - for (size_t j=0;j dist; - while(m_valid) - { - const size_t i = bscf(m_valid); - if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; - valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); - if (none(valid0)) break; - } - } - return !valid0; - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - - for (size_t i=0;i dist; - size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - - if (unlikely(dist[ID] > ray.tfar[k])) continue; - intersect(pre,ray,k,context,prim[i].subgrid(ID)); - } - } - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - - for (size_t i=0;i dist; - size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - - if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) - return true; - } - } - return false; - } - }; - - - template - struct SubGridIntersectorKPluecker - { - typedef SubGridQBVHN Primitive; - typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations; - - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const SubGrid& subgrid) - { - Vec3fa vtx[16]; - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - subgrid.gather(vtx,context->scene); - for (unsigned int i=0; i<4; i++) - { - const Vec3vf p0 = vtx[i*4+0]; - const Vec3vf p1 = vtx[i*4+1]; - const Vec3vf p2 = vtx[i*4+2]; - const Vec3vf p3 = vtx[i*4+3]; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i)); - } - } - - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const SubGrid& subgrid) - { - vbool valid0 = valid_i; - Vec3fa vtx[16]; - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - subgrid.gather(vtx,context->scene); - for (unsigned int i=0; i<4; i++) - { - const Vec3vf p0 = vtx[i*4+0]; - const Vec3vf p1 = vtx[i*4+1]; - const Vec3vf p2 = vtx[i*4+2]; - const Vec3vf p3 = vtx[i*4+3]; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) - break; - } - return !valid0; - } - - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(normal.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); - pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); - } - - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(shadow.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); - return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); - } - - template - static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersectorK isecK; - for (size_t j=0;j dist; - while(m_valid) - { - const size_t i = bscf(m_valid); - if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; - intersect(valid,pre,ray,context,prim[j].subgrid(i)); - } - } - } - - template - static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersectorK isecK; - vbool valid0 = valid; - for (size_t j=0;j dist; - while(m_valid) - { - const size_t i = bscf(m_valid); - if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; - valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); - if (none(valid0)) break; - } - } - return !valid0; - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - - for (size_t i=0;i dist; - size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - - if (unlikely(dist[ID] > ray.tfar[k])) continue; - intersect(pre,ray,k,context,prim[i].subgrid(ID)); - } - } - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - - for (size_t i=0;i dist; - size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); - while(mask != 0) - { - const size_t ID = bscf(mask); - assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); - - if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) - return true; - } - } - return false; - } - }; - - - - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h deleted file mode 100644 index f65b4abf61..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h +++ /dev/null @@ -1,493 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "subgrid.h" -#include "quad_intersector_moeller.h" - -namespace embree -{ - namespace isa - { - - /* ----------------------------- */ - /* -- single ray intersectors -- */ - /* ----------------------------- */ - - template - __forceinline void interpolateUV(MoellerTrumboreHitM &hit,const GridMesh::Grid &g, const SubGrid& subgrid) - { - /* correct U,V interpolation across the entire grid */ - const vint sx((int)subgrid.x()); - const vint sy((int)subgrid.y()); - const vint sxM(sx + vint(0,1,1,0)); - const vint syM(sy + vint(0,0,1,1)); - const float inv_resX = rcp((float)((int)g.resX-1)); - const float inv_resY = rcp((float)((int)g.resY-1)); - hit.U = (hit.U + (vfloat)sxM * hit.absDen) * inv_resX; - hit.V = (hit.V + (vfloat)syM * hit.absDen) * inv_resY; - } - - template - struct SubGridQuadMIntersector1MoellerTrumbore; - - template - struct SubGridQuadMIntersector1MoellerTrumbore - { - __forceinline SubGridQuadMIntersector1MoellerTrumbore() {} - - __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} - - __forceinline void intersect(RayHit& ray, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const GridMesh::Grid &g, const SubGrid& subgrid) const - { - MoellerTrumboreHitM hit; - MoellerTrumboreIntersector1 intersector(ray,nullptr); - Intersect1EpilogMU epilog(ray,context,subgrid.geomID(),subgrid.primID()); - - /* intersect first triangle */ - if (intersector.intersect(ray,v0,v1,v3,hit)) - { - interpolateUV(hit,g,subgrid); - epilog(hit.valid,hit); - } - - /* intersect second triangle */ - if (intersector.intersect(ray,v2,v3,v1,hit)) - { - hit.U = hit.absDen - hit.U; - hit.V = hit.absDen - hit.V; - interpolateUV(hit,g,subgrid); - epilog(hit.valid,hit); - } - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const GridMesh::Grid &g, const SubGrid& subgrid) const - { - MoellerTrumboreHitM hit; - MoellerTrumboreIntersector1 intersector(ray,nullptr); - Occluded1EpilogMU epilog(ray,context,subgrid.geomID(),subgrid.primID()); - - /* intersect first triangle */ - if (intersector.intersect(ray,v0,v1,v3,hit)) - { - interpolateUV(hit,g,subgrid); - if (epilog(hit.valid,hit)) - return true; - } - - /* intersect second triangle */ - if (intersector.intersect(ray,v2,v3,v1,hit)) - { - hit.U = hit.absDen - hit.U; - hit.V = hit.absDen - hit.V; - interpolateUV(hit,g,subgrid); - if (epilog(hit.valid,hit)) - return true; - } - return false; - } - }; - -#if defined (__AVX__) - - /*! Intersects 4 quads with 1 ray using AVX */ - template - struct SubGridQuadMIntersector1MoellerTrumbore<4,filter> - { - __forceinline SubGridQuadMIntersector1MoellerTrumbore() {} - - __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} - - template - __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const - { - const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); - const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); -#else - const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); - const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); -#endif - MoellerTrumboreHitM<8> hit; - MoellerTrumboreIntersector1<8> intersector(ray,nullptr); - const vbool8 flags(0,0,0,0,1,1,1,1); - if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit))) - { - vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen; - -#if !defined(EMBREE_BACKFACE_CULLING) - hit.U = select(flags,absDen-V,U); - hit.V = select(flags,absDen-U,V); - hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); -#else - hit.U = select(flags,absDen-U,U); - hit.V = select(flags,absDen-V,V); -#endif - /* correct U,V interpolation across the entire grid */ - const vint8 sx((int)subgrid.x()); - const vint8 sy((int)subgrid.y()); - const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0)); - const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1)); - const float inv_resX = rcp((float)((int)g.resX-1)); - const float inv_resY = rcp((float)((int)g.resY-1)); - hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX; - hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY; - - if (unlikely(epilog(hit.valid,hit))) - return true; - } - return false; - } - - __forceinline bool intersect(RayHit& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const GridMesh::Grid &g, const SubGrid& subgrid) const - { - return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const GridMesh::Grid &g, const SubGrid& subgrid) const - { - return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); - } - }; - -#endif - - // ============================================================================================================================ - // ============================================================================================================================ - // ============================================================================================================================ - - - /* ----------------------------- */ - /* -- ray packet intersectors -- */ - /* ----------------------------- */ - - template - struct SubGridQuadHitK - { - __forceinline SubGridQuadHitK(const vfloat& U, - const vfloat& V, - const vfloat& T, - const vfloat& absDen, - const Vec3vf& Ng, - const vbool& flags, - const GridMesh::Grid &g, - const SubGrid& subgrid, - const unsigned int i) - : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {} - - __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const - { - const vfloat rcpAbsDen = rcp(absDen); - const vfloat t = T * rcpAbsDen; - const vfloat u0 = min(U * rcpAbsDen,1.0f); - const vfloat v0 = min(V * rcpAbsDen,1.0f); - const vfloat u1 = vfloat(1.0f) - u0; - const vfloat v1 = vfloat(1.0f) - v0; - const vfloat uu = select(flags,u1,u0); - const vfloat vv = select(flags,v1,v0); - const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); - const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); - const float inv_resX = rcp((float)(int)(g.resX-1)); - const float inv_resY = rcp((float)(int)(g.resY-1)); - const vfloat u = (uu + (float)(int)sx) * inv_resX; - const vfloat v = (vv + (float)(int)sy) * inv_resY; - const Vec3vf Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); - return std::make_tuple(u,v,t,Ng); - } - - private: - const vfloat U; - const vfloat V; - const vfloat T; - const vfloat absDen; - const vbool flags; - const Vec3vf tri_Ng; - - const GridMesh::Grid &g; - const SubGrid& subgrid; - const size_t i; - }; - - template - struct SubGridQuadMIntersectorKMoellerTrumboreBase - { - __forceinline SubGridQuadMIntersectorKMoellerTrumboreBase(const vbool& valid, const RayK& ray) {} - - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Vec3vf& tri_Ng, - const vbool& flags, - const GridMesh::Grid &g, - const SubGrid &subgrid, - const unsigned int i, - const Epilog& epilog) const - { - /* calculate denominator */ - vbool valid = valid0; - const Vec3vf C = tri_v0 - ray.org; - const Vec3vf R = cross(C,ray.dir); - const vfloat den = dot(tri_Ng,ray.dir); - const vfloat absDen = abs(den); - const vfloat sgnDen = signmsk(den); - - /* test against edge p2 p0 */ - const vfloat U = dot(R,tri_e2) ^ sgnDen; - valid &= U >= 0.0f; - if (likely(none(valid))) return false; - - /* test against edge p0 p1 */ - const vfloat V = dot(R,tri_e1) ^ sgnDen; - valid &= V >= 0.0f; - if (likely(none(valid))) return false; - - /* test against edge p1 p2 */ - const vfloat W = absDen-U-V; - valid &= W >= 0.0f; - if (likely(none(valid))) return false; - - /* perform depth test */ - const vfloat T = dot(tri_Ng,C) ^ sgnDen; - valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar); - if (unlikely(none(valid))) return false; - - /* perform backface culling */ -#if defined(EMBREE_BACKFACE_CULLING) - valid &= den < vfloat(zero); - if (unlikely(none(valid))) return false; -#else - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; -#endif - - /* calculate hit information */ - SubGridQuadHitK hit(U,V,T,absDen,tri_Ng,flags,g,subgrid,i); - return epilog(valid,hit); - } - - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const vbool& flags, - const GridMesh::Grid &g, - const SubGrid &subgrid, - const unsigned int i, - const Epilog& epilog) const - { - const Vec3vf e1 = tri_v0-tri_v1; - const Vec3vf e2 = tri_v2-tri_v0; - const Vec3vf Ng = cross(e2,e1); - return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,g,subgrid,i,epilog); - } - - template - __forceinline bool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Vec3vf& v3, - const GridMesh::Grid &g, - const SubGrid &subgrid, - const unsigned int i, - const Epilog& epilog) const - { - intersectK(valid0,ray,v0,v1,v3,vbool(false),g,subgrid,i,epilog); - if (none(valid0)) return true; - intersectK(valid0,ray,v2,v3,v1,vbool(true ),g,subgrid,i,epilog); - return none(valid0); - } - - static __forceinline bool intersect1(RayK& ray, - size_t k, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Vec3vf& tri_Ng, - MoellerTrumboreHitM &hit) - { - /* calculate denominator */ - const Vec3vf O = broadcast>(ray.org,k); - const Vec3vf D = broadcast>(ray.dir,k); - const Vec3vf C = Vec3vf(tri_v0) - O; - const Vec3vf R = cross(C,D); - const vfloat den = dot(Vec3vf(tri_Ng),D); - const vfloat absDen = abs(den); - const vfloat sgnDen = signmsk(den); - - /* perform edge tests */ - const vfloat U = dot(R,Vec3vf(tri_e2)) ^ sgnDen; - const vfloat V = dot(R,Vec3vf(tri_e1)) ^ sgnDen; - - /* perform backface culling */ -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#else - vbool valid = (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#endif - if (likely(none(valid))) return false; - - /* perform depth test */ - const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; - valid &= (absDen*vfloat(ray.tnear()[k]) < T) & (T <= absDen*vfloat(ray.tfar[k])); - if (likely(none(valid))) return false; - - /* calculate hit information */ - new (&hit) MoellerTrumboreHitM(valid,U,V,T,absDen,tri_Ng); - return true; - } - - static __forceinline bool intersect1(RayK& ray, - size_t k, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - MoellerTrumboreHitM &hit) - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - const Vec3vf Ng = cross(e2,e1); - return intersect1(ray,k,v0,e1,e2,Ng,hit); - } - - }; - - template - struct SubGridQuadMIntersectorKMoellerTrumbore : public SubGridQuadMIntersectorKMoellerTrumboreBase - { - __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) - : SubGridQuadMIntersectorKMoellerTrumboreBase(valid,ray) {} - - __forceinline void intersect1(RayHitK& ray, size_t k, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const - { - Intersect1KEpilogMU epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); - - MoellerTrumboreHitM<4> hit; - if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,hit)) - { - interpolateUV(hit,g,subgrid); - epilog(hit.valid,hit); - } - - if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,hit)) - { - hit.U = hit.absDen - hit.U; - hit.V = hit.absDen - hit.V; - interpolateUV(hit,g,subgrid); - epilog(hit.valid,hit); - } - - } - - __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const - { - Occluded1KEpilogMU epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); - - MoellerTrumboreHitM<4> hit; - if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,hit)) - { - interpolateUV(hit,g,subgrid); - if (epilog(hit.valid,hit)) return true; - } - - if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,hit)) - { - hit.U = hit.absDen - hit.U; - hit.V = hit.absDen - hit.V; - interpolateUV(hit,g,subgrid); - if (epilog(hit.valid,hit)) return true; - } - return false; - } - }; - - -#if defined (__AVX__) - - /*! Intersects 4 quads with 1 ray using AVX */ - template - struct SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> : public SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter> - { - __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) - : SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} - - template - __forceinline bool intersect1(RayK& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const - { - const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); - const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); -#else - const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); - const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); -#endif - const vbool8 flags(0,0,0,0,1,1,1,1); - - MoellerTrumboreHitM<8> hit; - if (SubGridQuadMIntersectorKMoellerTrumboreBase<8,K,filter>::intersect1(ray,k,vtx0,vtx1,vtx2,hit)) - { - vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen; -#if !defined(EMBREE_BACKFACE_CULLING) - hit.U = select(flags,absDen-V,U); - hit.V = select(flags,absDen-U,V); - hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); -#else - hit.U = select(flags,absDen-U,U); - hit.V = select(flags,absDen-V,V); -#endif - - /* correct U,V interpolation across the entire grid */ - const vint8 sx((int)subgrid.x()); - const vint8 sy((int)subgrid.y()); - const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0)); - const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1)); - const float inv_resX = rcp((float)((int)g.resX-1)); - const float inv_resY = rcp((float)((int)g.resY-1)); - hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX; - hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY; - if (unlikely(epilog(hit.valid,hit))) - return true; - - } - return false; - } - - __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const - { - return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); - } - - __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const - { - return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); - } - }; - -#endif - - - - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h deleted file mode 100644 index 1cd88aa799..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h +++ /dev/null @@ -1,508 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "subgrid.h" -#include "quad_intersector_moeller.h" -#include "quad_intersector_pluecker.h" - -namespace embree -{ - namespace isa - { - - template - struct SubGridQuadHitPlueckerM - { - __forceinline SubGridQuadHitPlueckerM() {} - - __forceinline SubGridQuadHitPlueckerM(const vbool& valid, - const vfloat& U, - const vfloat& V, - const vfloat& UVW, - const vfloat& t, - const Vec3vf& Ng, - const vbool& flags) : valid(valid), vt(t) - { - const vbool invalid = abs(UVW) < min_rcp_input; - const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); - const vfloat u = min(U * rcpUVW,1.0f); - const vfloat v = min(V * rcpUVW,1.0f); - const vfloat u1 = vfloat(1.0f) - u; - const vfloat v1 = vfloat(1.0f) - v; -#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) - vu = select(flags,u1,u); - vv = select(flags,v1,v); - vNg = Vec3vf(Ng.x,Ng.y,Ng.z); -#else - const vfloat flip = select(flags,vfloat(-1.0f),vfloat(1.0f)); - vv = select(flags,u1,v); - vu = select(flags,v1,u); - vNg = Vec3vf(flip*Ng.x,flip*Ng.y,flip*Ng.z); -#endif - } - - __forceinline void finalize() - { - } - - __forceinline Vec2f uv(const size_t i) - { - const float u = vu[i]; - const float v = vv[i]; - return Vec2f(u,v); - } - - __forceinline float t(const size_t i) { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } - - public: - vbool valid; - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - }; - - template - __forceinline void interpolateUV(SubGridQuadHitPlueckerM &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint &stepX, const vint &stepY) - { - /* correct U,V interpolation across the entire grid */ - const vint sx((int)subgrid.x()); - const vint sy((int)subgrid.y()); - const vint sxM(sx + stepX); - const vint syM(sy + stepY); - const float inv_resX = rcp((float)((int)g.resX-1)); - const float inv_resY = rcp((float)((int)g.resY-1)); - hit.vu = (hit.vu + vfloat(sxM)) * inv_resX; - hit.vv = (hit.vv + vfloat(syM)) * inv_resY; - } - - template - __forceinline static bool intersectPluecker(Ray& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const vbool& flags, - SubGridQuadHitPlueckerM &hit) - { - /* calculate vertices relative to ray origin */ - const Vec3vf O = Vec3vf((Vec3fa)ray.org); - const Vec3vf D = Vec3vf((Vec3fa)ray.dir); - const Vec3vf v0 = tri_v0-O; - const Vec3vf v1 = tri_v1-O; - const Vec3vf v2 = tri_v2-O; - - /* calculate triangle edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v1-v2; - - /* perform edge tests */ - const vfloat U = dot(cross(e0,v2+v0),D); - const vfloat V = dot(cross(e1,v0+v1),D); - const vfloat W = dot(cross(e2,v1+v2),D); - const vfloat UVW = U+V+W; - const vfloat eps = float(ulp)*abs(UVW); -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = max(U,V,W) <= eps; -#else - vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); -#endif - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); - const vfloat den = twice(dot(Ng,D)); - - /* perform depth test */ - const vfloat T = twice(dot(v0,Ng)); - const vfloat t = rcp(den)*T; - valid &= vfloat(ray.tnear()) <= t & t <= vfloat(ray.tfar); - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* update hit information */ - new (&hit) SubGridQuadHitPlueckerM(valid,U,V,UVW,t,Ng,flags); - return true; - } - - template - struct SubGridQuadMIntersector1Pluecker; - - template - struct SubGridQuadMIntersector1Pluecker - { - __forceinline SubGridQuadMIntersector1Pluecker() {} - - __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} - - __forceinline void intersect(RayHit& ray, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const GridMesh::Grid &g, const SubGrid& subgrid) const - { - SubGridQuadHitPlueckerM hit; - Intersect1EpilogMU epilog(ray,context,subgrid.geomID(),subgrid.primID()); - - /* intersect first triangle */ - if (intersectPluecker(ray,v0,v1,v3,vbool(false),hit)) - { - interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); - epilog(hit.valid,hit); - } - - /* intersect second triangle */ - if (intersectPluecker(ray,v2,v3,v1,vbool(true),hit)) - { - interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); - epilog(hit.valid,hit); - } - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, - const GridMesh::Grid &g, const SubGrid& subgrid) const - { - SubGridQuadHitPlueckerM hit; - Occluded1EpilogMU epilog(ray,context,subgrid.geomID(),subgrid.primID()); - - /* intersect first triangle */ - if (intersectPluecker(ray,v0,v1,v3,vbool(false),hit)) - { - interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); - if (epilog(hit.valid,hit)) - return true; - } - - /* intersect second triangle */ - if (intersectPluecker(ray,v2,v3,v1,vbool(true),hit)) - { - interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); - if (epilog(hit.valid,hit)) - return true; - } - - return false; - } - }; - -#if defined (__AVX__) - - /*! Intersects 4 quads with 1 ray using AVX */ - template - struct SubGridQuadMIntersector1Pluecker<4,filter> - { - __forceinline SubGridQuadMIntersector1Pluecker() {} - - __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} - - template - __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const - { - const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); -#if !defined(EMBREE_BACKFACE_CULLING) - const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); - const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); -#else - const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); - const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); -#endif - SubGridQuadHitPlueckerM<8> hit; - const vbool8 flags(0,0,0,0,1,1,1,1); - if (unlikely(intersectPluecker(ray,vtx0,vtx1,vtx2,flags,hit))) - { - /* correct U,V interpolation across the entire grid */ - interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); - if (unlikely(epilog(hit.valid,hit))) - return true; - } - return false; - } - - __forceinline bool intersect(RayHit& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const GridMesh::Grid &g, const SubGrid& subgrid) const - { - return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); - } - - __forceinline bool occluded(Ray& ray, IntersectContext* context, - const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, - const GridMesh::Grid &g, const SubGrid& subgrid) const - { - return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); - } - }; - -#endif - - - /* ----------------------------- */ - /* -- ray packet intersectors -- */ - /* ----------------------------- */ - - template - struct SubGridQuadHitPlueckerK - { - __forceinline SubGridQuadHitPlueckerK(const vfloat& U, - const vfloat& V, - const vfloat& UVW, - const vfloat& t, - const Vec3vf& Ng, - const vbool& flags, - const GridMesh::Grid &g, - const SubGrid& subgrid, - const unsigned int i) - : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {} - - __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const - { - const vbool invalid = abs(UVW) < min_rcp_input; - const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); - const vfloat u0 = min(U * rcpUVW,1.0f); - const vfloat v0 = min(V * rcpUVW,1.0f); - const vfloat u1 = vfloat(1.0f) - u0; - const vfloat v1 = vfloat(1.0f) - v0; - const vfloat uu = select(flags,u1,u0); - const vfloat vv = select(flags,v1,v0); - const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); - const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); - const float inv_resX = rcp((float)(int)(g.resX-1)); - const float inv_resY = rcp((float)(int)(g.resY-1)); - const vfloat u = (uu + (float)(int)sx) * inv_resX; - const vfloat v = (vv + (float)(int)sy) * inv_resY; - const Vec3vf Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); - return std::make_tuple(u,v,t,Ng); - } - - private: - const vfloat U; - const vfloat V; - const vfloat UVW; - const vfloat t; - const vfloat absDen; - const vbool flags; - const Vec3vf tri_Ng; - - const GridMesh::Grid &g; - const SubGrid& subgrid; - const size_t i; - }; - - - template - struct SubGridQuadMIntersectorKPlueckerBase - { - __forceinline SubGridQuadMIntersectorKPlueckerBase(const vbool& valid, const RayK& ray) {} - - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const Vec3vf& tri_Ng, - const vbool& flags, - const GridMesh::Grid &g, - const SubGrid &subgrid, - const unsigned int i, - const Epilog& epilog) const - { - /* calculate denominator */ - /* calculate vertices relative to ray origin */ - vbool valid = valid0; - const Vec3vf O = ray.org; - const Vec3vf D = ray.dir; - const Vec3vf v0 = tri_v0-O; - const Vec3vf v1 = tri_v1-O; - const Vec3vf v2 = tri_v2-O; - - /* calculate triangle edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v1-v2; - - /* perform edge tests */ - const vfloat U = dot(Vec3vf(cross(e0,v2+v0)),D); - const vfloat V = dot(Vec3vf(cross(e1,v0+v1)),D); - const vfloat W = dot(Vec3vf(cross(e2,v1+v2)),D); - const vfloat UVW = U+V+W; - const vfloat eps = float(ulp)*abs(UVW); -#if defined(EMBREE_BACKFACE_CULLING) - valid &= max(U,V,W) <= eps; -#else - valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); -#endif - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); - const vfloat den = twice(dot(Vec3vf(Ng),D)); - - /* perform depth test */ - const vfloat T = twice(dot(v0,Vec3vf(Ng))); - const vfloat t = rcp(den)*T; - valid &= ray.tnear() <= t & t <= ray.tfar; - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* calculate hit information */ - SubGridQuadHitPlueckerK hit(U,V,UVW,t,tri_Ng,flags,g,subgrid,i); - return epilog(valid,hit); - } - - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const vbool& flags, - const GridMesh::Grid &g, - const SubGrid &subgrid, - const unsigned int i, - const Epilog& epilog) const - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - const Vec3vf Ng = cross(e2,e1); - return intersectK(valid0,ray,v0,v1,v2,Ng,flags,g,subgrid,i,epilog); - } - - template - __forceinline bool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Vec3vf& v3, - const GridMesh::Grid &g, - const SubGrid &subgrid, - const unsigned int i, - const Epilog& epilog) const - { - intersectK(valid0,ray,v0,v1,v3,vbool(false),g,subgrid,i,epilog); - if (none(valid0)) return true; - intersectK(valid0,ray,v2,v3,v1,vbool(true ),g,subgrid,i,epilog); - return none(valid0); - } - - static __forceinline bool intersect1(RayK& ray, - size_t k, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const Vec3vf& tri_Ng, - const vbool& flags, - SubGridQuadHitPlueckerM &hit) - { - /* calculate vertices relative to ray origin */ - const Vec3vf O = broadcast>(ray.org,k); - const Vec3vf D = broadcast>(ray.dir,k); - const Vec3vf v0 = tri_v0-O; - const Vec3vf v1 = tri_v1-O; - const Vec3vf v2 = tri_v2-O; - - /* calculate triangle edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v1-v2; - - /* perform edge tests */ - const vfloat U = dot(cross(e0,v2+v0),D); - const vfloat V = dot(cross(e1,v0+v1),D); - const vfloat W = dot(cross(e2,v1+v2),D); - const vfloat UVW = U+V+W; - const vfloat eps = float(ulp)*abs(UVW); -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = max(U,V,W) <= eps ; -#else - vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); -#endif - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); - const vfloat den = twice(dot(Ng,D)); - - /* perform depth test */ - const vfloat T = twice(dot(v0,Ng)); - const vfloat t = rcp(den)*T; - valid &= vfloat(ray.tnear()[k]) <= t & t <= vfloat(ray.tfar[k]); - if (unlikely(none(valid))) return false; - - /* avoid division by 0 */ - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* update hit information */ - new (&hit) SubGridQuadHitPlueckerM(valid,U,V,UVW,t,tri_Ng,flags); - return true; - } - - static __forceinline bool intersect1(RayK& ray, - size_t k, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const vbool& flags, - SubGridQuadHitPlueckerM &hit) - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - const Vec3vf Ng = cross(e2,e1); // FIXME: optimize!!! - return intersect1(ray,k,v0,v1,v2,Ng,flags,hit); - } - - }; - - template - struct SubGridQuadMIntersectorKPluecker : public SubGridQuadMIntersectorKPlueckerBase - { - __forceinline SubGridQuadMIntersectorKPluecker(const vbool& valid, const RayK& ray) - : SubGridQuadMIntersectorKPlueckerBase(valid,ray) {} - - __forceinline void intersect1(RayHitK& ray, size_t k, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const - { - Intersect1KEpilogMU epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); - - SubGridQuadHitPlueckerM<4> hit; - if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,vboolf4(false),hit)) - { - interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); - epilog(hit.valid,hit); - } - - if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,vboolf4(true),hit)) - { - interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); - epilog(hit.valid,hit); - } - - } - - __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, - const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const - { - Occluded1KEpilogMU epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); - - SubGridQuadHitPlueckerM<4> hit; - if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,vboolf4(false),hit)) - { - interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); - if (epilog(hit.valid,hit)) return true; - } - - if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,vboolf4(true),hit)) - { - interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); - if (epilog(hit.valid,hit)) return true; - } - return false; - } - }; - - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h deleted file mode 100644 index 400a88b985..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h +++ /dev/null @@ -1,236 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "subgrid_intersector.h" - -namespace embree -{ - namespace isa - { - template - struct SubGridMBIntersector1Pluecker - { - typedef SubGridMBQBVHN Primitive; - typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(normal.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - float ftime; - const int itime = mesh->timeSegment(ray.time(), ftime); - Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime); - pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(shadow.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - float ftime; - const int itime = mesh->timeSegment(ray.time(), ftime); - - Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime); - return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) - { - return PrimitivePointQuery1::pointQuery(query, context, subgrid); - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - for (size_t i=0;i dist; - const float time = prim[i].adjustTime(ray.time()); - - assert(time <= 1.0f); - size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); -#if defined(__AVX__) - STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); -#endif - while(mask != 0) - { - const size_t ID = bscf(mask); - if (unlikely(dist[ID] > ray.tfar)) continue; - intersect(pre,ray,context,prim[i].subgrid(ID)); - } - } - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - for (size_t i=0;i dist; - size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); - while(mask != 0) - { - const size_t ID = bscf(mask); - if (occluded(pre,ray,context,prim[i].subgrid(ID))) - return true; - } - } - return false; - } - - static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery &tquery, size_t& lazy_node) - { - assert(false && "not implemented"); - return false; - } - }; - - - template - struct SubGridMBIntersectorKPluecker - { - typedef SubGridMBQBVHN Primitive; - typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations; - - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const SubGrid& subgrid) - { - size_t m_valid = movemask(valid_i); - while(m_valid) - { - size_t ID = bscf(m_valid); - intersect(pre,ray,ID,context,subgrid); - } - } - - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const SubGrid& subgrid) - { - vbool valid0 = valid_i; - size_t m_valid = movemask(valid_i); - while(m_valid) - { - size_t ID = bscf(m_valid); - if (occluded(pre,ray,ID,context,subgrid)) - clear(valid0,ID); - } - return !valid0; - } - - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(normal.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - vfloat ftime; - const vint itime = mesh->timeSegment(ray.time(), ftime); - Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]); - pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); - } - - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) - { - STAT3(shadow.trav_prims,1,1,1); - const GridMesh* mesh = context->scene->get(subgrid.geomID()); - const GridMesh::Grid &g = mesh->grid(subgrid.primID()); - - vfloat ftime; - const vint itime = mesh->timeSegment(ray.time(), ftime); - Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]); - return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); - } - - template - static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersectorK isecK; - for (size_t j=0;j time = prim[j].adjustTime(ray.time()); - - vfloat dist; - while(m_valid) - { - const size_t i = bscf(m_valid); - if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue; - intersect(valid,pre,ray,context,prim[j].subgrid(i)); - } - } - } - - template - static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersectorK isecK; - - vbool valid0 = valid; - for (size_t j=0;j time = prim[j].adjustTime(ray.time()); - vfloat dist; - while(m_valid) - { - const size_t i = bscf(m_valid); - if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue; - valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); - if (none(valid0)) break; - } - } - return !valid0; - } - - template - static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - for (size_t i=0;i dist; - const float time = prim[i].adjustTime(ray.time()[k]); - assert(time <= 1.0f); - - size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); - while(mask != 0) - { - const size_t ID = bscf(mask); - if (unlikely(dist[ID] > ray.tfar[k])) continue; - intersect(pre,ray,k,context,prim[i].subgrid(ID)); - } - } - } - - template - static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) - { - BVHNQuantizedBaseNodeIntersector1 isec1; - - for (size_t i=0;i dist; - const float time = prim[i].adjustTime(ray.time()[k]); - assert(time <= 1.0f); - - size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); - while(mask != 0) - { - const size_t ID = bscf(mask); - if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) - return true; - } - } - return false; - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle.h b/thirdparty/embree-aarch64/kernels/geometry/triangle.h deleted file mode 100644 index 0dedf6dc4c..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/triangle.h +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" - -namespace embree -{ - /* Precalculated representation for M triangles. Stores for each - triangle a base vertex, two edges, and the geometry normal to - speed up intersection calculations */ - template - struct TriangleM - { - public: - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* Returns maximum number of stored triangles */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } - - public: - - /* Default constructor */ - __forceinline TriangleM() {} - - /* Construction from vertices and IDs */ - __forceinline TriangleM(const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const vuint& geomIDs, const vuint& primIDs) - : v0(v0), e1(v0-v1), e2(v2-v0), geomIDs(geomIDs), primIDs(primIDs) {} - - /* Returns a mask that tells which triangles are valid */ - __forceinline vbool valid() const { return geomIDs != vuint(-1); } - - /* Returns true if the specified triangle is valid */ - __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } - __forceinline const vuint& geomID() const { return geomIDs; } - __forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } - __forceinline const vuint& primID() const { return primIDs; } - __forceinline unsigned int primID(const size_t i) const { assert(i p0 = v0; - Vec3vf p1 = v0-e1; - Vec3vf p2 = v0+e2; - Vec3vf lower = min(p0,p1,p2); - Vec3vf upper = max(p0,p1,p2); - vbool mask = valid(); - lower.x = select(mask,lower.x,vfloat(pos_inf)); - lower.y = select(mask,lower.y,vfloat(pos_inf)); - lower.z = select(mask,lower.z,vfloat(pos_inf)); - upper.x = select(mask,upper.x,vfloat(neg_inf)); - upper.y = select(mask,upper.y,vfloat(neg_inf)); - upper.z = select(mask,upper.z,vfloat(neg_inf)); - return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), - Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); - } - - /* Non temporal store */ - __forceinline static void store_nt(TriangleM* dst, const TriangleM& src) - { - vfloat::store_nt(&dst->v0.x,src.v0.x); - vfloat::store_nt(&dst->v0.y,src.v0.y); - vfloat::store_nt(&dst->v0.z,src.v0.z); - vfloat::store_nt(&dst->e1.x,src.e1.x); - vfloat::store_nt(&dst->e1.y,src.e1.y); - vfloat::store_nt(&dst->e1.z,src.e1.z); - vfloat::store_nt(&dst->e2.x,src.e2.x); - vfloat::store_nt(&dst->e2.y,src.e2.y); - vfloat::store_nt(&dst->e2.z,src.e2.z); - vuint::store_nt(&dst->geomIDs,src.geomIDs); - vuint::store_nt(&dst->primIDs,src.primIDs); - } - - /* Fill triangle from triangle list */ - __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) - { - vuint vgeomID = -1, vprimID = -1; - Vec3vf v0 = zero, v1 = zero, v2 = zero; - - for (size_t i=0; iget(geomID); - const TriangleMesh::Triangle& tri = mesh->triangle(primID); - const Vec3fa& p0 = mesh->vertex(tri.v[0]); - const Vec3fa& p1 = mesh->vertex(tri.v[1]); - const Vec3fa& p2 = mesh->vertex(tri.v[2]); - vgeomID [i] = geomID; - vprimID [i] = primID; - v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; - v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; - v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; - } - TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID)); - } - - /* Updates the primitive */ - __forceinline BBox3fa update(TriangleMesh* mesh) - { - BBox3fa bounds = empty; - vuint vgeomID = -1, vprimID = -1; - Vec3vf v0 = zero, v1 = zero, v2 = zero; - - for (size_t i=0; itriangle(primId); - const Vec3fa p0 = mesh->vertex(tri.v[0]); - const Vec3fa p1 = mesh->vertex(tri.v[1]); - const Vec3fa p2 = mesh->vertex(tri.v[2]); - bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); - vgeomID [i] = geomId; - vprimID [i] = primId; - v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; - v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; - v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; - } - TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID)); - return bounds; - } - - public: - Vec3vf v0; // base vertex of the triangles - Vec3vf e1; // 1st edge of the triangles (v0-v1) - Vec3vf e2; // 2nd edge of the triangles (v2-v0) - private: - vuint geomIDs; // geometry IDs - vuint primIDs; // primitive IDs - }; - - template - typename TriangleM::Type TriangleM::type; - - typedef TriangleM<4> Triangle4; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h deleted file mode 100644 index 125a42c5fe..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "triangle.h" -#include "triangle_intersector_moeller.h" - -namespace embree -{ - namespace isa - { - /*! Intersects M triangles with 1 ray */ - template - struct TriangleMIntersector1Moeller - { - typedef TriangleM Primitive; - typedef MoellerTrumboreIntersector1 Precalculations; - - /*! Intersect a ray with the M triangles and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleM& tri) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of M triangles. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleM& tri) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - - }; - - /*! Intersects M triangles with K rays. */ - template - struct TriangleMIntersectorKMoeller - { - typedef TriangleM Primitive; - typedef MoellerTrumboreIntersectorK Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleM& tri) - { - STAT_USER(0,TriangleM::max_size()); - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - const Vec3vf p0 = broadcast>(tri.v0,i); - const Vec3vf e1 = broadcast>(tri.e1,i); - const Vec3vf e2 = broadcast>(tri.e2,i); - pre.intersectEdgeK(valid_i,ray,p0,e1,e2,IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleM& tri) - { - vbool valid0 = valid_i; - - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - const Vec3vf p0 = broadcast>(tri.v0,i); - const Vec3vf e1 = broadcast>(tri.e1,i); - const Vec3vf e2 = broadcast>(tri.e2,i); - pre.intersectEdgeK(valid0,ray,p0,e1,e2,OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); - if (none(valid0)) break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleM& tri) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleM& tri) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h deleted file mode 100644 index b5a8519236..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h +++ /dev/null @@ -1,403 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "triangle.h" -#include "intersector_epilog.h" - -/*! This intersector implements a modified version of the Moeller - * Trumbore intersector from the paper "Fast, Minimum Storage - * Ray-Triangle Intersection". In contrast to the paper we - * precalculate some factors and factor the calculations differently - * to allow precalculating the cross product e1 x e2. The resulting - * algorithm is similar to the fastest one of the paper "Optimizing - * Ray-Triangle Intersection via Automated Search". */ - -namespace embree -{ - namespace isa - { - template - struct MoellerTrumboreHitM - { - __forceinline MoellerTrumboreHitM() {} - - __forceinline MoellerTrumboreHitM(const vbool& valid, const vfloat& U, const vfloat& V, const vfloat& T, const vfloat& absDen, const Vec3vf& Ng) - : U(U), V(V), T(T), absDen(absDen), valid(valid), vNg(Ng) {} - - __forceinline void finalize() - { - const vfloat rcpAbsDen = rcp(absDen); - vt = T * rcpAbsDen; - vu = U * rcpAbsDen; - vv = V * rcpAbsDen; - } - - __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } - __forceinline float t (const size_t i) const { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } - - public: - vfloat U; - vfloat V; - vfloat T; - vfloat absDen; - - public: - vbool valid; - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - }; - - template - struct MoellerTrumboreIntersector1 - { - __forceinline MoellerTrumboreIntersector1() {} - - __forceinline MoellerTrumboreIntersector1(const Ray& ray, const void* ptr) {} - - __forceinline bool intersect(const vbool& valid0, - Ray& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Vec3vf& tri_Ng, - MoellerTrumboreHitM& hit) const - { - /* calculate denominator */ - vbool valid = valid0; - const Vec3vf O = Vec3vf((Vec3fa)ray.org); - const Vec3vf D = Vec3vf((Vec3fa)ray.dir); - const Vec3vf C = Vec3vf(tri_v0) - O; - const Vec3vf R = cross(C,D); - const vfloat den = dot(Vec3vf(tri_Ng),D); - - const vfloat absDen = abs(den); - const vfloat sgnDen = signmsk(den); - - /* perform edge tests */ - const vfloat U = dot(R,Vec3vf(tri_e2)) ^ sgnDen; - const vfloat V = dot(R,Vec3vf(tri_e1)) ^ sgnDen; - - /* perform backface culling */ -#if defined(EMBREE_BACKFACE_CULLING) - valid &= (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#else - valid &= (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#endif - if (likely(none(valid))) return false; - - /* perform depth test */ - const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; - valid &= (absDen*vfloat(ray.tnear()) < T) & (T <= absDen*vfloat(ray.tfar)); - if (likely(none(valid))) return false; - - - /* update hit information */ - new (&hit) MoellerTrumboreHitM(valid,U,V,T,absDen,tri_Ng); - - return true; - } - - __forceinline bool intersectEdge(Ray& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - MoellerTrumboreHitM& hit) const - { - vbool valid = true; - const Vec3> tri_Ng = cross(tri_e2,tri_e1); - return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,hit); - } - - __forceinline bool intersect(Ray& ray, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - MoellerTrumboreHitM& hit) const - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - return intersectEdge(ray,v0,e1,e2,hit); - } - - __forceinline bool intersect(const vbool& valid, - Ray& ray, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - MoellerTrumboreHitM& hit) const - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - return intersectEdge(valid,ray,v0,e1,e2,hit); - } - - template - __forceinline bool intersectEdge(Ray& ray, - const Vec3vf& v0, - const Vec3vf& e1, - const Vec3vf& e2, - const Epilog& epilog) const - { - MoellerTrumboreHitM hit; - if (likely(intersectEdge(ray,v0,e1,e2,hit))) return epilog(hit.valid,hit); - return false; - } - - template - __forceinline bool intersect(Ray& ray, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Epilog& epilog) const - { - MoellerTrumboreHitM hit; - if (likely(intersect(ray,v0,v1,v2,hit))) return epilog(hit.valid,hit); - return false; - } - - template - __forceinline bool intersect(const vbool& valid, - Ray& ray, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Epilog& epilog) const - { - MoellerTrumboreHitM hit; - if (likely(intersect(valid,ray,v0,v1,v2,hit))) return epilog(hit.valid,hit); - return false; - } - }; - - template - struct MoellerTrumboreHitK - { - __forceinline MoellerTrumboreHitK(const vfloat& U, const vfloat& V, const vfloat& T, const vfloat& absDen, const Vec3vf& Ng) - : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {} - - __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const - { - const vfloat rcpAbsDen = rcp(absDen); - const vfloat t = T * rcpAbsDen; - const vfloat u = U * rcpAbsDen; - const vfloat v = V * rcpAbsDen; - return std::make_tuple(u,v,t,Ng); - } - - private: - const vfloat U; - const vfloat V; - const vfloat T; - const vfloat absDen; - const Vec3vf Ng; - }; - - template - struct MoellerTrumboreIntersectorK - { - __forceinline MoellerTrumboreIntersectorK(const vbool& valid, const RayK& ray) {} - - /*! Intersects K rays with one of M triangles. */ - template - __forceinline vbool intersectK(const vbool& valid0, - //RayK& ray, - const Vec3vf& ray_org, - const Vec3vf& ray_dir, - const vfloat& ray_tnear, - const vfloat& ray_tfar, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Vec3vf& tri_Ng, - const Epilog& epilog) const - { - /* calculate denominator */ - vbool valid = valid0; - const Vec3vf C = tri_v0 - ray_org; - const Vec3vf R = cross(C,ray_dir); - const vfloat den = dot(tri_Ng,ray_dir); - const vfloat absDen = abs(den); - const vfloat sgnDen = signmsk(den); - - /* test against edge p2 p0 */ - const vfloat U = dot(tri_e2,R) ^ sgnDen; - valid &= U >= 0.0f; - if (likely(none(valid))) return false; - - /* test against edge p0 p1 */ - const vfloat V = dot(tri_e1,R) ^ sgnDen; - valid &= V >= 0.0f; - if (likely(none(valid))) return false; - - /* test against edge p1 p2 */ - const vfloat W = absDen-U-V; - valid &= W >= 0.0f; - if (likely(none(valid))) return false; - - /* perform depth test */ - const vfloat T = dot(tri_Ng,C) ^ sgnDen; - valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar); - if (unlikely(none(valid))) return false; - - /* perform backface culling */ -#if defined(EMBREE_BACKFACE_CULLING) - valid &= den < vfloat(zero); - if (unlikely(none(valid))) return false; -#else - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; -#endif - - /* calculate hit information */ - MoellerTrumboreHitK hit(U,V,T,absDen,tri_Ng); - return epilog(valid,hit); - } - - /*! Intersects K rays with one of M triangles. */ - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const Epilog& epilog) const - { - const Vec3vf e1 = tri_v0-tri_v1; - const Vec3vf e2 = tri_v2-tri_v0; - const Vec3vf Ng = cross(e2,e1); - return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog); - } - - /*! Intersects K rays with one of M triangles. */ - template - __forceinline vbool intersectEdgeK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Epilog& epilog) const - { - const Vec3vf tri_Ng = cross(tri_e2,tri_e1); - return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog); - } - - /*! Intersect k'th ray from ray packet of size K with M triangles. */ - __forceinline bool intersectEdge(RayK& ray, - size_t k, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - MoellerTrumboreHitM& hit) const - { - /* calculate denominator */ - typedef Vec3vf Vec3vfM; - const Vec3vf tri_Ng = cross(tri_e2,tri_e1); - - const Vec3vfM O = broadcast>(ray.org,k); - const Vec3vfM D = broadcast>(ray.dir,k); - const Vec3vfM C = Vec3vfM(tri_v0) - O; - const Vec3vfM R = cross(C,D); - const vfloat den = dot(Vec3vfM(tri_Ng),D); - const vfloat absDen = abs(den); - const vfloat sgnDen = signmsk(den); - - /* perform edge tests */ - const vfloat U = dot(Vec3vf(tri_e2),R) ^ sgnDen; - const vfloat V = dot(Vec3vf(tri_e1),R) ^ sgnDen; - - /* perform backface culling */ -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#else - vbool valid = (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#endif - if (likely(none(valid))) return false; - - /* perform depth test */ - const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; - valid &= (absDen*vfloat(ray.tnear()[k]) < T) & (T <= absDen*vfloat(ray.tfar[k])); - if (likely(none(valid))) return false; - - /* calculate hit information */ - new (&hit) MoellerTrumboreHitM(valid,U,V,T,absDen,tri_Ng); - return true; - } - - __forceinline bool intersectEdge(RayK& ray, - size_t k, - const BBox>& time_range, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - MoellerTrumboreHitM& hit) const - { - if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,hit))) - { - hit.valid &= time_range.lower <= vfloat(ray.time[k]); - hit.valid &= vfloat(ray.time[k]) < time_range.upper; - return any(hit.valid); - } - return false; - } - - template - __forceinline bool intersectEdge(RayK& ray, - size_t k, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Epilog& epilog) const - { - MoellerTrumboreHitM hit; - if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); - return false; - } - - template - __forceinline bool intersectEdge(RayK& ray, - size_t k, - const BBox>& time_range, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Epilog& epilog) const - { - MoellerTrumboreHitM hit; - if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); - return false; - } - - template - __forceinline bool intersect(RayK& ray, - size_t k, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Epilog& epilog) const - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - return intersectEdge(ray,k,v0,e1,e2,epilog); - } - - template - __forceinline bool intersect(RayK& ray, - size_t k, - const BBox>& time_range, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Epilog& epilog) const - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - return intersectEdge(ray,k,time_range,v0,e1,e2,epilog); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h deleted file mode 100644 index f1de99d208..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "triangle.h" -#include "trianglev.h" -#include "trianglev_mb.h" -#include "intersector_epilog.h" - -/*! Modified Pluecker ray/triangle intersector. The test first shifts - * the ray origin into the origin of the coordinate system and then - * uses Pluecker coordinates for the intersection. Due to the shift, - * the Pluecker coordinate calculation simplifies and the tests get - * numerically stable. The edge equations are watertight along the - * edge for neighboring triangles. */ - -namespace embree -{ - namespace isa - { - template - struct PlueckerHitM - { - __forceinline PlueckerHitM(const vfloat& U, const vfloat& V, const vfloat& UVW, const vfloat& t, const Vec3vf& Ng, const UVMapper& mapUV) - : U(U), V(V), UVW(UVW), mapUV(mapUV), vt(t), vNg(Ng) {} - - __forceinline void finalize() - { - const vbool invalid = abs(UVW) < min_rcp_input; - const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); - vu = U * rcpUVW; - vv = V * rcpUVW; - mapUV(vu,vv); - } - - __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } - __forceinline float t (const size_t i) const { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } - - private: - const vfloat U; - const vfloat V; - const vfloat UVW; - const UVMapper& mapUV; - - public: - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - }; - - template - struct PlueckerIntersector1 - { - __forceinline PlueckerIntersector1() {} - - __forceinline PlueckerIntersector1(const Ray& ray, const void* ptr) {} - - template - __forceinline bool intersect(Ray& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const UVMapper& mapUV, - const Epilog& epilog) const - { - /* calculate vertices relative to ray origin */ - const Vec3vf O = Vec3vf((Vec3fa)ray.org); - const Vec3vf D = Vec3vf((Vec3fa)ray.dir); - const Vec3vf v0 = tri_v0-O; - const Vec3vf v1 = tri_v1-O; - const Vec3vf v2 = tri_v2-O; - - /* calculate triangle edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v1-v2; - - /* perform edge tests */ - const vfloat U = dot(cross(e0,v2+v0),D); - const vfloat V = dot(cross(e1,v0+v1),D); - const vfloat W = dot(cross(e2,v1+v2),D); - const vfloat UVW = U+V+W; - const vfloat eps = float(ulp)*abs(UVW); -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = max(U,V,W) <= eps; -#else - vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); -#endif - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); - const vfloat den = twice(dot(Ng,D)); - - /* perform depth test */ - const vfloat T = twice(dot(v0,Ng)); - const vfloat t = rcp(den)*T; - valid &= vfloat(ray.tnear()) <= t & t <= vfloat(ray.tfar); - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* update hit information */ - PlueckerHitM hit(U,V,UVW,t,Ng,mapUV); - return epilog(valid,hit); - } - }; - - template - struct PlueckerHitK - { - __forceinline PlueckerHitK(const vfloat& U, const vfloat& V, const vfloat& UVW, const vfloat& t, const Vec3vf& Ng, const UVMapper& mapUV) - : U(U), V(V), UVW(UVW), t(t), Ng(Ng), mapUV(mapUV) {} - - __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const - { - const vbool invalid = abs(UVW) < min_rcp_input; - const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); - vfloat u = U * rcpUVW; - vfloat v = V * rcpUVW; - mapUV(u,v); - return std::make_tuple(u,v,t,Ng); - } - - private: - const vfloat U; - const vfloat V; - const vfloat UVW; - const vfloat t; - const Vec3vf Ng; - const UVMapper& mapUV; - }; - - template - struct PlueckerIntersectorK - { - __forceinline PlueckerIntersectorK(const vbool& valid, const RayK& ray) {} - - /*! Intersects K rays with one of M triangles. */ - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const UVMapper& mapUV, - const Epilog& epilog) const - { - /* calculate vertices relative to ray origin */ - vbool valid = valid0; - const Vec3vf O = ray.org; - const Vec3vf D = ray.dir; - const Vec3vf v0 = tri_v0-O; - const Vec3vf v1 = tri_v1-O; - const Vec3vf v2 = tri_v2-O; - - /* calculate triangle edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v1-v2; - - /* perform edge tests */ - const vfloat U = dot(Vec3vf(cross(e0,v2+v0)),D); - const vfloat V = dot(Vec3vf(cross(e1,v0+v1)),D); - const vfloat W = dot(Vec3vf(cross(e2,v1+v2)),D); - const vfloat UVW = U+V+W; - const vfloat eps = float(ulp)*abs(UVW); -#if defined(EMBREE_BACKFACE_CULLING) - valid &= max(U,V,W) <= eps; -#else - valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); -#endif - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); - const vfloat den = twice(dot(Vec3vf(Ng),D)); - - /* perform depth test */ - const vfloat T = twice(dot(v0,Vec3vf(Ng))); - const vfloat t = rcp(den)*T; - valid &= ray.tnear() <= t & t <= ray.tfar; - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* calculate hit information */ - PlueckerHitK hit(U,V,UVW,t,Ng,mapUV); - return epilog(valid,hit); - } - - /*! Intersect k'th ray from ray packet of size K with M triangles. */ - template - __forceinline bool intersect(RayK& ray, size_t k, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const UVMapper& mapUV, - const Epilog& epilog) const - { - /* calculate vertices relative to ray origin */ - const Vec3vf O = broadcast>(ray.org,k); - const Vec3vf D = broadcast>(ray.dir,k); - const Vec3vf v0 = tri_v0-O; - const Vec3vf v1 = tri_v1-O; - const Vec3vf v2 = tri_v2-O; - - /* calculate triangle edges */ - const Vec3vf e0 = v2-v0; - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v1-v2; - - /* perform edge tests */ - const vfloat U = dot(cross(e0,v2+v0),D); - const vfloat V = dot(cross(e1,v0+v1),D); - const vfloat W = dot(cross(e2,v1+v2),D); - const vfloat UVW = U+V+W; - const vfloat eps = float(ulp)*abs(UVW); -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = max(U,V,W) <= eps; -#else - vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); -#endif - if (unlikely(none(valid))) return false; - - /* calculate geometry normal and denominator */ - const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); - const vfloat den = twice(dot(Ng,D)); - - /* perform depth test */ - const vfloat T = twice(dot(v0,Ng)); - const vfloat t = rcp(den)*T; - valid &= vfloat(ray.tnear()[k]) <= t & t <= vfloat(ray.tfar[k]); - if (unlikely(none(valid))) return false; - - /* avoid division by 0 */ - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; - - /* update hit information */ - PlueckerHitM hit(U,V,UVW,t,Ng,mapUV); - return epilog(valid,hit); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h deleted file mode 100644 index 63e649d8fb..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h +++ /dev/null @@ -1,418 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "triangle.h" -#include "intersector_epilog.h" - -/*! This intersector implements a modified version of the Woop's ray-triangle intersection test */ - -namespace embree -{ - namespace isa - { - template - struct WoopHitM - { - __forceinline WoopHitM() {} - - __forceinline WoopHitM(const vbool& valid, - const vfloat& U, - const vfloat& V, - const vfloat& T, - const vfloat& inv_det, - const Vec3vf& Ng) - : U(U), V(V), T(T), inv_det(inv_det), valid(valid), vNg(Ng) {} - - __forceinline void finalize() - { - vt = T; - vu = U*inv_det; - vv = V*inv_det; - } - - __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } - __forceinline float t (const size_t i) const { return vt[i]; } - __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } - - private: - const vfloat U; - const vfloat V; - const vfloat T; - const vfloat inv_det; - - public: - const vbool valid; - vfloat vu; - vfloat vv; - vfloat vt; - Vec3vf vNg; - }; - - template - struct WoopPrecalculations1 - { - unsigned int kx,ky,kz; - Vec3vf org; - Vec3fa S; - __forceinline WoopPrecalculations1() {} - - __forceinline WoopPrecalculations1(const Ray& ray, const void* ptr) - { - kz = maxDim(abs(ray.dir)); - kx = (kz+1) % 3; - ky = (kx+1) % 3; - const float inv_dir_kz = rcp(ray.dir[kz]); - if (ray.dir[kz]) std::swap(kx,ky); - S.x = ray.dir[kx] * inv_dir_kz; - S.y = ray.dir[ky] * inv_dir_kz; - S.z = inv_dir_kz; - org = Vec3vf(ray.org[kx],ray.org[ky],ray.org[kz]); - } - }; - - - template - struct WoopIntersector1 - { - - typedef WoopPrecalculations1 Precalculations; - - __forceinline WoopIntersector1() {} - - __forceinline WoopIntersector1(const Ray& ray, const void* ptr) {} - - static __forceinline bool intersect(const vbool& valid0, - Ray& ray, - const Precalculations& pre, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - WoopHitM& hit) - { - vbool valid = valid0; - - /* vertices relative to ray origin */ - const Vec3vf org = Vec3vf(pre.org.x,pre.org.y,pre.org.z); - const Vec3vf A = Vec3vf(tri_v0[pre.kx],tri_v0[pre.ky],tri_v0[pre.kz]) - org; - const Vec3vf B = Vec3vf(tri_v1[pre.kx],tri_v1[pre.ky],tri_v1[pre.kz]) - org; - const Vec3vf C = Vec3vf(tri_v2[pre.kx],tri_v2[pre.ky],tri_v2[pre.kz]) - org; - - /* shear and scale vertices */ - const vfloat Ax = nmadd(A.z,pre.S.x,A.x); - const vfloat Ay = nmadd(A.z,pre.S.y,A.y); - const vfloat Bx = nmadd(B.z,pre.S.x,B.x); - const vfloat By = nmadd(B.z,pre.S.y,B.y); - const vfloat Cx = nmadd(C.z,pre.S.x,C.x); - const vfloat Cy = nmadd(C.z,pre.S.y,C.y); - - /* scaled barycentric */ - const vfloat U0 = Cx*By; - const vfloat U1 = Cy*Bx; - const vfloat V0 = Ax*Cy; - const vfloat V1 = Ay*Cx; - const vfloat W0 = Bx*Ay; - const vfloat W1 = By*Ax; -#if !defined(__AVX512F__) - valid &= (U0 >= U1) & (V0 >= V1) & (W0 >= W1) | - (U0 <= U1) & (V0 <= V1) & (W0 <= W1); -#else - valid &= ge(ge(U0 >= U1,V0,V1),W0,W1) | le(le(U0 <= U1,V0,V1),W0,W1); -#endif - - if (likely(none(valid))) return false; - const vfloat U = U0-U1; - const vfloat V = V0-V1; - const vfloat W = W0-W1; - - const vfloat det = U+V+W; - - valid &= det != 0.0f; - const vfloat inv_det = rcp(det); - - const vfloat Az = pre.S.z * A.z; - const vfloat Bz = pre.S.z * B.z; - const vfloat Cz = pre.S.z * C.z; - const vfloat T = madd(U,Az,madd(V,Bz,W*Cz)); - const vfloat t = T * inv_det; - /* perform depth test */ - valid &= (vfloat(ray.tnear()) < t) & (t <= vfloat(ray.tfar)); - if (likely(none(valid))) return false; - - const Vec3vf tri_Ng = cross(tri_v2-tri_v0,tri_v0-tri_v1); - - /* update hit information */ - new (&hit) WoopHitM(valid,U,V,t,inv_det,tri_Ng); - return true; - } - - static __forceinline bool intersect(Ray& ray, - const Precalculations& pre, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - WoopHitM& hit) - { - vbool valid = true; - return intersect(valid,ray,pre,v0,v1,v2,hit); - } - - - template - static __forceinline bool intersect(Ray& ray, - const Precalculations& pre, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Epilog& epilog) - { - WoopHitM hit; - if (likely(intersect(ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); - return false; - } - - template - static __forceinline bool intersect(const vbool& valid, - Ray& ray, - const Precalculations& pre, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Epilog& epilog) - { - WoopHitM hit; - if (likely(intersect(valid,ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); - return false; - } - }; - -#if 0 - template - struct WoopHitK - { - __forceinline WoopHitK(const vfloat& U, const vfloat& V, const vfloat& T, const vfloat& absDen, const Vec3vf& Ng) - : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {} - - __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const - { - const vfloat rcpAbsDen = rcp(absDen); - const vfloat t = T * rcpAbsDen; - const vfloat u = U * rcpAbsDen; - const vfloat v = V * rcpAbsDen; - return std::make_tuple(u,v,t,Ng); - } - - private: - const vfloat U; - const vfloat V; - const vfloat T; - const vfloat absDen; - const Vec3vf Ng; - }; - - template - struct WoopIntersectorK - { - __forceinline WoopIntersectorK(const vbool& valid, const RayK& ray) {} - - /*! Intersects K rays with one of M triangles. */ - template - __forceinline vbool intersectK(const vbool& valid0, - //RayK& ray, - const Vec3vf& ray_org, - const Vec3vf& ray_dir, - const vfloat& ray_tnear, - const vfloat& ray_tfar, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Vec3vf& tri_Ng, - const Epilog& epilog) const - { - /* calculate denominator */ - vbool valid = valid0; - const Vec3vf C = tri_v0 - ray_org; - const Vec3vf R = cross(C,ray_dir); - const vfloat den = dot(tri_Ng,ray_dir); - const vfloat absDen = abs(den); - const vfloat sgnDen = signmsk(den); - - /* test against edge p2 p0 */ - const vfloat U = dot(tri_e2,R) ^ sgnDen; - valid &= U >= 0.0f; - if (likely(none(valid))) return false; - - /* test against edge p0 p1 */ - const vfloat V = dot(tri_e1,R) ^ sgnDen; - valid &= V >= 0.0f; - if (likely(none(valid))) return false; - - /* test against edge p1 p2 */ - const vfloat W = absDen-U-V; - valid &= W >= 0.0f; - if (likely(none(valid))) return false; - - /* perform depth test */ - const vfloat T = dot(tri_Ng,C) ^ sgnDen; - valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar); - if (unlikely(none(valid))) return false; - - /* perform backface culling */ -#if defined(EMBREE_BACKFACE_CULLING) - valid &= den < vfloat(zero); - if (unlikely(none(valid))) return false; -#else - valid &= den != vfloat(zero); - if (unlikely(none(valid))) return false; -#endif - - /* calculate hit information */ - WoopHitK hit(U,V,T,absDen,tri_Ng); - return epilog(valid,hit); - } - - /*! Intersects K rays with one of M triangles. */ - template - __forceinline vbool intersectK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_v1, - const Vec3vf& tri_v2, - const Epilog& epilog) const - { - const Vec3vf e1 = tri_v0-tri_v1; - const Vec3vf e2 = tri_v2-tri_v0; - const Vec3vf Ng = cross(e2,e1); - return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog); - } - - /*! Intersects K rays with one of M triangles. */ - template - __forceinline vbool intersectEdgeK(const vbool& valid0, - RayK& ray, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Epilog& epilog) const - { - const Vec3vf tri_Ng = cross(tri_e2,tri_e1); - return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog); - } - - /*! Intersect k'th ray from ray packet of size K with M triangles. */ - __forceinline bool intersectEdge(RayK& ray, - size_t k, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - WoopHitM& hit) const - { - /* calculate denominator */ - typedef Vec3vf Vec3vfM; - const Vec3vf tri_Ng = cross(tri_e2,tri_e1); - - const Vec3vfM O = broadcast>(ray.org,k); - const Vec3vfM D = broadcast>(ray.dir,k); - const Vec3vfM C = Vec3vfM(tri_v0) - O; - const Vec3vfM R = cross(C,D); - const vfloat den = dot(Vec3vfM(tri_Ng),D); - const vfloat absDen = abs(den); - const vfloat sgnDen = signmsk(den); - - /* perform edge tests */ - const vfloat U = dot(Vec3vf(tri_e2),R) ^ sgnDen; - const vfloat V = dot(Vec3vf(tri_e1),R) ^ sgnDen; - - /* perform backface culling */ -#if defined(EMBREE_BACKFACE_CULLING) - vbool valid = (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#else - vbool valid = (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); -#endif - if (likely(none(valid))) return false; - - /* perform depth test */ - const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; - valid &= (absDen*vfloat(ray.tnear()[k]) < T) & (T <= absDen*vfloat(ray.tfar[k])); - if (likely(none(valid))) return false; - - /* calculate hit information */ - new (&hit) WoopHitM(valid,U,V,T,absDen,tri_Ng); - return true; - } - - __forceinline bool intersectEdge(RayK& ray, - size_t k, - const BBox>& time_range, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - WoopHitM& hit) const - { - if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,hit))) - { - hit.valid &= time_range.lower <= vfloat(ray.time[k]); - hit.valid &= vfloat(ray.time[k]) < time_range.upper; - return any(hit.valid); - } - return false; - } - - template - __forceinline bool intersectEdge(RayK& ray, - size_t k, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Epilog& epilog) const - { - WoopHitM hit; - if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); - return false; - } - - template - __forceinline bool intersectEdge(RayK& ray, - size_t k, - const BBox>& time_range, - const Vec3vf& tri_v0, - const Vec3vf& tri_e1, - const Vec3vf& tri_e2, - const Epilog& epilog) const - { - WoopHitM hit; - if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); - return false; - } - - template - __forceinline bool intersect(RayK& ray, - size_t k, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Epilog& epilog) const - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - return intersectEdge(ray,k,v0,e1,e2,epilog); - } - - template - __forceinline bool intersect(RayK& ray, - size_t k, - const BBox>& time_range, - const Vec3vf& v0, - const Vec3vf& v1, - const Vec3vf& v2, - const Epilog& epilog) const - { - const Vec3vf e1 = v0-v1; - const Vec3vf e2 = v2-v0; - return intersectEdge(ray,k,time_range,v0,e1,e2,epilog); - } - }; -#endif - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h deleted file mode 100644 index 91b35c36f3..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "primitive.h" - -namespace embree -{ - namespace isa - { - struct TriangleTriangleIntersector - { - __forceinline static float T(float pa0, float pa1, float da0, float da1) { - return pa0 + (pa1-pa0)*da0/(da0-da1); - } - - __forceinline static bool point_line_side(const Vec2f& p, const Vec2f& a0, const Vec2f& a1) { - return det(p-a0,a0-a1) >= 0.0f; - } - - __forceinline static bool point_inside_triangle(const Vec2f& p, const Vec2f& a, const Vec2f& b, const Vec2f& c) - { - const bool pab = point_line_side(p,a,b); - const bool pbc = point_line_side(p,b,c); - const bool pca = point_line_side(p,c,a); - return pab == pbc && pab == pca; - } - - __forceinline static bool intersect_line_line(const Vec2f& a0, const Vec2f& a1, const Vec2f& b0, const Vec2f& b1) - { - const bool different_sides0 = point_line_side(b0,a0,a1) != point_line_side(b1,a0,a1); - const bool different_sides1 = point_line_side(a0,b0,b1) != point_line_side(a1,b0,b1); - return different_sides0 && different_sides1; - } - - __forceinline static bool intersect_triangle_triangle (const Vec2f& a0, const Vec2f& a1, const Vec2f& a2, - const Vec2f& b0, const Vec2f& b1, const Vec2f& b2) - { - const bool a01_b01 = intersect_line_line(a0,a1,b0,b1); - if (a01_b01) return true; - const bool a01_b12 = intersect_line_line(a0,a1,b1,b2); - if (a01_b12) return true; - const bool a01_b20 = intersect_line_line(a0,a1,b2,b0); - if (a01_b20) return true; - const bool a12_b01 = intersect_line_line(a1,a2,b0,b1); - if (a12_b01) return true; - const bool a12_b12 = intersect_line_line(a1,a2,b1,b2); - if (a12_b12) return true; - const bool a12_b20 = intersect_line_line(a1,a2,b2,b0); - if (a12_b20) return true; - const bool a20_b01 = intersect_line_line(a2,a0,b0,b1); - if (a20_b01) return true; - const bool a20_b12 = intersect_line_line(a2,a0,b1,b2); - if (a20_b12) return true; - const bool a20_b20 = intersect_line_line(a2,a0,b2,b0); - if (a20_b20) return true; - - bool a_in_b = point_inside_triangle(a0,b0,b1,b2) && point_inside_triangle(a1,b0,b1,b2) && point_inside_triangle(a2,b0,b1,b2); - if (a_in_b) return true; - - bool b_in_a = point_inside_triangle(b0,a0,a1,a2) && point_inside_triangle(b1,a0,a1,a2) && point_inside_triangle(b2,a0,a1,a2); - if (b_in_a) return true; - - return false; - } - - static bool intersect_triangle_triangle (const Vec3fa& a0, const Vec3fa& a1, const Vec3fa& a2, - const Vec3fa& b0, const Vec3fa& b1, const Vec3fa& b2) - { - const float eps = 1E-5f; - - /* calculate triangle planes */ - const Vec3fa Na = cross(a1-a0,a2-a0); - const float Ca = dot(Na,a0); - const Vec3fa Nb = cross(b1-b0,b2-b0); - const float Cb = dot(Nb,b0); - - /* project triangle A onto plane B */ - const float da0 = dot(Nb,a0)-Cb; - const float da1 = dot(Nb,a1)-Cb; - const float da2 = dot(Nb,a2)-Cb; - if (max(da0,da1,da2) < -eps) return false; - if (min(da0,da1,da2) > +eps) return false; - //CSTAT(bvh_collide_prim_intersections4++); - - /* project triangle B onto plane A */ - const float db0 = dot(Na,b0)-Ca; - const float db1 = dot(Na,b1)-Ca; - const float db2 = dot(Na,b2)-Ca; - if (max(db0,db1,db2) < -eps) return false; - if (min(db0,db1,db2) > +eps) return false; - //CSTAT(bvh_collide_prim_intersections5++); - - if (unlikely((std::fabs(da0) < eps && std::fabs(da1) < eps && std::fabs(da2) < eps) || - (std::fabs(db0) < eps && std::fabs(db1) < eps && std::fabs(db2) < eps))) - { - const size_t dz = maxDim(Na); - const size_t dx = (dz+1)%3; - const size_t dy = (dx+1)%3; - const Vec2f A0(a0[dx],a0[dy]); - const Vec2f A1(a1[dx],a1[dy]); - const Vec2f A2(a2[dx],a2[dy]); - const Vec2f B0(b0[dx],b0[dy]); - const Vec2f B1(b1[dx],b1[dy]); - const Vec2f B2(b2[dx],b2[dy]); - return intersect_triangle_triangle(A0,A1,A2,B0,B1,B2); - } - - const Vec3fa D = cross(Na,Nb); - const float pa0 = dot(D,a0); - const float pa1 = dot(D,a1); - const float pa2 = dot(D,a2); - const float pb0 = dot(D,b0); - const float pb1 = dot(D,b1); - const float pb2 = dot(D,b2); - - BBox1f ba = empty; - if (min(da0,da1) <= 0.0f && max(da0,da1) >= 0.0f && abs(da0-da1) > 0.0f) ba.extend(T(pa0,pa1,da0,da1)); - if (min(da1,da2) <= 0.0f && max(da1,da2) >= 0.0f && abs(da1-da2) > 0.0f) ba.extend(T(pa1,pa2,da1,da2)); - if (min(da2,da0) <= 0.0f && max(da2,da0) >= 0.0f && abs(da2-da0) > 0.0f) ba.extend(T(pa2,pa0,da2,da0)); - - BBox1f bb = empty; - if (min(db0,db1) <= 0.0f && max(db0,db1) >= 0.0f && abs(db0-db1) > 0.0f) bb.extend(T(pb0,pb1,db0,db1)); - if (min(db1,db2) <= 0.0f && max(db1,db2) >= 0.0f && abs(db1-db2) > 0.0f) bb.extend(T(pb1,pb2,db1,db2)); - if (min(db2,db0) <= 0.0f && max(db2,db0) >= 0.0f && abs(db2-db0) > 0.0f) bb.extend(T(pb2,pb0,db2,db0)); - - return conjoint(ba,bb); - } - }; - } -} - - diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei.h deleted file mode 100644 index 4f3118cc0c..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/trianglei.h +++ /dev/null @@ -1,442 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" -#include "../common/scene.h" - -namespace embree -{ - /* Stores M triangles from an indexed face set */ - template - struct TriangleMi - { - /* Virtual interface to query information about the triangle type */ - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* primitive supports multiple time segments */ - static const bool singleTimeSegment = false; - - /* Returns maximum number of stored triangles */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } - - public: - - /* Default constructor */ - __forceinline TriangleMi() { } - - /* Construction from vertices and IDs */ - __forceinline TriangleMi(const vuint& v0, - const vuint& v1, - const vuint& v2, - const vuint& geomIDs, - const vuint& primIDs) -#if defined(EMBREE_COMPACT_POLYS) - : geomIDs(geomIDs), primIDs(primIDs) {} -#else - : v0_(v0), v1_(v1), v2_(v2), geomIDs(geomIDs), primIDs(primIDs) {} -#endif - - /* Returns a mask that tells which triangles are valid */ - __forceinline vbool valid() const { return primIDs != vuint(-1); } - - /* Returns if the specified triangle is valid */ - __forceinline bool valid(const size_t i) const { assert(i geomID() const { return geomIDs; } - __forceinline unsigned int geomID(const size_t i) const { assert(i primID() const { return primIDs; } - __forceinline unsigned int primID(const size_t i) const { assert(iget(geomID(i)); - bounds.extend(mesh->bounds(primID(i),itime)); - } - return bounds; - } - - /* Calculate the linear bounds of the primitive */ - __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime) { - return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); - } - - __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) - { - LBBox3fa allBounds = empty; - for (size_t i=0; iget(geomID(i)); - allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps)); - } - return allBounds; - } - - __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) - { - LBBox3fa allBounds = empty; - for (size_t i=0; iget(geomID(i)); - allBounds.extend(mesh->linearBounds(primID(i), time_range)); - } - return allBounds; - } - - /* Non-temporal store */ - __forceinline static void store_nt(TriangleMi* dst, const TriangleMi& src) - { -#if !defined(EMBREE_COMPACT_POLYS) - vuint::store_nt(&dst->v0_,src.v0_); - vuint::store_nt(&dst->v1_,src.v1_); - vuint::store_nt(&dst->v2_,src.v2_); -#endif - vuint::store_nt(&dst->geomIDs,src.geomIDs); - vuint::store_nt(&dst->primIDs,src.primIDs); - } - - /* Fill triangle from triangle list */ - template - __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) - { - vuint v0 = zero, v1 = zero, v2 = zero; - vuint geomID = -1, primID = -1; - const PrimRefT* prim = &prims[begin]; - - for (size_t i=0; igeomID(); - primID[i] = prim->primID(); -#if !defined(EMBREE_COMPACT_POLYS) - const TriangleMesh* mesh = scene->get(prim->geomID()); - const TriangleMesh::Triangle& tri = mesh->triangle(prim->primID()); - unsigned int int_stride = mesh->vertices0.getStride()/4; - v0[i] = tri.v[0] * int_stride; - v1[i] = tri.v[1] * int_stride; - v2[i] = tri.v[2] * int_stride; -#endif - begin++; - } else { - assert(i); - if (likely(i > 0)) { - geomID[i] = geomID[0]; - primID[i] = -1; - v0[i] = v0[0]; - v1[i] = v0[0]; - v2[i] = v0[0]; - } - } - if (begintriangle(primId); - const Vec3fa p0 = mesh->vertex(tri.v[0]); - const Vec3fa p1 = mesh->vertex(tri.v[1]); - const Vec3fa p2 = mesh->vertex(tri.v[2]); - bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); - } - return bounds; - } - - protected: -#if !defined(EMBREE_COMPACT_POLYS) - vuint v0_; // 4 byte offset of 1st vertex - vuint v1_; // 4 byte offset of 2nd vertex - vuint v2_; // 4 byte offset of 3rd vertex -#endif - vuint geomIDs; // geometry ID of mesh - vuint primIDs; // primitive ID of primitive inside mesh - }; - - namespace isa - { - - template - struct TriangleMi : public embree::TriangleMi - { -#if !defined(EMBREE_COMPACT_POLYS) - using embree::TriangleMi::v0_; - using embree::TriangleMi::v1_; - using embree::TriangleMi::v2_; -#endif - using embree::TriangleMi::geomIDs; - using embree::TriangleMi::primIDs; - using embree::TriangleMi::geomID; - using embree::TriangleMi::primID; - using embree::TriangleMi::valid; - - /* loads a single vertex */ - template - __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const - { -#if defined(EMBREE_COMPACT_POLYS) - const TriangleMesh* mesh = scene->get(geomID(index)); - const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); - return (Vec3f) mesh->vertices[0][tri.v[vid]]; -#else - const vuint& v = getVertexOffset(); - const float* vertices = scene->vertices[geomID(index)]; - return (Vec3f&) vertices[v[index]]; -#endif - } - - template - __forceinline Vec3 getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const - { -#if defined(EMBREE_COMPACT_POLYS) - const TriangleMesh* mesh = scene->get(geomID(index)); - const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); - const Vec3fa v0 = mesh->vertices[itime+0][tri.v[vid]]; - const Vec3fa v1 = mesh->vertices[itime+1][tri.v[vid]]; -#else - const vuint& v = getVertexOffset(); - const TriangleMesh* mesh = scene->get(geomID(index)); - const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0); - const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1); - const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); - const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); -#endif - const Vec3 p0(v0.x,v0.y,v0.z); - const Vec3 p1(v1.x,v1.y,v1.z); - return lerp(p0,p1,ftime); - } - - template - __forceinline Vec3 getVertex(const vbool& valid, const size_t index, const Scene *const scene, const vint& itime, const T& ftime) const - { - Vec3 p0, p1; - const TriangleMesh* mesh = scene->get(geomID(index)); - - for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) - { -#if defined(EMBREE_COMPACT_POLYS) - const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); - const Vec3fa v0 = mesh->vertices[itime[i]+0][tri.v[vid]]; - const Vec3fa v1 = mesh->vertices[itime[i]+1][tri.v[vid]]; -#else - const vuint& v = getVertexOffset(); - const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0); - const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1); - const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); - const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); -#endif - p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; - p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; - } - return (T(one)-ftime)*p0 + ftime*p1; - } - - struct Triangle { - vfloat4 v0,v1,v2; - }; - -#if defined(EMBREE_COMPACT_POLYS) - - __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const - { - const unsigned int geomID = geomIDs[i]; - const unsigned int primID = primIDs[i]; - if (unlikely(primID == -1)) return { zero, zero, zero }; - const TriangleMesh* mesh = scene->get(geomID); - const TriangleMesh::Triangle& tri = mesh->triangle(primID); - const vfloat4 v0 = (vfloat4) mesh->vertices0[tri.v[0]]; - const vfloat4 v1 = (vfloat4) mesh->vertices0[tri.v[1]]; - const vfloat4 v2 = (vfloat4) mesh->vertices0[tri.v[2]]; - return { v0, v1, v2 }; - } - - __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const - { - const unsigned int primID = primIDs[i]; - if (unlikely(primID == -1)) return { zero, zero, zero }; - const TriangleMesh::Triangle& tri = mesh->triangle(primID); - const vfloat4 v0 = (vfloat4) mesh->vertices[itime][tri.v[0]]; - const vfloat4 v1 = (vfloat4) mesh->vertices[itime][tri.v[1]]; - const vfloat4 v2 = (vfloat4) mesh->vertices[itime][tri.v[2]]; - return { v0, v1, v2 }; - } - -#else - - __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const - { - const float* vertices = scene->vertices[geomID(i)]; - const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); - const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); - const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); - return { v0, v1, v2 }; - } - - __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const - { - const float* vertices = (const float*) mesh->vertexPtr(0,itime); - const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); - const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); - const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); - return { v0, v1, v2 }; - } - -#endif - - /* Gather the triangles */ - __forceinline void gather(Vec3vf& p0, Vec3vf& p1, Vec3vf& p2, const Scene* const scene) const; - - template -#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019 - __noinline -#else - __forceinline -#endif - void gather(const vbool& valid, - Vec3vf& p0, - Vec3vf& p1, - Vec3vf& p2, - const size_t index, - const Scene* const scene, - const vfloat& time) const - { - const TriangleMesh* mesh = scene->get(geomID(index)); - - vfloat ftime; - const vint itime = mesh->timeSegment(time, ftime); - - const size_t first = bsf(movemask(valid)); - if (likely(all(valid,itime[first] == itime))) - { - p0 = getVertex<0>(index, scene, itime[first], ftime); - p1 = getVertex<1>(index, scene, itime[first], ftime); - p2 = getVertex<2>(index, scene, itime[first], ftime); - } else { - p0 = getVertex<0>(valid, index, scene, itime, ftime); - p1 = getVertex<1>(valid, index, scene, itime, ftime); - p2 = getVertex<2>(valid, index, scene, itime, ftime); - } - } - - __forceinline void gather(Vec3vf& p0, - Vec3vf& p1, - Vec3vf& p2, - const TriangleMesh* mesh, - const Scene *const scene, - const int itime) const; - - __forceinline void gather(Vec3vf& p0, - Vec3vf& p1, - Vec3vf& p2, - const Scene *const scene, - const float time) const; - - -#if !defined(EMBREE_COMPACT_POLYS) - template const vuint& getVertexOffset() const; -#endif - }; - -#if !defined(EMBREE_COMPACT_POLYS) - template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<0>() const { return v0_; } - template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<1>() const { return v1_; } - template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<2>() const { return v2_; } -#endif - - template<> - __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - const Scene* const scene) const - { - const Triangle tri0 = loadTriangle(0,scene); - const Triangle tri1 = loadTriangle(1,scene); - const Triangle tri2 = loadTriangle(2,scene); - const Triangle tri3 = loadTriangle(3,scene); - transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); - transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); - transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); - } - - template<> - __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - const TriangleMesh* mesh, - const Scene *const scene, - const int itime) const - { - const Triangle tri0 = loadTriangle(0,itime,mesh); - const Triangle tri1 = loadTriangle(1,itime,mesh); - const Triangle tri2 = loadTriangle(2,itime,mesh); - const Triangle tri3 = loadTriangle(3,itime,mesh); - transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); - transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); - transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); - } - - template<> - __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, - Vec3vf4& p1, - Vec3vf4& p2, - const Scene *const scene, - const float time) const - { - const TriangleMesh* mesh = scene->get(geomID(0)); // in mblur mode all geometries are identical - - float ftime; - const int itime = mesh->timeSegment(time, ftime); - - Vec3vf4 a0,a1,a2; gather(a0,a1,a2,mesh,scene,itime); - Vec3vf4 b0,b1,b2; gather(b0,b1,b2,mesh,scene,itime+1); - p0 = lerp(a0,b0,vfloat4(ftime)); - p1 = lerp(a1,b1,vfloat4(ftime)); - p2 = lerp(a2,b2,vfloat4(ftime)); - } - } - - template - typename TriangleMi::Type TriangleMi::type; - - typedef TriangleMi<4> Triangle4i; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h deleted file mode 100644 index e2f106a62c..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h +++ /dev/null @@ -1,336 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "trianglei.h" -#include "triangle_intersector_moeller.h" -#include "triangle_intersector_pluecker.h" - -namespace embree -{ - namespace isa - { - /*! Intersects M triangles with 1 ray */ - template - struct TriangleMiIntersector1Moeller - { - typedef TriangleMi Primitive; - typedef MoellerTrumboreIntersector1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); - pre.intersect(ray,v0,v1,v2,/*UVIdentity(),*/Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); - return pre.intersect(ray,v0,v1,v2,/*UVIdentity(),*/Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - }; - - /*! Intersects M triangles with K rays */ - template - struct TriangleMiIntersectorKMoeller - { - typedef TriangleMi Primitive; - typedef MoellerTrumboreIntersectorK Precalculations; - - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& tri) - { - const Scene* scene = context->scene; - for (size_t i=0; i::size()); - const Vec3vf v0 = tri.template getVertex<0>(i,scene); - const Vec3vf v1 = tri.template getVertex<1>(i,scene); - const Vec3vf v2 = tri.template getVertex<2>(i,scene); - pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity(),*/IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); - } - } - - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& tri) - { - vbool valid0 = valid_i; - const Scene* scene = context->scene; - - for (size_t i=0; i::size()); - const Vec3vf v0 = tri.template getVertex<0>(i,scene); - const Vec3vf v1 = tri.template getVertex<1>(i,scene); - const Vec3vf v2 = tri.template getVertex<2>(i,scene); - pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity(),*/OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); - if (none(valid0)) break; - } - return !valid0; - } - - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); - pre.intersect(ray,k,v0,v1,v2,/*UVIdentity(),*/Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); - return pre.intersect(ray,k,v0,v1,v2,/*UVIdentity(),*/Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - }; - - /*! Intersects M triangles with 1 ray */ - template - struct TriangleMiIntersector1Pluecker - { - typedef TriangleMi Primitive; - typedef PlueckerIntersector1 Precalculations; - - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); - pre.intersect(ray,v0,v1,v2,UVIdentity(),Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); - return pre.intersect(ray,v0,v1,v2,UVIdentity(),Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - }; - - /*! Intersects M triangles with K rays */ - template - struct TriangleMiIntersectorKPluecker - { - typedef TriangleMi Primitive; - typedef PlueckerIntersectorK Precalculations; - - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& tri) - { - const Scene* scene = context->scene; - for (size_t i=0; i::size()); - const Vec3vf v0 = tri.template getVertex<0>(i,scene); - const Vec3vf v1 = tri.template getVertex<1>(i,scene); - const Vec3vf v2 = tri.template getVertex<2>(i,scene); - pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity(),IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); - } - } - - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& tri) - { - vbool valid0 = valid_i; - const Scene* scene = context->scene; - - for (size_t i=0; i::size()); - const Vec3vf v0 = tri.template getVertex<0>(i,scene); - const Vec3vf v1 = tri.template getVertex<1>(i,scene); - const Vec3vf v2 = tri.template getVertex<2>(i,scene); - pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity(),OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); - if (none(valid0)) break; - } - return !valid0; - } - - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); - pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); - return pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - }; - - /*! Intersects M motion blur triangles with 1 ray */ - template - struct TriangleMiMBIntersector1Moeller - { - typedef TriangleMi Primitive; - typedef MoellerTrumboreIntersector1 Precalculations; - - /*! Intersect a ray with the M triangles and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); - pre.intersect(ray,v0,v1,v2,/*UVIdentity(),*/Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of M triangles. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); - return pre.intersect(ray,v0,v1,v2,/*UVIdentity(),*/Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - }; - - /*! Intersects M motion blur triangles with K rays. */ - template - struct TriangleMiMBIntersectorKMoeller - { - typedef TriangleMi Primitive; - typedef MoellerTrumboreIntersectorK Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleMi& tri) - { - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - Vec3vf v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); - pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity(),*/IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleMi& tri) - { - vbool valid0 = valid_i; - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - Vec3vf v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); - pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity(),*/OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); - if (none(valid0)) break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleMi& tri) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); - pre.intersect(ray,k,v0,v1,v2,/*UVIdentity(),*/Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleMi& tri) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); - return pre.intersect(ray,k,v0,v1,v2,/*UVIdentity(),*/Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - }; - - /*! Intersects M motion blur triangles with 1 ray */ - template - struct TriangleMiMBIntersector1Pluecker - { - typedef TriangleMi Primitive; - typedef PlueckerIntersector1 Precalculations; - - /*! Intersect a ray with the M triangles and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); - pre.intersect(ray,v0,v1,v2,UVIdentity(),Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of M triangles. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); - return pre.intersect(ray,v0,v1,v2,UVIdentity(),Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - }; - - /*! Intersects M motion blur triangles with K rays. */ - template - struct TriangleMiMBIntersectorKPluecker - { - typedef TriangleMi Primitive; - typedef PlueckerIntersectorK Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleMi& tri) - { - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - Vec3vf v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); - pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity(),IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleMi& tri) - { - vbool valid0 = valid_i; - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - Vec3vf v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); - pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity(),OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); - if (none(valid0)) break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleMi& tri) - { - STAT3(normal.trav_prims,1,1,1); - Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); - pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleMi& tri) - { - STAT3(shadow.trav_prims,1,1,1); - Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); - return pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev.h deleted file mode 100644 index 19af389e73..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/trianglev.h +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" - -namespace embree -{ - /* Stores the vertices of M triangles in struct of array layout */ - template - struct TriangleMv - { - public: - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - static Type type; - - public: - - /* Returns maximum number of stored triangles */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } - - public: - - /* Default constructor */ - __forceinline TriangleMv() {} - - /* Construction from vertices and IDs */ - __forceinline TriangleMv(const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const vuint& geomIDs, const vuint& primIDs) - : v0(v0), v1(v1), v2(v2), geomIDs(geomIDs), primIDs(primIDs) {} - - /* Returns a mask that tells which triangles are valid */ - __forceinline vbool valid() const { return geomIDs != vuint(-1); } - - /* Returns true if the specified triangle is valid */ - __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } - __forceinline const vuint& geomID() const { return geomIDs; } - __forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } - __forceinline const vuint& primID() const { return primIDs; } - __forceinline unsigned int primID(const size_t i) const { assert(i lower = min(v0,v1,v2); - Vec3vf upper = max(v0,v1,v2); - vbool mask = valid(); - lower.x = select(mask,lower.x,vfloat(pos_inf)); - lower.y = select(mask,lower.y,vfloat(pos_inf)); - lower.z = select(mask,lower.z,vfloat(pos_inf)); - upper.x = select(mask,upper.x,vfloat(neg_inf)); - upper.y = select(mask,upper.y,vfloat(neg_inf)); - upper.z = select(mask,upper.z,vfloat(neg_inf)); - return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), - Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); - } - - /* Non temporal store */ - __forceinline static void store_nt(TriangleMv* dst, const TriangleMv& src) - { - vfloat::store_nt(&dst->v0.x,src.v0.x); - vfloat::store_nt(&dst->v0.y,src.v0.y); - vfloat::store_nt(&dst->v0.z,src.v0.z); - vfloat::store_nt(&dst->v1.x,src.v1.x); - vfloat::store_nt(&dst->v1.y,src.v1.y); - vfloat::store_nt(&dst->v1.z,src.v1.z); - vfloat::store_nt(&dst->v2.x,src.v2.x); - vfloat::store_nt(&dst->v2.y,src.v2.y); - vfloat::store_nt(&dst->v2.z,src.v2.z); - vuint::store_nt(&dst->geomIDs,src.geomIDs); - vuint::store_nt(&dst->primIDs,src.primIDs); - } - - /* Fill triangle from triangle list */ - __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) - { - vuint vgeomID = -1, vprimID = -1; - Vec3vf v0 = zero, v1 = zero, v2 = zero; - - for (size_t i=0; iget(geomID); - const TriangleMesh::Triangle& tri = mesh->triangle(primID); - const Vec3fa& p0 = mesh->vertex(tri.v[0]); - const Vec3fa& p1 = mesh->vertex(tri.v[1]); - const Vec3fa& p2 = mesh->vertex(tri.v[2]); - vgeomID [i] = geomID; - vprimID [i] = primID; - v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; - v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; - v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; - } - TriangleMv::store_nt(this,TriangleMv(v0,v1,v2,vgeomID,vprimID)); - } - - /* Updates the primitive */ - __forceinline BBox3fa update(TriangleMesh* mesh) - { - BBox3fa bounds = empty; - vuint vgeomID = -1, vprimID = -1; - Vec3vf v0 = zero, v1 = zero, v2 = zero; - - for (size_t i=0; itriangle(primId); - const Vec3fa p0 = mesh->vertex(tri.v[0]); - const Vec3fa p1 = mesh->vertex(tri.v[1]); - const Vec3fa p2 = mesh->vertex(tri.v[2]); - bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); - vgeomID [i] = geomId; - vprimID [i] = primId; - v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; - v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; - v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; - } - new (this) TriangleMv(v0,v1,v2,vgeomID,vprimID); - return bounds; - } - - public: - Vec3vf v0; // 1st vertex of the triangles - Vec3vf v1; // 2nd vertex of the triangles - Vec3vf v2; // 3rd vertex of the triangles - private: - vuint geomIDs; // geometry ID - vuint primIDs; // primitive ID - }; - - template - typename TriangleMv::Type TriangleMv::type; - - typedef TriangleMv<4> Triangle4v; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h deleted file mode 100644 index 6af0d5a11c..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h +++ /dev/null @@ -1,206 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "triangle.h" -#include "triangle_intersector_pluecker.h" -#include "triangle_intersector_moeller.h" -#include "triangle_intersector_woop.h" - -namespace embree -{ - namespace isa - { - /*! Intersects M triangles with 1 ray */ - template - struct TriangleMvIntersector1Moeller - { - typedef TriangleMv Primitive; - typedef MoellerTrumboreIntersector1 Precalculations; - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity(),*/Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity(),*/Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - }; - - - template - struct TriangleMvIntersector1Woop - { - typedef TriangleMv Primitive; - typedef WoopIntersector1 intersec; - typedef WoopPrecalculations1 Precalculations; - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - return intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - }; - - - /*! Intersects M triangles with K rays */ - template - struct TriangleMvIntersectorKMoeller - { - typedef TriangleMv Primitive; - typedef MoellerTrumboreIntersectorK Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& tri) - { - for (size_t i=0; i v0 = broadcast>(tri.v0,i); - const Vec3vf v1 = broadcast>(tri.v1,i); - const Vec3vf v2 = broadcast>(tri.v2,i); - pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity(),*/IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& tri) - { - vbool valid0 = valid_i; - - for (size_t i=0; i v0 = broadcast>(tri.v0,i); - const Vec3vf v1 = broadcast>(tri.v1,i); - const Vec3vf v2 = broadcast>(tri.v2,i); - pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity(),*/OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); - if (none(valid0)) break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity(),*/Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity(),*/Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx - } - }; - - /*! Intersects M triangles with 1 ray */ - template - struct TriangleMvIntersector1Pluecker - { - typedef TriangleMv Primitive; - typedef PlueckerIntersector1 Precalculations; - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity(),Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity(),Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - }; - - /*! Intersects M triangles with K rays */ - template - struct TriangleMvIntersectorKPluecker - { - typedef TriangleMv Primitive; - typedef PlueckerIntersectorK Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& tri) - { - for (size_t i=0; i v0 = broadcast>(tri.v0,i); - const Vec3vf v1 = broadcast>(tri.v1,i); - const Vec3vf v2 = broadcast>(tri.v2,i); - pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity(),IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& tri) - { - vbool valid0 = valid_i; - - for (size_t i=0; i v0 = broadcast>(tri.v0,i); - const Vec3vf v1 = broadcast>(tri.v1,i); - const Vec3vf v2 = broadcast>(tri.v2,i); - pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity(),OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); - if (none(valid0)) break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& tri) - { - STAT3(normal.trav_prims,1,1,1); - pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity(),Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& tri) - { - STAT3(shadow.trav_prims,1,1,1); - return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity(),Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h deleted file mode 100644 index 63137aee16..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "primitive.h" - -namespace embree -{ - /* Stores the vertices of M triangles in struct of array layout */ - template - struct TriangleMvMB - { - public: - struct Type : public PrimitiveType - { - const char* name() const; - size_t sizeActive(const char* This) const; - size_t sizeTotal(const char* This) const; - size_t getBytes(const char* This) const; - }; - - static Type type; - - public: - - /* primitive supports single time segments */ - static const bool singleTimeSegment = true; - - /* Returns maximum number of stored triangles */ - static __forceinline size_t max_size() { return M; } - - /* Returns required number of primitive blocks for N primitives */ - static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } - - public: - - /* Default constructor */ - __forceinline TriangleMvMB() {} - - /* Construction from vertices and IDs */ - __forceinline TriangleMvMB(const Vec3vf& a0, const Vec3vf& a1, - const Vec3vf& b0, const Vec3vf& b1, - const Vec3vf& c0, const Vec3vf& c1, - const vuint& geomIDs, const vuint& primIDs) - : v0(a0), v1(b0), v2(c0), dv0(a1-a0), dv1(b1-b0), dv2(c1-c0), geomIDs(geomIDs), primIDs(primIDs) {} - - /* Returns a mask that tells which triangles are valid */ - __forceinline vbool valid() const { return geomIDs != vuint(-1); } - - /* Returns if the specified triangle is valid */ - __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } - __forceinline const vuint& geomID() const { return geomIDs; } - __forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } - __forceinline const vuint& primID() const { return primIDs; } - __forceinline unsigned int primID(const size_t i) const { assert(i lower = min(v0,v1,v2); - Vec3vf upper = max(v0,v1,v2); - const vbool mask = valid(); - lower.x = select(mask,lower.x,vfloat(pos_inf)); - lower.y = select(mask,lower.y,vfloat(pos_inf)); - lower.z = select(mask,lower.z,vfloat(pos_inf)); - upper.x = select(mask,upper.x,vfloat(neg_inf)); - upper.y = select(mask,upper.y,vfloat(neg_inf)); - upper.z = select(mask,upper.z,vfloat(neg_inf)); - return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), - Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); - } - - /* Calculate the bounds of the triangles at t1 */ - __forceinline BBox3fa bounds1() const - { - const Vec3vf p0 = v0+dv0; - const Vec3vf p1 = v1+dv1; - const Vec3vf p2 = v2+dv2; - Vec3vf lower = min(p0,p1,p2); - Vec3vf upper = max(p0,p1,p2); - const vbool mask = valid(); - lower.x = select(mask,lower.x,vfloat(pos_inf)); - lower.y = select(mask,lower.y,vfloat(pos_inf)); - lower.z = select(mask,lower.z,vfloat(pos_inf)); - upper.x = select(mask,upper.x,vfloat(neg_inf)); - upper.y = select(mask,upper.y,vfloat(neg_inf)); - upper.z = select(mask,upper.z,vfloat(neg_inf)); - return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), - Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); - } - - /* Calculate the linear bounds of the primitive */ - __forceinline LBBox3fa linearBounds() const { - return LBBox3fa(bounds0(),bounds1()); - } - - /* Fill triangle from triangle list */ - __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) - { - vuint vgeomID = -1, vprimID = -1; - Vec3vf va0 = zero, vb0 = zero, vc0 = zero; - Vec3vf va1 = zero, vb1 = zero, vc1 = zero; - - BBox3fa bounds0 = empty; - BBox3fa bounds1 = empty; - - for (size_t i=0; iget(geomID); - const TriangleMesh::Triangle& tri = mesh->triangle(primID); - const Vec3fa& a0 = mesh->vertex(tri.v[0],itime+0); bounds0.extend(a0); - const Vec3fa& a1 = mesh->vertex(tri.v[0],itime+1); bounds1.extend(a1); - const Vec3fa& b0 = mesh->vertex(tri.v[1],itime+0); bounds0.extend(b0); - const Vec3fa& b1 = mesh->vertex(tri.v[1],itime+1); bounds1.extend(b1); - const Vec3fa& c0 = mesh->vertex(tri.v[2],itime+0); bounds0.extend(c0); - const Vec3fa& c1 = mesh->vertex(tri.v[2],itime+1); bounds1.extend(c1); - vgeomID [i] = geomID; - vprimID [i] = primID; - va0.x[i] = a0.x; va0.y[i] = a0.y; va0.z[i] = a0.z; - va1.x[i] = a1.x; va1.y[i] = a1.y; va1.z[i] = a1.z; - vb0.x[i] = b0.x; vb0.y[i] = b0.y; vb0.z[i] = b0.z; - vb1.x[i] = b1.x; vb1.y[i] = b1.y; vb1.z[i] = b1.z; - vc0.x[i] = c0.x; vc0.y[i] = c0.y; vc0.z[i] = c0.z; - vc1.x[i] = c1.x; vc1.y[i] = c1.y; vc1.z[i] = c1.z; - } - new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); - return LBBox3fa(bounds0,bounds1); - } - - /* Fill triangle from triangle list */ - __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) - { - vuint vgeomID = -1, vprimID = -1; - Vec3vf va0 = zero, vb0 = zero, vc0 = zero; - Vec3vf va1 = zero, vb1 = zero, vc1 = zero; - - LBBox3fa allBounds = empty; - for (size_t i=0; iget(geomID); - const range itime_range = mesh->timeSegmentRange(time_range); - assert(itime_range.size() == 1); - const int ilower = itime_range.begin(); - const TriangleMesh::Triangle& tri = mesh->triangle(primID); - allBounds.extend(mesh->linearBounds(primID, time_range)); - const Vec3fa& a0 = mesh->vertex(tri.v[0],ilower+0); - const Vec3fa& a1 = mesh->vertex(tri.v[0],ilower+1); - const Vec3fa& b0 = mesh->vertex(tri.v[1],ilower+0); - const Vec3fa& b1 = mesh->vertex(tri.v[1],ilower+1); - const Vec3fa& c0 = mesh->vertex(tri.v[2],ilower+0); - const Vec3fa& c1 = mesh->vertex(tri.v[2],ilower+1); - const BBox1f time_range_v(mesh->timeStep(ilower+0),mesh->timeStep(ilower+1)); - auto a01 = globalLinear(std::make_pair(a0,a1),time_range_v); - auto b01 = globalLinear(std::make_pair(b0,b1),time_range_v); - auto c01 = globalLinear(std::make_pair(c0,c1),time_range_v); - vgeomID [i] = geomID; - vprimID [i] = primID; - va0.x[i] = a01.first .x; va0.y[i] = a01.first .y; va0.z[i] = a01.first .z; - va1.x[i] = a01.second.x; va1.y[i] = a01.second.y; va1.z[i] = a01.second.z; - vb0.x[i] = b01.first .x; vb0.y[i] = b01.first .y; vb0.z[i] = b01.first .z; - vb1.x[i] = b01.second.x; vb1.y[i] = b01.second.y; vb1.z[i] = b01.second.z; - vc0.x[i] = c01.first .x; vc0.y[i] = c01.first .y; vc0.z[i] = c01.first .z; - vc1.x[i] = c01.second.x; vc1.y[i] = c01.second.y; vc1.z[i] = c01.second.z; - } - new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); - return allBounds; - } - - public: - Vec3vf v0; // 1st vertex of the triangles - Vec3vf v1; // 2nd vertex of the triangles - Vec3vf v2; // 3rd vertex of the triangles - Vec3vf dv0; // difference vector between time steps t0 and t1 for first vertex - Vec3vf dv1; // difference vector between time steps t0 and t1 for second vertex - Vec3vf dv2; // difference vector between time steps t0 and t1 for third vertex - private: - vuint geomIDs; // geometry ID - vuint primIDs; // primitive ID - }; - - template - typename TriangleMvMB::Type TriangleMvMB::type; - - typedef TriangleMvMB<4> Triangle4vMB; -} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h deleted file mode 100644 index 35a260d826..0000000000 --- a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "triangle.h" -#include "intersector_epilog.h" - -namespace embree -{ - namespace isa - { - /*! Intersects M motion blur triangles with 1 ray */ - template - struct TriangleMvMBIntersector1Moeller - { - typedef TriangleMvMB Primitive; - typedef MoellerTrumboreIntersector1 Precalculations; - - /*! Intersect a ray with the M triangles and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB& tri) - { - STAT3(normal.trav_prims,1,1,1); - const Vec3vf time(ray.time()); - const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); - const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); - const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); - pre.intersect(ray,v0,v1,v2,Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of M triangles. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB& tri) - { - STAT3(shadow.trav_prims,1,1,1); - const Vec3vf time(ray.time()); - const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); - const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); - const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); - return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - }; - - /*! Intersects M motion blur triangles with K rays. */ - template - struct TriangleMvMBIntersectorKMoeller - { - typedef TriangleMvMB Primitive; - typedef MoellerTrumboreIntersectorK Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleMvMB& tri) - { - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - const Vec3vf time(ray.time()); - const Vec3vf v0 = madd(time,broadcast>(tri.dv0,i),broadcast>(tri.v0,i)); - const Vec3vf v1 = madd(time,broadcast>(tri.dv1,i),broadcast>(tri.v1,i)); - const Vec3vf v2 = madd(time,broadcast>(tri.dv2,i),broadcast>(tri.v2,i)); - pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleMvMB& tri) - { - vbool valid0 = valid_i; - - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - const Vec3vf time(ray.time()); - const Vec3vf v0 = madd(time,broadcast>(tri.dv0,i),broadcast>(tri.v0,i)); - const Vec3vf v1 = madd(time,broadcast>(tri.dv1,i),broadcast>(tri.v1,i)); - const Vec3vf v2 = madd(time,broadcast>(tri.dv2,i),broadcast>(tri.v2,i)); - pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); - if (none(valid0)) break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleMvMB& tri) - { - STAT3(normal.trav_prims,1,1,1); - const Vec3vf time(ray.time()[k]); - const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); - const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); - const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); - pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleMvMB& tri) - { - STAT3(shadow.trav_prims,1,1,1); - const Vec3vf time(ray.time()[k]); - const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); - const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); - const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); - return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - }; - - /*! Intersects M motion blur triangles with 1 ray */ - template - struct TriangleMvMBIntersector1Pluecker - { - typedef TriangleMvMB Primitive; - typedef PlueckerIntersector1 Precalculations; - - /*! Intersect a ray with the M triangles and updates the hit. */ - static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB& tri) - { - STAT3(normal.trav_prims,1,1,1); - const Vec3vf time(ray.time()); - const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); - const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); - const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); - pre.intersect(ray,v0,v1,v2,UVIdentity(),Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of M triangles. */ - static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB& tri) - { - STAT3(shadow.trav_prims,1,1,1); - const Vec3vf time(ray.time()); - const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); - const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); - const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); - return pre.intersect(ray,v0,v1,v2,UVIdentity(),Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); - } - - static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) - { - return PrimitivePointQuery1::pointQuery(query, context, tri); - } - }; - - /*! Intersects M motion blur triangles with K rays. */ - template - struct TriangleMvMBIntersectorKPluecker - { - typedef TriangleMvMB Primitive; - typedef PlueckerIntersectorK Precalculations; - - /*! Intersects K rays with M triangles. */ - static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleMvMB& tri) - { - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(normal.trav_prims,1,popcnt(valid_i),K); - const Vec3vf time(ray.time()); - const Vec3vf v0 = madd(time,broadcast>(tri.dv0,i),broadcast>(tri.v0,i)); - const Vec3vf v1 = madd(time,broadcast>(tri.dv1,i),broadcast>(tri.v1,i)); - const Vec3vf v2 = madd(time,broadcast>(tri.dv2,i),broadcast>(tri.v2,i)); - pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity(),IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); - } - } - - /*! Test for K rays if they are occluded by any of the M triangles. */ - static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleMvMB& tri) - { - vbool valid0 = valid_i; - - for (size_t i=0; i::max_size(); i++) - { - if (!tri.valid(i)) break; - STAT3(shadow.trav_prims,1,popcnt(valid0),K); - const Vec3vf time(ray.time()); - const Vec3vf v0 = madd(time,broadcast>(tri.dv0,i),broadcast>(tri.v0,i)); - const Vec3vf v1 = madd(time,broadcast>(tri.dv1,i),broadcast>(tri.v1,i)); - const Vec3vf v2 = madd(time,broadcast>(tri.dv2,i),broadcast>(tri.v2,i)); - pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity(),OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); - if (none(valid0)) break; - } - return !valid0; - } - - /*! Intersect a ray with M triangles and updates the hit. */ - static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleMvMB& tri) - { - STAT3(normal.trav_prims,1,1,1); - const Vec3vf time(ray.time()[k]); - const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); - const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); - const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); - pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - - /*! Test if the ray is occluded by one of the M triangles. */ - static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleMvMB& tri) - { - STAT3(shadow.trav_prims,1,1,1); - const Vec3vf time(ray.time()[k]); - const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); - const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); - const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); - return pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); - } - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/hash.h b/thirdparty/embree-aarch64/kernels/hash.h deleted file mode 100644 index 4abbe203d6..0000000000 --- a/thirdparty/embree-aarch64/kernels/hash.h +++ /dev/null @@ -1,5 +0,0 @@ - -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#define RTC_HASH "6ef362f99af80c9dfe8dd2bfc582d9067897edc6" diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h deleted file mode 100644 index c0e78820f8..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h +++ /dev/null @@ -1,669 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/default.h" -#include "../common/scene_curves.h" - -namespace embree -{ - class BezierBasis - { - public: - - template - static __forceinline Vec4 eval(const T& u) - { - const T t1 = u; - const T t0 = 1.0f-t1; - const T B0 = t0 * t0 * t0; - const T B1 = 3.0f * t1 * (t0 * t0); - const T B2 = 3.0f * (t1 * t1) * t0; - const T B3 = t1 * t1 * t1; - return Vec4(B0,B1,B2,B3); - } - - template - static __forceinline Vec4 derivative(const T& u) - { - const T t1 = u; - const T t0 = 1.0f-t1; - const T B0 = -(t0*t0); - const T B1 = madd(-2.0f,t0*t1,t0*t0); - const T B2 = msub(+2.0f,t0*t1,t1*t1); - const T B3 = +(t1*t1); - return T(3.0f)*Vec4(B0,B1,B2,B3); - } - - template - static __forceinline Vec4 derivative2(const T& u) - { - const T t1 = u; - const T t0 = 1.0f-t1; - const T B0 = t0; - const T B1 = madd(-2.0f,t0,t1); - const T B2 = madd(-2.0f,t1,t0); - const T B3 = t1; - return T(6.0f)*Vec4(B0,B1,B2,B3); - } - }; - - struct PrecomputedBezierBasis - { - enum { N = 16 }; - public: - PrecomputedBezierBasis() {} - PrecomputedBezierBasis(int shift); - - /* basis for bezier evaluation */ - public: - float c0[N+1][N+1]; - float c1[N+1][N+1]; - float c2[N+1][N+1]; - float c3[N+1][N+1]; - - /* basis for bezier derivative evaluation */ - public: - float d0[N+1][N+1]; - float d1[N+1][N+1]; - float d2[N+1][N+1]; - float d3[N+1][N+1]; - }; - extern PrecomputedBezierBasis bezier_basis0; - extern PrecomputedBezierBasis bezier_basis1; - - - template - struct LinearBezierCurve - { - V v0,v1; - - __forceinline LinearBezierCurve () {} - - __forceinline LinearBezierCurve (const LinearBezierCurve& other) - : v0(other.v0), v1(other.v1) {} - - __forceinline LinearBezierCurve& operator= (const LinearBezierCurve& other) { - v0 = other.v0; v1 = other.v1; return *this; - } - - __forceinline LinearBezierCurve (const V& v0, const V& v1) - : v0(v0), v1(v1) {} - - __forceinline V begin() const { return v0; } - __forceinline V end () const { return v1; } - - bool hasRoot() const; - - friend embree_ostream operator<<(embree_ostream cout, const LinearBezierCurve& a) { - return cout << "LinearBezierCurve (" << a.v0 << ", " << a.v1 << ")"; - } - }; - - template<> __forceinline bool LinearBezierCurve::hasRoot() const { - return numRoots(v0,v1); - } - - template - struct QuadraticBezierCurve - { - V v0,v1,v2; - - __forceinline QuadraticBezierCurve () {} - - __forceinline QuadraticBezierCurve (const QuadraticBezierCurve& other) - : v0(other.v0), v1(other.v1), v2(other.v2) {} - - __forceinline QuadraticBezierCurve& operator= (const QuadraticBezierCurve& other) { - v0 = other.v0; v1 = other.v1; v2 = other.v2; return *this; - } - - __forceinline QuadraticBezierCurve (const V& v0, const V& v1, const V& v2) - : v0(v0), v1(v1), v2(v2) {} - - __forceinline V begin() const { return v0; } - __forceinline V end () const { return v2; } - - __forceinline V interval() const { - return merge(v0,v1,v2); - } - - __forceinline BBox bounds() const { - return merge(BBox(v0),BBox(v1),BBox(v2)); - } - - friend embree_ostream operator<<(embree_ostream cout, const QuadraticBezierCurve& a) { - return cout << "QuadraticBezierCurve ( (" << a.u.lower << ", " << a.u.upper << "), " << a.v0 << ", " << a.v1 << ", " << a.v2 << ")"; - } - }; - - - typedef QuadraticBezierCurve QuadraticBezierCurve1f; - typedef QuadraticBezierCurve QuadraticBezierCurve2fa; - typedef QuadraticBezierCurve QuadraticBezierCurve3fa; - - template - struct CubicBezierCurve - { - Vertex v0,v1,v2,v3; - - __forceinline CubicBezierCurve() {} - - template - __forceinline CubicBezierCurve (const CubicBezierCurve& other) - : v0(other.v0), v1(other.v1), v2(other.v2), v3(other.v3) {} - - __forceinline CubicBezierCurve& operator= (const CubicBezierCurve& other) { - v0 = other.v0; v1 = other.v1; v2 = other.v2; v3 = other.v3; return *this; - } - - __forceinline CubicBezierCurve(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) - : v0(v0), v1(v1), v2(v2), v3(v3) {} - - __forceinline Vertex begin() const { - return v0; - } - - __forceinline Vertex end() const { - return v3; - } - - __forceinline Vertex center() const { - return 0.25f*(v0+v1+v2+v3); - } - - __forceinline Vertex begin_direction() const { - return v1-v0; - } - - __forceinline Vertex end_direction() const { - return v3-v2; - } - - __forceinline CubicBezierCurve xfm(const Vertex& dx) const { - return CubicBezierCurve(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx)); - } - - __forceinline CubicBezierCurve vxfm(const Vertex& dx) const { - return CubicBezierCurve(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx)); - } - - __forceinline CubicBezierCurve xfm(const Vertex& dx, const Vertex& p) const { - return CubicBezierCurve(dot(v0-p,dx),dot(v1-p,dx),dot(v2-p,dx),dot(v3-p,dx)); - } - - __forceinline CubicBezierCurve xfm(const LinearSpace3fa& space) const - { - const Vec3fa q0 = xfmVector(space,v0); - const Vec3fa q1 = xfmVector(space,v1); - const Vec3fa q2 = xfmVector(space,v2); - const Vec3fa q3 = xfmVector(space,v3); - return CubicBezierCurve(q0,q1,q2,q3); - } - - __forceinline CubicBezierCurve xfm(const LinearSpace3fa& space, const Vec3fa& p) const - { - const Vec3fa q0 = xfmVector(space,v0-p); - const Vec3fa q1 = xfmVector(space,v1-p); - const Vec3fa q2 = xfmVector(space,v2-p); - const Vec3fa q3 = xfmVector(space,v3-p); - return CubicBezierCurve(q0,q1,q2,q3); - } - - __forceinline CubicBezierCurve xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const - { - const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w); - const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w); - const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w); - const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w); - return CubicBezierCurve(q0,q1,q2,q3); - } - - __forceinline CubicBezierCurve xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const - { - const Vec3fa q0 = xfmVector(space,s*(v0-p)); - const Vec3fa q1 = xfmVector(space,s*(v1-p)); - const Vec3fa q2 = xfmVector(space,s*(v2-p)); - const Vec3fa q3 = xfmVector(space,s*(v3-p)); - return CubicBezierCurve(q0,q1,q2,q3); - } - - __forceinline int maxRoots() const; - - __forceinline BBox bounds() const { - return merge(BBox(v0),BBox(v1),BBox(v2),BBox(v3)); - } - - __forceinline friend CubicBezierCurve operator +( const CubicBezierCurve& a, const CubicBezierCurve& b ) { - return CubicBezierCurve(a.v0+b.v0,a.v1+b.v1,a.v2+b.v2,a.v3+b.v3); - } - - __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const CubicBezierCurve& b ) { - return CubicBezierCurve(a.v0-b.v0,a.v1-b.v1,a.v2-b.v2,a.v3-b.v3); - } - - __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const Vertex& b ) { - return CubicBezierCurve(a.v0-b,a.v1-b,a.v2-b,a.v3-b); - } - - __forceinline friend CubicBezierCurve operator *( const Vertex& a, const CubicBezierCurve& b ) { - return CubicBezierCurve(a*b.v0,a*b.v1,a*b.v2,a*b.v3); - } - - __forceinline friend CubicBezierCurve cmadd( const Vertex& a, const CubicBezierCurve& b, const CubicBezierCurve& c) { - return CubicBezierCurve(madd(a,b.v0,c.v0),madd(a,b.v1,c.v1),madd(a,b.v2,c.v2),madd(a,b.v3,c.v3)); - } - - __forceinline friend CubicBezierCurve clerp ( const CubicBezierCurve& a, const CubicBezierCurve& b, const Vertex& t ) { - return cmadd((Vertex(1.0f)-t),a,t*b); - } - - __forceinline friend CubicBezierCurve merge ( const CubicBezierCurve& a, const CubicBezierCurve& b ) { - return CubicBezierCurve(merge(a.v0,b.v0),merge(a.v1,b.v1),merge(a.v2,b.v2),merge(a.v3,b.v3)); - } - - __forceinline void split(CubicBezierCurve& left, CubicBezierCurve& right, const float t = 0.5f) const - { - const Vertex p00 = v0; - const Vertex p01 = v1; - const Vertex p02 = v2; - const Vertex p03 = v3; - - const Vertex p10 = lerp(p00,p01,t); - const Vertex p11 = lerp(p01,p02,t); - const Vertex p12 = lerp(p02,p03,t); - const Vertex p20 = lerp(p10,p11,t); - const Vertex p21 = lerp(p11,p12,t); - const Vertex p30 = lerp(p20,p21,t); - - new (&left ) CubicBezierCurve(p00,p10,p20,p30); - new (&right) CubicBezierCurve(p30,p21,p12,p03); - } - - __forceinline CubicBezierCurve split() const - { - const float u0 = 0.0f, u1 = 1.0f; - const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1))); - const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1))); - Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale); - const Vec2vfx P3 = shift_right_1(P0); - const Vec2vfx dP3du = shift_right_1(dP0du); - const Vec2vfx P1 = P0 + dP0du; - const Vec2vfx P2 = P3 - dP3du; - return CubicBezierCurve(P0,P1,P2,P3); - } - - __forceinline CubicBezierCurve split(const BBox1f& u) const - { - const float u0 = u.lower, u1 = u.upper; - const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1))); - const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1))); - Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale); - const Vec2vfx P3 = shift_right_1(P0); - const Vec2vfx dP3du = shift_right_1(dP0du); - const Vec2vfx P1 = P0 + dP0du; - const Vec2vfx P2 = P3 - dP3du; - return CubicBezierCurve(P0,P1,P2,P3); - } - - __forceinline void eval(float t, Vertex& p, Vertex& dp) const - { - const Vertex p00 = v0; - const Vertex p01 = v1; - const Vertex p02 = v2; - const Vertex p03 = v3; - - const Vertex p10 = lerp(p00,p01,t); - const Vertex p11 = lerp(p01,p02,t); - const Vertex p12 = lerp(p02,p03,t); - const Vertex p20 = lerp(p10,p11,t); - const Vertex p21 = lerp(p11,p12,t); - const Vertex p30 = lerp(p20,p21,t); - - p = p30; - dp = Vertex(3.0f)*(p21-p20); - } - -#if 0 - __forceinline Vertex eval(float t) const - { - const Vertex p00 = v0; - const Vertex p01 = v1; - const Vertex p02 = v2; - const Vertex p03 = v3; - - const Vertex p10 = lerp(p00,p01,t); - const Vertex p11 = lerp(p01,p02,t); - const Vertex p12 = lerp(p02,p03,t); - const Vertex p20 = lerp(p10,p11,t); - const Vertex p21 = lerp(p11,p12,t); - const Vertex p30 = lerp(p20,p21,t); - - return p30; - } -#else - __forceinline Vertex eval(const float t) const - { - const Vec4 b = BezierBasis::eval(t); - return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); - } -#endif - - __forceinline Vertex eval_dt(float t) const - { - const Vertex p00 = v1-v0; - const Vertex p01 = v2-v1; - const Vertex p02 = v3-v2; - const Vertex p10 = lerp(p00,p01,t); - const Vertex p11 = lerp(p01,p02,t); - const Vertex p20 = lerp(p10,p11,t); - return Vertex(3.0f)*p20; - } - - __forceinline Vertex eval_du(const float t) const - { - const Vec4 b = BezierBasis::derivative(t); - return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); - } - - __forceinline Vertex eval_dudu(const float t) const - { - const Vec4 b = BezierBasis::derivative2(t); - return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); - } - - __forceinline void evalN(const vfloatx& t, Vec2vfx& p, Vec2vfx& dp) const - { - const Vec2vfx p00 = v0; - const Vec2vfx p01 = v1; - const Vec2vfx p02 = v2; - const Vec2vfx p03 = v3; - - const Vec2vfx p10 = lerp(p00,p01,t); - const Vec2vfx p11 = lerp(p01,p02,t); - const Vec2vfx p12 = lerp(p02,p03,t); - - const Vec2vfx p20 = lerp(p10,p11,t); - const Vec2vfx p21 = lerp(p11,p12,t); - - const Vec2vfx p30 = lerp(p20,p21,t); - - p = p30; - dp = vfloatx(3.0f)*(p21-p20); - } - - __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const - { - const Vertex p00 = v0; - const Vertex p01 = v1; - const Vertex p02 = v2; - const Vertex p03 = v3; - const Vertex p10 = lerp(p00,p01,t); - const Vertex p11 = lerp(p01,p02,t); - const Vertex p12 = lerp(p02,p03,t); - const Vertex p20 = lerp(p10,p11,t); - const Vertex p21 = lerp(p11,p12,t); - const Vertex p30 = lerp(p20,p21,t); - p = p30; - dp = 3.0f*(p21-p20); - ddp = eval_dudu(t); - } - - __forceinline CubicBezierCurve clip(const Interval1f& u1) const - { - Vertex f0,df0; eval(u1.lower,f0,df0); - Vertex f1,df1; eval(u1.upper,f1,df1); - float s = u1.upper-u1.lower; - return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1); - } - - __forceinline QuadraticBezierCurve derivative() const - { - const Vertex q0 = 3.0f*(v1-v0); - const Vertex q1 = 3.0f*(v2-v1); - const Vertex q2 = 3.0f*(v3-v2); - return QuadraticBezierCurve(q0,q1,q2); - } - - __forceinline BBox derivative_bounds(const Interval1f& u1) const - { - Vertex f0,df0; eval(u1.lower,f0,df0); - Vertex f3,df3; eval(u1.upper,f3,df3); - const float s = u1.upper-u1.lower; - const Vertex f1 = f0+s*(1.0f/3.0f)*df0; - const Vertex f2 = f3-s*(1.0f/3.0f)*df3; - const Vertex q0 = s*df0; - const Vertex q1 = 3.0f*(f2-f1); - const Vertex q2 = s*df3; - return merge(BBox(q0),BBox(q1),BBox(q2)); - } - - template - __forceinline Vec4vf veval(const vfloat& t) const - { - const Vec4vf b = BezierBasis::eval(t); - return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf veval_du(const vfloat& t) const - { - const Vec4vf b = BezierBasis::derivative(t); - return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf veval_dudu(const vfloat& t) const - { - const Vec4vf b = BezierBasis::derivative2(t); - return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); - } - - template - __forceinline void veval(const vfloat& t, Vec4vf& p, Vec4vf& dp) const - { - const Vec4vf p00 = v0; - const Vec4vf p01 = v1; - const Vec4vf p02 = v2; - const Vec4vf p03 = v3; - - const Vec4vf p10 = lerp(p00,p01,t); - const Vec4vf p11 = lerp(p01,p02,t); - const Vec4vf p12 = lerp(p02,p03,t); - const Vec4vf p20 = lerp(p10,p11,t); - const Vec4vf p21 = lerp(p11,p12,t); - const Vec4vf p30 = lerp(p20,p21,t); - - p = p30; - dp = vfloat(3.0f)*(p21-p20); - } - - template> - __forceinline Vec eval0(const int ofs, const int size) const - { - assert(size <= PrecomputedBezierBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&bezier_basis0.c0[size][ofs]), Vec(v0), - madd(vfloat::loadu(&bezier_basis0.c1[size][ofs]), Vec(v1), - madd(vfloat::loadu(&bezier_basis0.c2[size][ofs]), Vec(v2), - vfloat::loadu(&bezier_basis0.c3[size][ofs]) * Vec(v3)))); - } - - template> - __forceinline Vec eval1(const int ofs, const int size) const - { - assert(size <= PrecomputedBezierBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&bezier_basis1.c0[size][ofs]), Vec(v0), - madd(vfloat::loadu(&bezier_basis1.c1[size][ofs]), Vec(v1), - madd(vfloat::loadu(&bezier_basis1.c2[size][ofs]), Vec(v2), - vfloat::loadu(&bezier_basis1.c3[size][ofs]) * Vec(v3)))); - } - - template> - __forceinline Vec derivative0(const int ofs, const int size) const - { - assert(size <= PrecomputedBezierBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&bezier_basis0.d0[size][ofs]), Vec(v0), - madd(vfloat::loadu(&bezier_basis0.d1[size][ofs]), Vec(v1), - madd(vfloat::loadu(&bezier_basis0.d2[size][ofs]), Vec(v2), - vfloat::loadu(&bezier_basis0.d3[size][ofs]) * Vec(v3)))); - } - - template> - __forceinline Vec derivative1(const int ofs, const int size) const - { - assert(size <= PrecomputedBezierBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&bezier_basis1.d0[size][ofs]), Vec(v0), - madd(vfloat::loadu(&bezier_basis1.d1[size][ofs]), Vec(v1), - madd(vfloat::loadu(&bezier_basis1.d2[size][ofs]), Vec(v2), - vfloat::loadu(&bezier_basis1.d3[size][ofs]) * Vec(v3)))); - } - - /* calculates bounds of bezier curve geometry */ - __forceinline BBox3fa accurateBounds() const - { - const int N = 7; - const float scale = 1.0f/(3.0f*(N-1)); - Vec3vfx pl(pos_inf), pu(neg_inf); - for (int i=0; i<=N; i+=VSIZEX) - { - vintx vi = vintx(i)+vintx(step); - vboolx valid = vi <= vintx(N); - const Vec3vfx p = eval0>(i,N); - const Vec3vfx dp = derivative0>(i,N); - const Vec3vfx pm = p-Vec3vfx(scale)*select(vi!=vintx(0),dp,Vec3vfx(zero)); - const Vec3vfx pp = p+Vec3vfx(scale)*select(vi!=vintx(N),dp,Vec3vfx(zero)); - pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min - pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min - } - const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); - const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); - return BBox3fa(lower,upper); - } - - /* calculates bounds of bezier curve geometry */ - __forceinline BBox3fa accurateRoundBounds() const - { - const int N = 7; - const float scale = 1.0f/(3.0f*(N-1)); - Vec4vfx pl(pos_inf), pu(neg_inf); - for (int i=0; i<=N; i+=VSIZEX) - { - vintx vi = vintx(i)+vintx(step); - vboolx valid = vi <= vintx(N); - const Vec4vfx p = eval0(i,N); - const Vec4vfx dp = derivative0(i,N); - const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); - const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); - pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min - pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min - } - const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); - const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); - const float r_min = reduce_min(pl.w); - const float r_max = reduce_max(pu.w); - const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); - return enlarge(BBox3fa(lower,upper),upper_r); - } - - /* calculates bounds when tessellated into N line segments */ - __forceinline BBox3fa accurateFlatBounds(int N) const - { - if (likely(N == 4)) - { - const Vec4vf4 pi = eval0<4>(0,4); - const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); - const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); - const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); - return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); - } - else - { - Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); - for (int i=0; i(i,N); - - pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min - pl.y = select(valid,min(pl.y,pi.y),pl.y); - pl.z = select(valid,min(pl.z,pi.z),pl.z); - - pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min - pu.y = select(valid,max(pu.y,pi.y),pu.y); - pu.z = select(valid,max(pu.z,pi.z),pu.z); - - ru = select(valid,max(ru,abs(pi.w)),ru); - } - const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); - const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); - const Vec3fa upper_r(reduce_max(ru)); - return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); - } - } - - friend __forceinline embree_ostream operator<<(embree_ostream cout, const CubicBezierCurve& curve) { - return cout << "CubicBezierCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; - } - }; - -#if defined(__AVX__) - template<> - __forceinline CubicBezierCurve CubicBezierCurve::clip(const Interval1f& u1) const - { - const vfloat8 p00 = vfloat8(v0); - const vfloat8 p01 = vfloat8(v1); - const vfloat8 p02 = vfloat8(v2); - const vfloat8 p03 = vfloat8(v3); - - const vfloat8 t(vfloat4(u1.lower),vfloat4(u1.upper)); - const vfloat8 p10 = lerp(p00,p01,t); - const vfloat8 p11 = lerp(p01,p02,t); - const vfloat8 p12 = lerp(p02,p03,t); - const vfloat8 p20 = lerp(p10,p11,t); - const vfloat8 p21 = lerp(p11,p12,t); - const vfloat8 p30 = lerp(p20,p21,t); - - const vfloat8 f01 = p30; - const vfloat8 df01 = vfloat8(3.0f)*(p21-p20); - - const vfloat4 f0 = extract4<0>(f01), f1 = extract4<1>(f01); - const vfloat4 df0 = extract4<0>(df01), df1 = extract4<1>(df01); - const float s = u1.upper-u1.lower; - return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1); - } -#endif - - template using BezierCurveT = CubicBezierCurve; - - typedef CubicBezierCurve CubicBezierCurve1f; - typedef CubicBezierCurve CubicBezierCurve2fa; - typedef CubicBezierCurve CubicBezierCurve3fa; - typedef CubicBezierCurve BezierCurve3fa; - - template<> __forceinline int CubicBezierCurve::maxRoots() const - { - float eps = 1E-4f; - bool neg0 = v0 <= 0.0f; bool zero0 = fabs(v0) < eps; - bool neg1 = v1 <= 0.0f; bool zero1 = fabs(v1) < eps; - bool neg2 = v2 <= 0.0f; bool zero2 = fabs(v2) < eps; - bool neg3 = v3 <= 0.0f; bool zero3 = fabs(v3) < eps; - return (neg0 != neg1 || zero0) + (neg1 != neg2 || zero1) + (neg2 != neg3 || zero2 || zero3); - } - - template<> __forceinline int CubicBezierCurve::maxRoots() const { - return numRoots(v0,v1) + numRoots(v1,v2) + numRoots(v2,v3); - } - - __forceinline CubicBezierCurve enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CubicBezierCurve& curve) - { - return CubicBezierCurve(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), - enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), - enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), - enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); - } -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h deleted file mode 100644 index d87ed41ccb..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h +++ /dev/null @@ -1,372 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "catmullclark_patch.h" -#include "bezier_curve.h" - -namespace embree -{ - template - static __forceinline T deCasteljau(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3) - { - const T v0_1 = lerp(v0,v1,uu); - const T v1_1 = lerp(v1,v2,uu); - const T v2_1 = lerp(v2,v3,uu); - const T v0_2 = lerp(v0_1,v1_1,uu); - const T v1_2 = lerp(v1_1,v2_1,uu); - const T v0_3 = lerp(v0_2,v1_2,uu); - return v0_3; - } - - template - static __forceinline T deCasteljau_tangent(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3) - { - const T v0_1 = lerp(v0,v1,uu); - const T v1_1 = lerp(v1,v2,uu); - const T v2_1 = lerp(v2,v3,uu); - const T v0_2 = lerp(v0_1,v1_1,uu); - const T v1_2 = lerp(v1_1,v2_1,uu); - return S(3.0f)*(v1_2-v0_2); - } - - template - __forceinline Vertex computeInnerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { - return 1.0f / 36.0f * (16.0f * v[y][x] + 4.0f * (v[y-1][x] + v[y+1][x] + v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y+1][x+1] + v[y-1][x+1] + v[y+1][x-1])); - } - - template - __forceinline Vertex computeTopEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { - return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y-1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y-1][x+1])); - } - - template - __forceinline Vertex computeBottomEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { - return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y+1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + v[y+1][x-1] + v[y+1][x+1]); - } - - template - __forceinline Vertex computeLeftEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { - return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x-1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x-1] + v[y+1][x-1]); - } - - template - __forceinline Vertex computeRightEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { - return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x+1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x+1] + v[y+1][x+1]); - } - - template - __forceinline Vertex computeCornerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x, const ssize_t delta_y, const ssize_t delta_x) - { - return 1.0f / 9.0f * (4.0f * v[y][x] + 2.0f * (v[y+delta_y][x] + v[y][x+delta_x]) + v[y+delta_y][x+delta_x]); - } - - template - class __aligned(64) BezierPatchT - { - public: - Vertex matrix[4][4]; - - public: - - __forceinline BezierPatchT() {} - - __forceinline BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride); - - __forceinline BezierPatchT(const CatmullClarkPatchT& patch); - - __forceinline BezierPatchT(const CatmullClarkPatchT& patch, - const BezierCurveT* border0, - const BezierCurveT* border1, - const BezierCurveT* border2, - const BezierCurveT* border3); - - __forceinline BezierPatchT(const BSplinePatchT& source) - { - /* compute inner bezier control points */ - matrix[0][0] = computeInnerBezierControlPoint(source.v,1,1); - matrix[0][3] = computeInnerBezierControlPoint(source.v,1,2); - matrix[3][3] = computeInnerBezierControlPoint(source.v,2,2); - matrix[3][0] = computeInnerBezierControlPoint(source.v,2,1); - - /* compute top edge control points */ - matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1); - matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); - - /* compute buttom edge control points */ - matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1); - matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2); - - /* compute left edge control points */ - matrix[1][0] = computeBottomEdgeBezierControlPoint(source.v,1,1); - matrix[2][0] = computeTopEdgeBezierControlPoint(source.v,2,1); - - /* compute right edge control points */ - matrix[1][3] = computeBottomEdgeBezierControlPoint(source.v,1,2); - matrix[2][3] = computeTopEdgeBezierControlPoint(source.v,2,2); - - /* compute corner control points */ - matrix[1][1] = computeCornerBezierControlPoint(source.v,1,1, 1, 1); - matrix[1][2] = computeCornerBezierControlPoint(source.v,1,2, 1,-1); - matrix[2][2] = computeCornerBezierControlPoint(source.v,2,2,-1,-1); - matrix[2][1] = computeCornerBezierControlPoint(source.v,2,1,-1, 1); - } - - static __forceinline Vertex_t bilinear(const Vec4f Bu, const Vertex matrix[4][4], const Vec4f Bv) - { - const Vertex_t M0 = madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))); - const Vertex_t M1 = madd(Bu.x,matrix[1][0],madd(Bu.y,matrix[1][1],madd(Bu.z,matrix[1][2],Bu.w * matrix[1][3]))); - const Vertex_t M2 = madd(Bu.x,matrix[2][0],madd(Bu.y,matrix[2][1],madd(Bu.z,matrix[2][2],Bu.w * matrix[2][3]))); - const Vertex_t M3 = madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3]))); - return madd(Bv.x,M0,madd(Bv.y,M1,madd(Bv.z,M2,Bv.w*M3))); - } - - static __forceinline Vertex_t eval(const Vertex matrix[4][4], const float uu, const float vv) - { - const Vec4f Bu = BezierBasis::eval(uu); - const Vec4f Bv = BezierBasis::eval(vv); - return bilinear(Bu,matrix,Bv); - } - - static __forceinline Vertex_t eval_du(const Vertex matrix[4][4], const float uu, const float vv) - { - const Vec4f Bu = BezierBasis::derivative(uu); - const Vec4f Bv = BezierBasis::eval(vv); - return bilinear(Bu,matrix,Bv); - } - - static __forceinline Vertex_t eval_dv(const Vertex matrix[4][4], const float uu, const float vv) - { - const Vec4f Bu = BezierBasis::eval(uu); - const Vec4f Bv = BezierBasis::derivative(vv); - return bilinear(Bu,matrix,Bv); - } - - static __forceinline Vertex_t eval_dudu(const Vertex matrix[4][4], const float uu, const float vv) - { - const Vec4f Bu = BezierBasis::derivative2(uu); - const Vec4f Bv = BezierBasis::eval(vv); - return bilinear(Bu,matrix,Bv); - } - - static __forceinline Vertex_t eval_dvdv(const Vertex matrix[4][4], const float uu, const float vv) - { - const Vec4f Bu = BezierBasis::eval(uu); - const Vec4f Bv = BezierBasis::derivative2(vv); - return bilinear(Bu,matrix,Bv); - } - - static __forceinline Vertex_t eval_dudv(const Vertex matrix[4][4], const float uu, const float vv) - { - const Vec4f Bu = BezierBasis::derivative(uu); - const Vec4f Bv = BezierBasis::derivative(vv); - return bilinear(Bu,matrix,Bv); - } - - static __forceinline Vertex_t normal(const Vertex matrix[4][4], const float uu, const float vv) - { - const Vertex_t dPdu = eval_du(matrix,uu,vv); - const Vertex_t dPdv = eval_dv(matrix,uu,vv); - return cross(dPdu,dPdv); - } - - __forceinline Vertex_t normal(const float uu, const float vv) - { - const Vertex_t dPdu = eval_du(matrix,uu,vv); - const Vertex_t dPdv = eval_dv(matrix,uu,vv); - return cross(dPdu,dPdv); - } - - __forceinline Vertex_t eval(const float uu, const float vv) const { - return eval(matrix,uu,vv); - } - - __forceinline Vertex_t eval_du(const float uu, const float vv) const { - return eval_du(matrix,uu,vv); - } - - __forceinline Vertex_t eval_dv(const float uu, const float vv) const { - return eval_dv(matrix,uu,vv); - } - - __forceinline Vertex_t eval_dudu(const float uu, const float vv) const { - return eval_dudu(matrix,uu,vv); - } - - __forceinline Vertex_t eval_dvdv(const float uu, const float vv) const { - return eval_dvdv(matrix,uu,vv); - } - - __forceinline Vertex_t eval_dudv(const float uu, const float vv) const { - return eval_dudv(matrix,uu,vv); - } - - __forceinline void eval(const float u, const float v, Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, const float dscale = 1.0f) const - { - if (P) { - *P = eval(u,v); - } - if (dPdu) { - assert(dPdu); *dPdu = eval_du(u,v)*dscale; - assert(dPdv); *dPdv = eval_dv(u,v)*dscale; - } - if (ddPdudu) { - assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); - assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); - assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); - } - } - - template - __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4& u_n, const Vec4& v_n) const - { - const vfloat curve0_x = v_n[0] * vfloat(matrix[0][0][i]) + v_n[1] * vfloat(matrix[1][0][i]) + v_n[2] * vfloat(matrix[2][0][i]) + v_n[3] * vfloat(matrix[3][0][i]); - const vfloat curve1_x = v_n[0] * vfloat(matrix[0][1][i]) + v_n[1] * vfloat(matrix[1][1][i]) + v_n[2] * vfloat(matrix[2][1][i]) + v_n[3] * vfloat(matrix[3][1][i]); - const vfloat curve2_x = v_n[0] * vfloat(matrix[0][2][i]) + v_n[1] * vfloat(matrix[1][2][i]) + v_n[2] * vfloat(matrix[2][2][i]) + v_n[3] * vfloat(matrix[3][2][i]); - const vfloat curve3_x = v_n[0] * vfloat(matrix[0][3][i]) + v_n[1] * vfloat(matrix[1][3][i]) + v_n[2] * vfloat(matrix[2][3][i]) + v_n[3] * vfloat(matrix[3][3][i]); - return u_n[0] * curve0_x + u_n[1] * curve1_x + u_n[2] * curve2_x + u_n[3] * curve3_x; - } - - template - __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, - float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, - const float dscale, const size_t dstride, const size_t N) const - { - if (P) { - const Vec4 u_n = BezierBasis::eval(uu); - const Vec4 v_n = BezierBasis::eval(vv); - for (size_t i=0; i u_n = BezierBasis::derivative(uu); - const Vec4 v_n = BezierBasis::eval(vv); - for (size_t i=0; i u_n = BezierBasis::eval(uu); - const Vec4 v_n = BezierBasis::derivative(vv); - for (size_t i=0; i u_n = BezierBasis::derivative2(uu); - const Vec4 v_n = BezierBasis::eval(vv); - for (size_t i=0; i u_n = BezierBasis::eval(uu); - const Vec4 v_n = BezierBasis::derivative2(vv); - for (size_t i=0; i u_n = BezierBasis::derivative(uu); - const Vec4 v_n = BezierBasis::derivative(vv); - for (size_t i=0; i - static __forceinline Vec3 eval(const Vertex matrix[4][4], const T& uu, const T& vv) - { - const T one_minus_uu = 1.0f - uu; - const T one_minus_vv = 1.0f - vv; - - const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu; - const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv; - const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu); - const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv); - const T B2_u = 3.0f * (uu * one_minus_uu * uu); - const T B2_v = 3.0f * (vv * one_minus_vv * vv); - const T B3_u = uu * uu * uu; - const T B3_v = vv * vv * vv; - - const T x = - madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u*matrix[0][3].x))), - madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,matrix[1][1].x,madd(B2_u,matrix[1][2].x,B3_u*matrix[1][3].x))), - madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,matrix[2][1].x,madd(B2_u,matrix[2][2].x,B3_u*matrix[2][3].x))), - B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u*matrix[3][3].x)))))); - - const T y = - madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u*matrix[0][3].y))), - madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,matrix[1][1].y,madd(B2_u,matrix[1][2].y,B3_u*matrix[1][3].y))), - madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,matrix[2][1].y,madd(B2_u,matrix[2][2].y,B3_u*matrix[2][3].y))), - B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u*matrix[3][3].y)))))); - - const T z = - madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u*matrix[0][3].z))), - madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,matrix[1][1].z,madd(B2_u,matrix[1][2].z,B3_u*matrix[1][3].z))), - madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,matrix[2][1].z,madd(B2_u,matrix[2][2].z,B3_u*matrix[2][3].z))), - B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u*matrix[3][3].z)))))); - - return Vec3(x,y,z); - } - - template - __forceinline Vec3 eval(const vfloat& uu, const vfloat& vv) const { - return eval(matrix,uu,vv); - } - - template - static __forceinline Vec3 normal(const Vertex matrix[4][4], const T& uu, const T& vv) - { - - const Vec3 matrix_00 = Vec3(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z); - const Vec3 matrix_01 = Vec3(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z); - const Vec3 matrix_02 = Vec3(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z); - const Vec3 matrix_03 = Vec3(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z); - - const Vec3 matrix_10 = Vec3(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z); - const Vec3 matrix_11 = Vec3(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); - const Vec3 matrix_12 = Vec3(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); - const Vec3 matrix_13 = Vec3(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z); - - const Vec3 matrix_20 = Vec3(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z); - const Vec3 matrix_21 = Vec3(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); - const Vec3 matrix_22 = Vec3(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); - const Vec3 matrix_23 = Vec3(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z); - - const Vec3 matrix_30 = Vec3(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z); - const Vec3 matrix_31 = Vec3(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z); - const Vec3 matrix_32 = Vec3(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z); - const Vec3 matrix_33 = Vec3(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z); - - /* tangentU */ - const Vec3 col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30); - const Vec3 col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31); - const Vec3 col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32); - const Vec3 col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33); - - const Vec3 tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); - - /* tangentV */ - const Vec3 row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03); - const Vec3 row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13); - const Vec3 row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23); - const Vec3 row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33); - - const Vec3 tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); - - /* normal = tangentU x tangentV */ - const Vec3 n = cross(tangentU,tangentV); - return n; - } - - template - __forceinline Vec3 normal(const vfloat& uu, const vfloat& vv) const { - return normal(matrix,uu,vv); - } - }; - - typedef BezierPatchT BezierPatch3fa; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h deleted file mode 100644 index 35748754bd..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "catmullclark_patch.h" -#include "bezier_curve.h" - -namespace embree -{ - template - class __aligned(64) BilinearPatchT - { - typedef CatmullClark1RingT CatmullClarkRing; - typedef CatmullClarkPatchT CatmullClarkPatch; - - public: - Vertex v[4]; - - public: - - __forceinline BilinearPatchT () {} - - __forceinline BilinearPatchT (const HalfEdge* edge, const BufferView& vertices) { - init(edge,vertices.getPtr(),vertices.getStride()); - } - - __forceinline BilinearPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { - init(edge,vertices,stride); - } - - __forceinline void init (const HalfEdge* edge, const char* vertices, size_t stride) - { - v[0] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); - v[1] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); - v[2] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); - v[3] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); - } - - __forceinline BilinearPatchT (const CatmullClarkPatch& patch) - { - v[0] = patch.ring[0].getLimitVertex(); - v[1] = patch.ring[1].getLimitVertex(); - v[2] = patch.ring[2].getLimitVertex(); - v[3] = patch.ring[3].getLimitVertex(); - } - - __forceinline BBox bounds() const - { - - BBox bounds (v[0]); - bounds.extend(v[1]); - bounds.extend(v[2]); - bounds.extend(v[3]); - return bounds; - } - - __forceinline Vertex eval(const float uu, const float vv) const { - return lerp(lerp(v[0],v[1],uu),lerp(v[3],v[2],uu),vv); - } - - __forceinline Vertex eval_du(const float uu, const float vv) const { - return lerp(v[1]-v[0],v[2]-v[3],vv); - } - - __forceinline Vertex eval_dv(const float uu, const float vv) const { - return lerp(v[3]-v[0],v[2]-v[1],uu); - } - - __forceinline Vertex eval_dudu(const float uu, const float vv) const { - return Vertex(zero); - } - - __forceinline Vertex eval_dvdv(const float uu, const float vv) const { - return Vertex(zero); - } - - __forceinline Vertex eval_dudv(const float uu, const float vv) const { - return (v[2]-v[3]) - (v[1]-v[0]); - } - - __forceinline Vertex normal(const float uu, const float vv) const { - return cross(eval_du(uu,vv),eval_dv(uu,vv)); - } - - __forceinline void eval(const float u, const float v, - Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, - const float dscale = 1.0f) const - { - if (P) { - *P = eval(u,v); - } - if (dPdu) { - assert(dPdu); *dPdu = eval_du(u,v)*dscale; - assert(dPdv); *dPdv = eval_dv(u,v)*dscale; - } - if (ddPdudu) { - assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); - assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); - assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); - } - } - - template - __forceinline Vec3 eval(const vfloat& uu, const vfloat& vv) const - { - const vfloat x = lerp(lerp(v[0].x,v[1].x,uu),lerp(v[3].x,v[2].x,uu),vv); - const vfloat y = lerp(lerp(v[0].y,v[1].y,uu),lerp(v[3].y,v[2].y,uu),vv); - const vfloat z = lerp(lerp(v[0].z,v[1].z,uu),lerp(v[3].z,v[2].z,uu),vv); - return Vec3(x,y,z); - } - - template - __forceinline Vec3 eval_du(const vfloat& uu, const vfloat& vv) const - { - const vfloat x = lerp(v[1].x-v[0].x,v[2].x-v[3].x,vv); - const vfloat y = lerp(v[1].y-v[0].y,v[2].y-v[3].y,vv); - const vfloat z = lerp(v[1].z-v[0].z,v[2].z-v[3].z,vv); - return Vec3(x,y,z); - } - - template - __forceinline Vec3 eval_dv(const vfloat& uu, const vfloat& vv) const - { - const vfloat x = lerp(v[3].x-v[0].x,v[2].x-v[1].x,uu); - const vfloat y = lerp(v[3].y-v[0].y,v[2].y-v[1].y,uu); - const vfloat z = lerp(v[3].z-v[0].z,v[2].z-v[1].z,uu); - return Vec3(x,y,z); - } - - template - __forceinline Vec3 normal(const vfloat& uu, const vfloat& vv) const { - return cross(eval_du(uu,vv),eval_dv(uu,vv)); - } - - template - __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv) const { - return lerp(lerp(v[0][i],v[1][i],uu),lerp(v[3][i],v[2][i],uu),vv); - } - - template - __forceinline vfloat eval_du(const size_t i, const vfloat& uu, const vfloat& vv) const { - return lerp(v[1][i]-v[0][i],v[2][i]-v[3][i],vv); - } - - template - __forceinline vfloat eval_dv(const size_t i, const vfloat& uu, const vfloat& vv) const { - return lerp(v[3][i]-v[0][i],v[2][i]-v[1][i],uu); - } - - template - __forceinline vfloat eval_dudu(const size_t i, const vfloat& uu, const vfloat& vv) const { - return vfloat(zero); - } - - template - __forceinline vfloat eval_dvdv(const size_t i, const vfloat& uu, const vfloat& vv) const { - return vfloat(zero); - } - - template - __forceinline vfloat eval_dudv(const size_t i, const vfloat& uu, const vfloat& vv) const { - return (v[2][i]-v[3][i]) - (v[1][i]-v[0][i]); - } - - template - __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, - float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, - const float dscale, const size_t dstride, const size_t N) const - { - if (P) { - for (size_t i=0; i BilinearPatch3fa; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h deleted file mode 100644 index a325667328..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h +++ /dev/null @@ -1,319 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/default.h" -#include "bezier_curve.h" - -namespace embree -{ - class BSplineBasis - { - public: - - template - static __forceinline Vec4 eval(const T& u) - { - const T t = u; - const T s = T(1.0f) - u; - const T n0 = s*s*s; - const T n1 = (4.0f*(s*s*s)+(t*t*t)) + (12.0f*((s*t)*s) + 6.0f*((t*s)*t)); - const T n2 = (4.0f*(t*t*t)+(s*s*s)) + (12.0f*((t*s)*t) + 6.0f*((s*t)*s)); - const T n3 = t*t*t; - return T(1.0f/6.0f)*Vec4(n0,n1,n2,n3); - } - - template - static __forceinline Vec4 derivative(const T& u) - { - const T t = u; - const T s = 1.0f - u; - const T n0 = -s*s; - const T n1 = -t*t - 4.0f*(t*s); - const T n2 = s*s + 4.0f*(s*t); - const T n3 = t*t; - return T(0.5f)*Vec4(n0,n1,n2,n3); - } - - template - static __forceinline Vec4 derivative2(const T& u) - { - const T t = u; - const T s = 1.0f - u; - const T n0 = s; - const T n1 = t - 2.0f*s; - const T n2 = s - 2.0f*t; - const T n3 = t; - return Vec4(n0,n1,n2,n3); - } - }; - - struct PrecomputedBSplineBasis - { - enum { N = 16 }; - public: - PrecomputedBSplineBasis() {} - PrecomputedBSplineBasis(int shift); - - /* basis for bspline evaluation */ - public: - float c0[N+1][N+1]; - float c1[N+1][N+1]; - float c2[N+1][N+1]; - float c3[N+1][N+1]; - - /* basis for bspline derivative evaluation */ - public: - float d0[N+1][N+1]; - float d1[N+1][N+1]; - float d2[N+1][N+1]; - float d3[N+1][N+1]; - }; - extern PrecomputedBSplineBasis bspline_basis0; - extern PrecomputedBSplineBasis bspline_basis1; - - template - struct BSplineCurveT - { - Vertex v0,v1,v2,v3; - - __forceinline BSplineCurveT() {} - - __forceinline BSplineCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) - : v0(v0), v1(v1), v2(v2), v3(v3) {} - - __forceinline Vertex begin() const { - return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); - } - - __forceinline Vertex end() const { - return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); - } - - __forceinline Vertex center() const { - return 0.25f*(v0+v1+v2+v3); - } - - __forceinline BBox bounds() const { - return merge(BBox(v0),BBox(v1),BBox(v2),BBox(v3)); - } - - __forceinline friend BSplineCurveT operator -( const BSplineCurveT& a, const Vertex& b ) { - return BSplineCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b); - } - - __forceinline BSplineCurveT xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const - { - const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w); - const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w); - const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w); - const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w); - return BSplineCurveT(q0,q1,q2,q3); - } - - __forceinline Vertex eval(const float t) const - { - const Vec4 b = BSplineBasis::eval(t); - return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); - } - - __forceinline Vertex eval_du(const float t) const - { - const Vec4 b = BSplineBasis::derivative(t); - return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); - } - - __forceinline Vertex eval_dudu(const float t) const - { - const Vec4 b = BSplineBasis::derivative2(t); - return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); - } - - __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const - { - p = eval(t); - dp = eval_du(t); - ddp = eval_dudu(t); - } - - template - __forceinline Vec4vf veval(const vfloat& t) const - { - const Vec4vf b = BSplineBasis::eval(t); - return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf veval_du(const vfloat& t) const - { - const Vec4vf b = BSplineBasis::derivative(t); - return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf veval_dudu(const vfloat& t) const - { - const Vec4vf b = BSplineBasis::derivative2(t); - return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); - } - - template - __forceinline void veval(const vfloat& t, Vec4vf& p, Vec4vf& dp) const - { - p = veval(t); - dp = veval_du(t); - } - - template - __forceinline Vec4vf eval0(const int ofs, const int size) const - { - assert(size <= PrecomputedBSplineBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&bspline_basis0.c0[size][ofs]), Vec4vf(v0), - madd(vfloat::loadu(&bspline_basis0.c1[size][ofs]), Vec4vf(v1), - madd(vfloat::loadu(&bspline_basis0.c2[size][ofs]), Vec4vf(v2), - vfloat::loadu(&bspline_basis0.c3[size][ofs]) * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf eval1(const int ofs, const int size) const - { - assert(size <= PrecomputedBSplineBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&bspline_basis1.c0[size][ofs]), Vec4vf(v0), - madd(vfloat::loadu(&bspline_basis1.c1[size][ofs]), Vec4vf(v1), - madd(vfloat::loadu(&bspline_basis1.c2[size][ofs]), Vec4vf(v2), - vfloat::loadu(&bspline_basis1.c3[size][ofs]) * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf derivative0(const int ofs, const int size) const - { - assert(size <= PrecomputedBSplineBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&bspline_basis0.d0[size][ofs]), Vec4vf(v0), - madd(vfloat::loadu(&bspline_basis0.d1[size][ofs]), Vec4vf(v1), - madd(vfloat::loadu(&bspline_basis0.d2[size][ofs]), Vec4vf(v2), - vfloat::loadu(&bspline_basis0.d3[size][ofs]) * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf derivative1(const int ofs, const int size) const - { - assert(size <= PrecomputedBSplineBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&bspline_basis1.d0[size][ofs]), Vec4vf(v0), - madd(vfloat::loadu(&bspline_basis1.d1[size][ofs]), Vec4vf(v1), - madd(vfloat::loadu(&bspline_basis1.d2[size][ofs]), Vec4vf(v2), - vfloat::loadu(&bspline_basis1.d3[size][ofs]) * Vec4vf(v3)))); - } - - /* calculates bounds of bspline curve geometry */ - __forceinline BBox3fa accurateRoundBounds() const - { - const int N = 7; - const float scale = 1.0f/(3.0f*(N-1)); - Vec4vfx pl(pos_inf), pu(neg_inf); - for (int i=0; i<=N; i+=VSIZEX) - { - vintx vi = vintx(i)+vintx(step); - vboolx valid = vi <= vintx(N); - const Vec4vfx p = eval0(i,N); - const Vec4vfx dp = derivative0(i,N); - const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); - const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); - pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min - pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min - } - const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); - const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); - const float r_min = reduce_min(pl.w); - const float r_max = reduce_max(pu.w); - const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); - return enlarge(BBox3fa(lower,upper),upper_r); - } - - /* calculates bounds when tessellated into N line segments */ - __forceinline BBox3fa accurateFlatBounds(int N) const - { - if (likely(N == 4)) - { - const Vec4vf4 pi = eval0<4>(0,4); - const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); - const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); - const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); - const Vec3ff pe = end(); - return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w)))); - } - else - { - Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); - for (int i=0; i<=N; i+=VSIZEX) - { - vboolx valid = vintx(i)+vintx(step) <= vintx(N); - const Vec4vfx pi = eval0(i,N); - - pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min - pl.y = select(valid,min(pl.y,pi.y),pl.y); - pl.z = select(valid,min(pl.z,pi.z),pl.z); - - pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min - pu.y = select(valid,max(pu.y,pi.y),pu.y); - pu.z = select(valid,max(pu.z,pi.z),pu.z); - - ru = select(valid,max(ru,abs(pi.w)),ru); - } - const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); - const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); - const Vec3fa upper_r(reduce_max(ru)); - return enlarge(BBox3fa(lower,upper),upper_r); - } - } - - friend __forceinline embree_ostream operator<<(embree_ostream cout, const BSplineCurveT& curve) { - return cout << "BSplineCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; - } - }; - - template - __forceinline void convert(const BezierCurveT& icurve, BezierCurveT& ocurve) { - ocurve = icurve; - } - - template - __forceinline void convert(const BSplineCurveT& icurve, BSplineCurveT& ocurve) { - ocurve = icurve; - } - - template - __forceinline void convert(const BezierCurveT& icurve, BSplineCurveT& ocurve) - { - const Vertex v0 = madd(6.0f,icurve.v0,madd(-7.0f,icurve.v1,2.0f*icurve.v2)); - const Vertex v1 = msub(2.0f,icurve.v1,icurve.v2); - const Vertex v2 = msub(2.0f,icurve.v2,icurve.v1); - const Vertex v3 = madd(2.0f,icurve.v1,madd(-7.0f,icurve.v2,6.0f*icurve.v3)); - ocurve = BSplineCurveT(v0,v1,v2,v3); - } - - template - __forceinline void convert(const BSplineCurveT& icurve, BezierCurveT& ocurve) - { - const Vertex v0 = madd(1.0f/6.0f,icurve.v0,madd(2.0f/3.0f,icurve.v1,1.0f/6.0f*icurve.v2)); - const Vertex v1 = madd(2.0f/3.0f,icurve.v1,1.0f/3.0f*icurve.v2); - const Vertex v2 = madd(1.0f/3.0f,icurve.v1,2.0f/3.0f*icurve.v2); - const Vertex v3 = madd(1.0f/6.0f,icurve.v1,madd(2.0f/3.0f,icurve.v2,1.0f/6.0f*icurve.v3)); - ocurve = BezierCurveT(v0,v1,v2,v3); - } - - __forceinline BSplineCurveT enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const BSplineCurveT& curve) - { - return BSplineCurveT(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), - enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), - enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), - enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); - } - - typedef BSplineCurveT BSplineCurve3fa; -} - diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h deleted file mode 100644 index 9769bc17bd..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h +++ /dev/null @@ -1,449 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "catmullclark_patch.h" -#include "bspline_curve.h" - -namespace embree -{ - template - class __aligned(64) BSplinePatchT - { - typedef CatmullClark1RingT CatmullClarkRing; - typedef CatmullClarkPatchT CatmullClarkPatch; - - public: - - __forceinline BSplinePatchT () {} - - __forceinline BSplinePatchT (const CatmullClarkPatch& patch) { - init(patch); - } - - __forceinline BSplinePatchT(const CatmullClarkPatch& patch, - const BezierCurveT* border0, - const BezierCurveT* border1, - const BezierCurveT* border2, - const BezierCurveT* border3) - { - init(patch); - } - - __forceinline BSplinePatchT (const HalfEdge* edge, const char* vertices, size_t stride) { - init(edge,vertices,stride); - } - - __forceinline Vertex hard_corner(const Vertex& v01, const Vertex& v02, - const Vertex& v10, const Vertex& v11, const Vertex& v12, - const Vertex& v20, const Vertex& v21, const Vertex& v22) - { - return 4.0f*v11 - 2.0f*(v12+v21) + v22; - } - - __forceinline Vertex soft_convex_corner( const Vertex& v01, const Vertex& v02, - const Vertex& v10, const Vertex& v11, const Vertex& v12, - const Vertex& v20, const Vertex& v21, const Vertex& v22) - { - return -8.0f*v11 + 4.0f*(v12+v21) + v22; - } - - __forceinline Vertex convex_corner(const float vertex_crease_weight, - const Vertex& v01, const Vertex& v02, - const Vertex& v10, const Vertex& v11, const Vertex& v12, - const Vertex& v20, const Vertex& v21, const Vertex& v22) - { - if (std::isinf(vertex_crease_weight)) return hard_corner(v01,v02,v10,v11,v12,v20,v21,v22); - else return soft_convex_corner(v01,v02,v10,v11,v12,v20,v21,v22); - } - - __forceinline Vertex load(const HalfEdge* edge, const char* vertices, size_t stride) { - return Vertex_t::loadu(vertices+edge->getStartVertexIndex()*stride); - } - - __forceinline void init_border(const CatmullClarkRing& edge0, - Vertex& v01, Vertex& v02, - const Vertex& v11, const Vertex& v12, - const Vertex& v21, const Vertex& v22) - { - if (likely(edge0.has_opposite_back(0))) - { - v01 = edge0.back(2); - v02 = edge0.back(1); - } else { - v01 = 2.0f*v11-v21; - v02 = 2.0f*v12-v22; - } - } - - __forceinline void init_corner(const CatmullClarkRing& edge0, - Vertex& v00, const Vertex& v01, const Vertex& v02, - const Vertex& v10, const Vertex& v11, const Vertex& v12, - const Vertex& v20, const Vertex& v21, const Vertex& v22) - { - const bool MAYBE_UNUSED has_back1 = edge0.has_opposite_back(1); - const bool has_back0 = edge0.has_opposite_back(0); - const bool has_front1 = edge0.has_opposite_front(1); - const bool MAYBE_UNUSED has_front2 = edge0.has_opposite_front(2); - - if (likely(has_back0)) { - if (likely(has_front1)) { assert(has_back1 && has_front2); v00 = edge0.back(3); } - else { assert(!has_back1); v00 = 2.0f*v01-v02; } - } - else { - if (likely(has_front1)) { assert(!has_front2); v00 = 2.0f*v10-v20; } - else v00 = convex_corner(edge0.vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22); - } - } - - void init(const CatmullClarkPatch& patch) - { - /* fill inner vertices */ - const Vertex v11 = v[1][1] = patch.ring[0].vtx; - const Vertex v12 = v[1][2] = patch.ring[1].vtx; - const Vertex v22 = v[2][2] = patch.ring[2].vtx; - const Vertex v21 = v[2][1] = patch.ring[3].vtx; - - /* fill border vertices */ - init_border(patch.ring[0],v[0][1],v[0][2],v11,v12,v21,v22); - init_border(patch.ring[1],v[1][3],v[2][3],v12,v22,v11,v21); - init_border(patch.ring[2],v[3][2],v[3][1],v22,v21,v12,v11); - init_border(patch.ring[3],v[2][0],v[1][0],v21,v11,v22,v12); - - /* fill corner vertices */ - init_corner(patch.ring[0],v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22); - init_corner(patch.ring[1],v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21); - init_corner(patch.ring[2],v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11); - init_corner(patch.ring[3],v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12); - } - - void init_border(const HalfEdge* edge0, const char* vertices, size_t stride, - Vertex& v01, Vertex& v02, - const Vertex& v11, const Vertex& v12, - const Vertex& v21, const Vertex& v22) - { - if (likely(edge0->hasOpposite())) - { - const HalfEdge* e = edge0->opposite()->next()->next(); - v01 = load(e,vertices,stride); - v02 = load(e->next(),vertices,stride); - } else { - v01 = 2.0f*v11-v21; - v02 = 2.0f*v12-v22; - } - } - - void init_corner(const HalfEdge* edge0, const char* vertices, size_t stride, - Vertex& v00, const Vertex& v01, const Vertex& v02, - const Vertex& v10, const Vertex& v11, const Vertex& v12, - const Vertex& v20, const Vertex& v21, const Vertex& v22) - { - const bool has_back0 = edge0->hasOpposite(); - const bool has_front1 = edge0->prev()->hasOpposite(); - - if (likely(has_back0)) - { - const HalfEdge* e = edge0->opposite()->next(); - if (likely(has_front1)) - { - assert(e->hasOpposite()); - assert(edge0->prev()->opposite()->prev()->hasOpposite()); - v00 = load(e->opposite()->prev(),vertices,stride); - } - else { - assert(!e->hasOpposite()); - v00 = 2.0f*v01-v02; - } - } - else - { - if (likely(has_front1)) { - assert(!edge0->prev()->opposite()->prev()->hasOpposite()); - v00 = 2.0f*v10-v20; - } - else { - assert(edge0->vertex_crease_weight == 0.0f || std::isinf(edge0->vertex_crease_weight)); - v00 = convex_corner(edge0->vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22); - } - } - } - - void init(const HalfEdge* edge0, const char* vertices, size_t stride) - { - assert( edge0->isRegularFace() ); - - /* fill inner vertices */ - const Vertex v11 = v[1][1] = load(edge0,vertices,stride); const HalfEdge* edge1 = edge0->next(); - const Vertex v12 = v[1][2] = load(edge1,vertices,stride); const HalfEdge* edge2 = edge1->next(); - const Vertex v22 = v[2][2] = load(edge2,vertices,stride); const HalfEdge* edge3 = edge2->next(); - const Vertex v21 = v[2][1] = load(edge3,vertices,stride); assert(edge0 == edge3->next()); - - /* fill border vertices */ - init_border(edge0,vertices,stride,v[0][1],v[0][2],v11,v12,v21,v22); - init_border(edge1,vertices,stride,v[1][3],v[2][3],v12,v22,v11,v21); - init_border(edge2,vertices,stride,v[3][2],v[3][1],v22,v21,v12,v11); - init_border(edge3,vertices,stride,v[2][0],v[1][0],v21,v11,v22,v12); - - /* fill corner vertices */ - init_corner(edge0,vertices,stride,v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22); - init_corner(edge1,vertices,stride,v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21); - init_corner(edge2,vertices,stride,v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11); - init_corner(edge3,vertices,stride,v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12); - } - - __forceinline BBox bounds() const - { - const Vertex* const cv = &v[0][0]; - BBox bounds (cv[0]); - for (size_t i=1; i<16 ; i++) - bounds.extend( cv[i] ); - return bounds; - } - - __forceinline Vertex eval(const float uu, const float vv) const - { - const Vec4f v_n = BSplineBasis::eval(vv); - const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); - const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); - const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); - const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); - - const Vec4f u_n = BSplineBasis::eval(uu); - return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); - } - - __forceinline Vertex eval_du(const float uu, const float vv) const - { - const Vec4f v_n = BSplineBasis::eval(vv); - const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); - const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); - const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); - const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); - - const Vec4f u_n = BSplineBasis::derivative(uu); - return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); - } - - __forceinline Vertex eval_dv(const float uu, const float vv) const - { - const Vec4f v_n = BSplineBasis::derivative(vv); - const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); - const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); - const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); - const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); - - const Vec4f u_n = BSplineBasis::eval(uu); - return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); - } - - __forceinline Vertex eval_dudu(const float uu, const float vv) const - { - const Vec4f v_n = BSplineBasis::eval(vv); - const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); - const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); - const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); - const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); - - const Vec4f u_n = BSplineBasis::derivative2(uu); - return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); - } - - __forceinline Vertex eval_dvdv(const float uu, const float vv) const - { - const Vec4f v_n = BSplineBasis::derivative2(vv); - const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); - const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); - const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); - const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); - - const Vec4f u_n = BSplineBasis::eval(uu); - return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); - } - - __forceinline Vertex eval_dudv(const float uu, const float vv) const - { - const Vec4f v_n = BSplineBasis::derivative(vv); - const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); - const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); - const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); - const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); - - const Vec4f u_n = BSplineBasis::derivative(uu); - return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); - } - - __forceinline Vertex normal(const float uu, const float vv) const - { - const Vertex tu = eval_du(uu,vv); - const Vertex tv = eval_dv(uu,vv); - return cross(tu,tv); - } - - template - __forceinline Vec3 eval(const T& uu, const T& vv, const Vec4& u_n, const Vec4& v_n) const - { - const T curve0_x = madd(v_n[0],T(v[0][0].x),madd(v_n[1],T(v[1][0].x),madd(v_n[2],T(v[2][0].x),v_n[3] * T(v[3][0].x)))); - const T curve1_x = madd(v_n[0],T(v[0][1].x),madd(v_n[1],T(v[1][1].x),madd(v_n[2],T(v[2][1].x),v_n[3] * T(v[3][1].x)))); - const T curve2_x = madd(v_n[0],T(v[0][2].x),madd(v_n[1],T(v[1][2].x),madd(v_n[2],T(v[2][2].x),v_n[3] * T(v[3][2].x)))); - const T curve3_x = madd(v_n[0],T(v[0][3].x),madd(v_n[1],T(v[1][3].x),madd(v_n[2],T(v[2][3].x),v_n[3] * T(v[3][3].x)))); - const T x = madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); - - const T curve0_y = madd(v_n[0],T(v[0][0].y),madd(v_n[1],T(v[1][0].y),madd(v_n[2],T(v[2][0].y),v_n[3] * T(v[3][0].y)))); - const T curve1_y = madd(v_n[0],T(v[0][1].y),madd(v_n[1],T(v[1][1].y),madd(v_n[2],T(v[2][1].y),v_n[3] * T(v[3][1].y)))); - const T curve2_y = madd(v_n[0],T(v[0][2].y),madd(v_n[1],T(v[1][2].y),madd(v_n[2],T(v[2][2].y),v_n[3] * T(v[3][2].y)))); - const T curve3_y = madd(v_n[0],T(v[0][3].y),madd(v_n[1],T(v[1][3].y),madd(v_n[2],T(v[2][3].y),v_n[3] * T(v[3][3].y)))); - const T y = madd(u_n[0],curve0_y,madd(u_n[1],curve1_y,madd(u_n[2],curve2_y,u_n[3] * curve3_y))); - - const T curve0_z = madd(v_n[0],T(v[0][0].z),madd(v_n[1],T(v[1][0].z),madd(v_n[2],T(v[2][0].z),v_n[3] * T(v[3][0].z)))); - const T curve1_z = madd(v_n[0],T(v[0][1].z),madd(v_n[1],T(v[1][1].z),madd(v_n[2],T(v[2][1].z),v_n[3] * T(v[3][1].z)))); - const T curve2_z = madd(v_n[0],T(v[0][2].z),madd(v_n[1],T(v[1][2].z),madd(v_n[2],T(v[2][2].z),v_n[3] * T(v[3][2].z)))); - const T curve3_z = madd(v_n[0],T(v[0][3].z),madd(v_n[1],T(v[1][3].z),madd(v_n[2],T(v[2][3].z),v_n[3] * T(v[3][3].z)))); - const T z = madd(u_n[0],curve0_z,madd(u_n[1],curve1_z,madd(u_n[2],curve2_z,u_n[3] * curve3_z))); - - return Vec3(x,y,z); - } - - template - __forceinline Vec3 eval(const T& uu, const T& vv) const - { - const Vec4 u_n = BSplineBasis::eval(uu); - const Vec4 v_n = BSplineBasis::eval(vv); - return eval(uu,vv,u_n,v_n); - } - - template - __forceinline Vec3 eval_du(const T& uu, const T& vv) const - { - const Vec4 u_n = BSplineBasis::derivative(uu); - const Vec4 v_n = BSplineBasis::eval(vv); - return eval(uu,vv,u_n,v_n); - } - - template - __forceinline Vec3 eval_dv(const T& uu, const T& vv) const - { - const Vec4 u_n = BSplineBasis::eval(uu); - const Vec4 v_n = BSplineBasis::derivative(vv); - return eval(uu,vv,u_n,v_n); - } - - template - __forceinline Vec3 eval_dudu(const T& uu, const T& vv) const - { - const Vec4 u_n = BSplineBasis::derivative2(uu); - const Vec4 v_n = BSplineBasis::eval(vv); - return eval(uu,vv,u_n,v_n); - } - - template - __forceinline Vec3 eval_dvdv(const T& uu, const T& vv) const - { - const Vec4 u_n = BSplineBasis::eval(uu); - const Vec4 v_n = BSplineBasis::derivative2(vv); - return eval(uu,vv,u_n,v_n); - } - - template - __forceinline Vec3 eval_dudv(const T& uu, const T& vv) const - { - const Vec4 u_n = BSplineBasis::derivative(uu); - const Vec4 v_n = BSplineBasis::derivative(vv); - return eval(uu,vv,u_n,v_n); - } - - template - __forceinline Vec3 normal(const T& uu, const T& vv) const { - return cross(eval_du(uu,vv),eval_dv(uu,vv)); - } - - void eval(const float u, const float v, - Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, - const float dscale = 1.0f) const - { - if (P) { - *P = eval(u,v); - } - if (dPdu) { - assert(dPdu); *dPdu = eval_du(u,v)*dscale; - assert(dPdv); *dPdv = eval_dv(u,v)*dscale; - } - if (ddPdudu) { - assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); - assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); - assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); - } - } - - template - __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4& u_n, const Vec4& v_n) const - { - const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i])))); - const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(v[1][1][i]),madd(v_n[2],vfloat(v[2][1][i]),v_n[3] * vfloat(v[3][1][i])))); - const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(v[1][2][i]),madd(v_n[2],vfloat(v[2][2][i]),v_n[3] * vfloat(v[3][2][i])))); - const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i])))); - return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); - } - - template - void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, - float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, - const float dscale, const size_t dstride, const size_t N) const - { - if (P) { - const Vec4 u_n = BSplineBasis::eval(uu); - const Vec4 v_n = BSplineBasis::eval(vv); - for (size_t i=0; i u_n = BSplineBasis::derivative(uu); - const Vec4 v_n = BSplineBasis::eval(vv); - for (size_t i=0; i u_n = BSplineBasis::eval(uu); - const Vec4 v_n = BSplineBasis::derivative(vv); - for (size_t i=0; i u_n = BSplineBasis::derivative2(uu); - const Vec4 v_n = BSplineBasis::eval(vv); - for (size_t i=0; i u_n = BSplineBasis::eval(uu); - const Vec4 v_n = BSplineBasis::derivative2(vv); - for (size_t i=0; i u_n = BSplineBasis::derivative(uu); - const Vec4 v_n = BSplineBasis::derivative(vv); - for (size_t i=0; i BSplinePatch3fa; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h deleted file mode 100644 index 05031cf6b9..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/geometry.h" - -namespace embree -{ - static const size_t MAX_PATCH_VALENCE = 16; //!< maximum number of vertices of a patch - static const size_t MAX_RING_FACE_VALENCE = 64; //!< maximum number of faces per ring - static const size_t MAX_RING_EDGE_VALENCE = 2*64; //!< maximum number of edges per ring - - class CatmullClarkPrecomputedCoefficients - { - private: - - float table_cos_2PI_div_n[MAX_RING_FACE_VALENCE+1]; - - float* table_limittangent_a[MAX_RING_FACE_VALENCE+1]; - float* table_limittangent_b[MAX_RING_FACE_VALENCE+1]; - float table_limittangent_c[MAX_RING_FACE_VALENCE+1]; - - __forceinline float set_cos_2PI_div_n(const size_t n) { - if (unlikely(n == 0)) return 1.0f; - return cosf(2.0f*float(pi)/(float)n); - } - - __forceinline float set_limittangent_a(const size_t i, const size_t n) - { - if (unlikely(n == 0)) return 1.0f; - const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); - const float c1 = (1.0f/(float)n + cosf(float(pi)/(float)n) * c0); - return cosf(2.0f*float(pi)*(float)i/(float)n) * c1; - } - - __forceinline float set_limittangent_b(const size_t i, const size_t n) - { - if (unlikely(n == 0)) return 1.0f; - const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); - return cosf((2.0f*float(pi)*i+float(pi))/(float)n) * c0; - } - - __forceinline float set_limittangent_c(const size_t n) - { - if (unlikely(n == 0)) return 1.0f; - return 2.0f/16.0f * (5.0f + cosf(2.0f*float(pi)/(float)n) + cosf(float(pi)/(float)n) * sqrtf(18.0f+2.0f*cosf(2.0f*float(pi)/(float)n))); - } - - public: - - __forceinline float cos_2PI_div_n(const size_t n) - { - if (likely(n <= MAX_RING_FACE_VALENCE)) - return table_cos_2PI_div_n[n]; - else - return set_cos_2PI_div_n(n); - } - - __forceinline float limittangent_a(const size_t i, const size_t n) - { - assert(n <= MAX_RING_FACE_VALENCE); - assert(i < n); - return table_limittangent_a[n][i]; - } - - __forceinline float limittangent_b(const size_t i, const size_t n) - { - assert(n <= MAX_RING_FACE_VALENCE); - assert(i < n); - return table_limittangent_b[n][i]; - } - - __forceinline float limittangent_c(const size_t n) - { - assert(n <= MAX_RING_FACE_VALENCE); - return table_limittangent_c[n]; - } - - static CatmullClarkPrecomputedCoefficients table; - - CatmullClarkPrecomputedCoefficients(); - ~CatmullClarkPrecomputedCoefficients(); - }; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h deleted file mode 100644 index ab1d63594a..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h +++ /dev/null @@ -1,562 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "catmullclark_ring.h" -#include "bezier_curve.h" - -namespace embree -{ - template - class __aligned(64) CatmullClarkPatchT - { - public: - typedef CatmullClark1RingT CatmullClark1Ring; - typedef typename CatmullClark1Ring::Type Type; - - array_t,4> ring; - - public: - __forceinline CatmullClarkPatchT () {} - - __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const char* vertices, size_t stride) { - init(first_half_edge,vertices,stride); - } - - __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView& vertices) { - init(first_half_edge,vertices.getPtr(),vertices.getStride()); - } - - __forceinline void init (const HalfEdge* first_half_edge, const char* vertices, size_t stride) - { - for (unsigned i=0; i<4; i++) - ring[i].init(first_half_edge+i,vertices,stride); - - assert(verify()); - } - - __forceinline size_t bytes() const { - return ring[0].bytes()+ring[1].bytes()+ring[2].bytes()+ring[3].bytes(); - } - - __forceinline void serialize(void* ptr, size_t& ofs) const - { - for (size_t i=0; i<4; i++) - ring[i].serialize((char*)ptr,ofs); - } - - __forceinline void deserialize(void* ptr) - { - size_t ofs = 0; - for (size_t i=0; i<4; i++) - ring[i].deserialize((char*)ptr,ofs); - } - - __forceinline BBox3fa bounds() const - { - BBox3fa bounds (ring[0].bounds()); - for (size_t i=1; i<4; i++) - bounds.extend(ring[i].bounds()); - return bounds; - } - - __forceinline Type type() const - { - const int ty0 = ring[0].type() ^ CatmullClark1Ring::TYPE_CREASES; - const int ty1 = ring[1].type() ^ CatmullClark1Ring::TYPE_CREASES; - const int ty2 = ring[2].type() ^ CatmullClark1Ring::TYPE_CREASES; - const int ty3 = ring[3].type() ^ CatmullClark1Ring::TYPE_CREASES; - return (Type) ((ty0 & ty1 & ty2 & ty3) ^ CatmullClark1Ring::TYPE_CREASES); - } - - __forceinline bool isFinalResolution(float res) const { - return ring[0].isFinalResolution(res) && ring[1].isFinalResolution(res) && ring[2].isFinalResolution(res) && ring[3].isFinalResolution(res); - } - - static __forceinline void init_regular(const CatmullClark1RingT& p0, - const CatmullClark1RingT& p1, - CatmullClark1RingT& dest0, - CatmullClark1RingT& dest1) - { - assert(p1.face_valence > 2); - dest1.vertex_level = dest0.vertex_level = p0.edge_level; - dest1.face_valence = dest0.face_valence = 4; - dest1.edge_valence = dest0.edge_valence = 8; - dest1.border_index = dest0.border_index = -1; - dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; - dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; - - dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; - dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0]; - dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx; - dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4]; - dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1]; - dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2]; - dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; - dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; - - dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; - dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1]; - dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f; - dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; - - if (p0.eval_unique_identifier <= p1.eval_unique_identifier) - { - dest0.eval_start_index = 3; - dest1.eval_start_index = 0; - dest0.eval_unique_identifier = p0.eval_unique_identifier; - dest1.eval_unique_identifier = p0.eval_unique_identifier; - } - else - { - dest0.eval_start_index = 1; - dest1.eval_start_index = 2; - dest0.eval_unique_identifier = p1.eval_unique_identifier; - dest1.eval_unique_identifier = p1.eval_unique_identifier; - } - } - - static __forceinline void init_border(const CatmullClark1RingT &p0, - const CatmullClark1RingT &p1, - CatmullClark1RingT &dest0, - CatmullClark1RingT &dest1) - { - dest1.vertex_level = dest0.vertex_level = p0.edge_level; - dest1.face_valence = dest0.face_valence = 3; - dest1.edge_valence = dest0.edge_valence = 6; - dest0.border_index = 2; - dest1.border_index = 4; - dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; - dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; - - dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; - dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0]; - dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx; - dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy - dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; - dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; - - dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; - dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1]; - dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; - - if (p0.eval_unique_identifier <= p1.eval_unique_identifier) - { - dest0.eval_start_index = 1; - dest1.eval_start_index = 2; - dest0.eval_unique_identifier = p0.eval_unique_identifier; - dest1.eval_unique_identifier = p0.eval_unique_identifier; - } - else - { - dest0.eval_start_index = 2; - dest1.eval_start_index = 0; - dest0.eval_unique_identifier = p1.eval_unique_identifier; - dest1.eval_unique_identifier = p1.eval_unique_identifier; - } - } - - static __forceinline void init_regular(const Vertex_t ¢er, const Vertex_t center_ring[8], const unsigned int offset, CatmullClark1RingT &dest) - { - dest.vertex_level = 0.0f; - dest.face_valence = 4; - dest.edge_valence = 8; - dest.border_index = -1; - dest.vtx = (Vertex_t)center; - dest.vertex_crease_weight = 0.0f; - for (size_t i=0; i<8; i++) - dest.ring[i] = (Vertex_t)center_ring[(offset+i)%8]; - for (size_t i=0; i<4; i++) - dest.crease_weight[i] = 0.0f; - - dest.eval_start_index = (8-offset)>>1; - if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; - assert( dest.eval_start_index < dest.face_valence ); - dest.eval_unique_identifier = 0; - } - - __noinline void subdivide(array_t& patch) const - { - ring[0].subdivide(patch[0].ring[0]); - ring[1].subdivide(patch[1].ring[1]); - ring[2].subdivide(patch[2].ring[2]); - ring[3].subdivide(patch[3].ring[3]); - - patch[0].ring[0].edge_level = 0.5f*ring[0].edge_level; - patch[0].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); - patch[0].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); - patch[0].ring[3].edge_level = 0.5f*ring[3].edge_level; - - patch[1].ring[0].edge_level = 0.5f*ring[0].edge_level; - patch[1].ring[1].edge_level = 0.5f*ring[1].edge_level; - patch[1].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); - patch[1].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); - - patch[2].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); - patch[2].ring[1].edge_level = 0.5f*ring[1].edge_level; - patch[2].ring[2].edge_level = 0.5f*ring[2].edge_level; - patch[2].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); - - patch[3].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); - patch[3].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); - patch[3].ring[2].edge_level = 0.5f*ring[2].edge_level; - patch[3].ring[3].edge_level = 0.5f*ring[3].edge_level; - - const bool regular0 = ring[0].has_last_face() && ring[1].face_valence > 2; - if (likely(regular0)) - init_regular(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]); - else - init_border(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]); - - const bool regular1 = ring[1].has_last_face() && ring[2].face_valence > 2; - if (likely(regular1)) - init_regular(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]); - else - init_border(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]); - - const bool regular2 = ring[2].has_last_face() && ring[3].face_valence > 2; - if (likely(regular2)) - init_regular(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]); - else - init_border(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]); - - const bool regular3 = ring[3].has_last_face() && ring[0].face_valence > 2; - if (likely(regular3)) - init_regular(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]); - else - init_border(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]); - - Vertex_t center = (ring[0].vtx + ring[1].vtx + ring[2].vtx + ring[3].vtx) * 0.25f; - - Vertex_t center_ring[8]; - center_ring[0] = (Vertex_t)patch[3].ring[3].ring[0]; - center_ring[7] = (Vertex_t)patch[3].ring[3].vtx; - center_ring[6] = (Vertex_t)patch[2].ring[2].ring[0]; - center_ring[5] = (Vertex_t)patch[2].ring[2].vtx; - center_ring[4] = (Vertex_t)patch[1].ring[1].ring[0]; - center_ring[3] = (Vertex_t)patch[1].ring[1].vtx; - center_ring[2] = (Vertex_t)patch[0].ring[0].ring[0]; - center_ring[1] = (Vertex_t)patch[0].ring[0].vtx; - - init_regular(center,center_ring,0,patch[0].ring[2]); - init_regular(center,center_ring,2,patch[1].ring[3]); - init_regular(center,center_ring,4,patch[2].ring[0]); - init_regular(center,center_ring,6,patch[3].ring[1]); - - assert(patch[0].verify()); - assert(patch[1].verify()); - assert(patch[2].verify()); - assert(patch[3].verify()); - } - - bool verify() const { - return ring[0].hasValidPositions() && ring[1].hasValidPositions() && ring[2].hasValidPositions() && ring[3].hasValidPositions(); - } - - __forceinline void init( FinalQuad& quad ) const - { - quad.vtx[0] = (Vertex_t)ring[0].vtx; - quad.vtx[1] = (Vertex_t)ring[1].vtx; - quad.vtx[2] = (Vertex_t)ring[2].vtx; - quad.vtx[3] = (Vertex_t)ring[3].vtx; - }; - - friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClarkPatchT &p) - { - o << "CatmullClarkPatch { " << embree_endl; - for (size_t i=0; i<4; i++) - o << "ring" << i << ": " << p.ring[i] << embree_endl; - o << "}" << embree_endl; - return o; - } - }; - - typedef CatmullClarkPatchT CatmullClarkPatch3fa; - - template - class __aligned(64) GeneralCatmullClarkPatchT - { - public: - typedef CatmullClarkPatchT CatmullClarkPatch; - typedef CatmullClark1RingT CatmullClark1Ring; - typedef BezierCurveT BezierCurve; - - static const unsigned SIZE = MAX_PATCH_VALENCE; - DynamicStackArray,8,SIZE> ring; - unsigned N; - - __forceinline GeneralCatmullClarkPatchT () - : N(0) {} - - GeneralCatmullClarkPatchT (const HalfEdge* h, const char* vertices, size_t stride) { - init(h,vertices,stride); - } - - __forceinline GeneralCatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView& vertices) { - init(first_half_edge,vertices.getPtr(),vertices.getStride()); - } - - __forceinline void init (const HalfEdge* h, const char* vertices, size_t stride) - { - unsigned int i = 0; - const HalfEdge* edge = h; - do { - ring[i].init(edge,vertices,stride); - edge = edge->next(); - i++; - } while ((edge != h) && (i < SIZE)); - N = i; - } - - __forceinline unsigned size() const { - return N; - } - - __forceinline bool isQuadPatch() const { - return (N == 4) && ring[0].only_quads && ring[1].only_quads && ring[2].only_quads && ring[3].only_quads; - } - - static __forceinline void init_regular(const CatmullClark1RingT& p0, - const CatmullClark1RingT& p1, - CatmullClark1RingT& dest0, - CatmullClark1RingT& dest1) - { - assert(p1.face_valence > 2); - dest1.vertex_level = dest0.vertex_level = p0.edge_level; - dest1.face_valence = dest0.face_valence = 4; - dest1.edge_valence = dest0.edge_valence = 8; - dest1.border_index = dest0.border_index = -1; - dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; - dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; - - dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; - dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0]; - dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx; - dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4]; - dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1]; - dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2]; - dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; - dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; - - dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; - dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1]; - dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f; - dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; - - if (p0.eval_unique_identifier <= p1.eval_unique_identifier) - { - dest0.eval_start_index = 3; - dest1.eval_start_index = 0; - dest0.eval_unique_identifier = p0.eval_unique_identifier; - dest1.eval_unique_identifier = p0.eval_unique_identifier; - } - else - { - dest0.eval_start_index = 1; - dest1.eval_start_index = 2; - dest0.eval_unique_identifier = p1.eval_unique_identifier; - dest1.eval_unique_identifier = p1.eval_unique_identifier; - } - } - - - static __forceinline void init_border(const CatmullClark1RingT &p0, - const CatmullClark1RingT &p1, - CatmullClark1RingT &dest0, - CatmullClark1RingT &dest1) - { - dest1.vertex_level = dest0.vertex_level = p0.edge_level; - dest1.face_valence = dest0.face_valence = 3; - dest1.edge_valence = dest0.edge_valence = 6; - dest0.border_index = 2; - dest1.border_index = 4; - dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; - dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; - - dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; - dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0]; - dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx; - dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy - dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; - dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; - - dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; - dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1]; - dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; - - if (p0.eval_unique_identifier <= p1.eval_unique_identifier) - { - dest0.eval_start_index = 1; - dest1.eval_start_index = 2; - dest0.eval_unique_identifier = p0.eval_unique_identifier; - dest1.eval_unique_identifier = p0.eval_unique_identifier; - } - else - { - dest0.eval_start_index = 2; - dest1.eval_start_index = 0; - dest0.eval_unique_identifier = p1.eval_unique_identifier; - dest1.eval_unique_identifier = p1.eval_unique_identifier; - } - } - - static __forceinline void init_regular(const Vertex_t ¢er, const array_t& center_ring, const float vertex_level, const unsigned int N, const unsigned int offset, CatmullClark1RingT &dest) - { - assert(N<(MAX_RING_FACE_VALENCE)); - assert(2*N<(MAX_RING_EDGE_VALENCE)); - dest.vertex_level = vertex_level; - dest.face_valence = N; - dest.edge_valence = 2*N; - dest.border_index = -1; - dest.vtx = (Vertex_t)center; - dest.vertex_crease_weight = 0.0f; - for (unsigned i=0; i<2*N; i++) { - dest.ring[i] = (Vertex_t)center_ring[(2*N+offset+i-1)%(2*N)]; - assert(isvalid(dest.ring[i])); - } - for (unsigned i=0; i>1; - if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; - - assert( dest.eval_start_index < dest.face_valence ); - dest.eval_unique_identifier = 0; - } - - __noinline void subdivide(array_t& patch, unsigned& N_o) const - { - N_o = N; - assert( N ); - for (unsigned i=0; i center_ring; - float center_vertex_level = 2.0f; // guarantees that irregular vertices get always isolated also for non-quads - - for (unsigned i=0; i 2; - if (likely(regular)) init_regular(patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); - else init_border (patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); - - assert( patch[i].ring[1].hasValidPositions() ); - assert( patch[ip1].ring[3].hasValidPositions() ); - - float level = 0.25f*(ring[im1].edge_level+ring[ip1].edge_level); - patch[i].ring[1].edge_level = patch[ip1].ring[2].edge_level = level; - center_vertex_level = max(center_vertex_level,level); - - center += ring[i].vtx; - center_ring[2*i+0] = (Vertex_t)patch[i].ring[0].vtx; - center_ring[2*i+1] = (Vertex_t)patch[i].ring[0].ring[0]; - } - center /= float(N); - - for (unsigned int i=0; i& patches) - { - CatmullClark1Ring patches1ring1 = patches[1].ring[1]; - patches[1].ring[1] = patches[1].ring[0]; // FIXME: optimize these assignments - patches[1].ring[0] = patches[1].ring[3]; - patches[1].ring[3] = patches[1].ring[2]; - patches[1].ring[2] = patches1ring1; - - CatmullClark1Ring patches2ring2 = patches[2].ring[2]; - patches[2].ring[2] = patches[2].ring[0]; - patches[2].ring[0] = patches2ring2; - CatmullClark1Ring patches2ring3 = patches[2].ring[3]; - patches[2].ring[3] = patches[2].ring[1]; - patches[2].ring[1] = patches2ring3; - - CatmullClark1Ring patches3ring3 = patches[3].ring[3]; - patches[3].ring[3] = patches[3].ring[0]; - patches[3].ring[0] = patches[3].ring[1]; - patches[3].ring[1] = patches[3].ring[2]; - patches[3].ring[2] = patches3ring3; - } - - __forceinline void getLimitBorder(BezierCurve curves[GeneralCatmullClarkPatchT::SIZE]) const - { - Vertex P0 = ring[0].getLimitVertex(); - for (unsigned i=0; i GeneralCatmullClarkPatch3fa; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h deleted file mode 100644 index 73b41fd4ff..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h +++ /dev/null @@ -1,826 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/geometry.h" -#include "../common/buffer.h" -#include "half_edge.h" -#include "catmullclark_coefficients.h" - -namespace embree -{ - struct __aligned(64) FinalQuad { - Vec3fa vtx[4]; - }; - - template - struct __aligned(64) CatmullClark1RingT - { - ALIGNED_STRUCT_(64); - - int border_index; //!< edge index where border starts - unsigned int face_valence; //!< number of adjacent quad faces - unsigned int edge_valence; //!< number of adjacent edges (2*face_valence) - float vertex_crease_weight; //!< weight of vertex crease (0 if no vertex crease) - DynamicStackArray crease_weight; //!< edge crease weights for each adjacent edge - float vertex_level; //!< maximum level of all adjacent edges - float edge_level; //!< level of first edge - unsigned int eval_start_index; //!< topology dependent index to start evaluation - unsigned int eval_unique_identifier; //!< topology dependent unique identifier for this ring - Vertex vtx; //!< center vertex - DynamicStackArray ring; //!< ring of neighboring vertices - - public: - CatmullClark1RingT () - : eval_start_index(0), eval_unique_identifier(0) {} // FIXME: default constructor should be empty - - /*! calculates number of bytes required to serialize this structure */ - __forceinline size_t bytes() const - { - size_t ofs = 0; - ofs += sizeof(border_index); - ofs += sizeof(face_valence); - assert(2*face_valence == edge_valence); - ofs += sizeof(vertex_crease_weight); - ofs += face_valence*sizeof(float); - ofs += sizeof(vertex_level); - ofs += sizeof(edge_level); - ofs += sizeof(eval_start_index); - ofs += sizeof(eval_unique_identifier); - ofs += sizeof(vtx); - ofs += edge_valence*sizeof(Vertex); - return ofs; - } - - template - static __forceinline void store(char* ptr, size_t& ofs, const Ty& v) { - *(Ty*)&ptr[ofs] = v; ofs += sizeof(Ty); - } - - template - static __forceinline void load(char* ptr, size_t& ofs, Ty& v) { - v = *(Ty*)&ptr[ofs]; ofs += sizeof(Ty); - } - - /*! serializes the ring to some memory location */ - __forceinline void serialize(char* ptr, size_t& ofs) const - { - store(ptr,ofs,border_index); - store(ptr,ofs,face_valence); - store(ptr,ofs,vertex_crease_weight); - for (size_t i=0; ii); - return ring[i]; - } - - __forceinline const Vertex& back(size_t i) const { - assert(edge_valence>=i); - return ring[edge_valence-i]; - } - - __forceinline bool has_last_face() const { - return (size_t)border_index != (size_t)edge_valence-2; - } - - __forceinline bool has_opposite_front(size_t i) const { - return (size_t)border_index != 2*i; - } - - __forceinline bool has_opposite_back(size_t i) const { - return (size_t)border_index != ((size_t)edge_valence-2-2*i); - } - - __forceinline BBox3fa bounds() const - { - BBox3fa bounds ( vtx ); - for (size_t i = 0; igetStartVertexIndex()*stride); - vertex_crease_weight = h->vertex_crease_weight; - - HalfEdge* p = (HalfEdge*) h; - - unsigned i=0; - unsigned min_vertex_index = (unsigned)-1; - unsigned min_vertex_index_face = (unsigned)-1; - edge_level = p->edge_level; - vertex_level = 0.0f; - - do - { - vertex_level = max(vertex_level,p->edge_level); - crease_weight[i/2] = p->edge_crease_weight; - assert(p->hasOpposite() || p->edge_crease_weight == float(inf)); - - /* store first two vertices of face */ - p = p->next(); - const unsigned index0 = p->getStartVertexIndex(); - ring[i++] = Vertex_t::loadu(vertices+index0*stride); - if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; } - p = p->next(); - - const unsigned index1 = p->getStartVertexIndex(); - ring[i++] = Vertex_t::loadu(vertices+index1*stride); - p = p->next(); - - /* continue with next face */ - if (likely(p->hasOpposite())) - p = p->opposite(); - - /* if there is no opposite go the long way to the other side of the border */ - else - { - /* find minimum start vertex */ - const unsigned index0 = p->getStartVertexIndex(); - if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; } - - /*! mark first border edge and store dummy vertex for face between the two border edges */ - border_index = i; - crease_weight[i/2] = inf; - ring[i++] = Vertex_t::loadu(vertices+index0*stride); - ring[i++] = vtx; // dummy vertex - - /*! goto other side of border */ - p = (HalfEdge*) h; - while (p->hasOpposite()) - p = p->opposite()->next(); - } - - } while (p != h); - - edge_valence = i; - face_valence = i >> 1; - eval_unique_identifier = min_vertex_index; - eval_start_index = min_vertex_index_face; - - assert( hasValidPositions() ); - } - - __forceinline void subdivide(CatmullClark1RingT& dest) const - { - dest.edge_level = 0.5f*edge_level; - dest.vertex_level = 0.5f*vertex_level; - dest.face_valence = face_valence; - dest.edge_valence = edge_valence; - dest.border_index = border_index; - dest.vertex_crease_weight = max(0.0f,vertex_crease_weight-1.0f); - dest.eval_start_index = eval_start_index; - dest.eval_unique_identifier = eval_unique_identifier; - - /* calculate face points */ - Vertex_t S = Vertex_t(0.0f); - for (size_t i=0; i= face_valence) face_index -= face_valence; assert(face_index < face_valence); - size_t index0 = 2*face_index+0; if (index0 >= edge_valence) index0 -= edge_valence; assert(index0 < edge_valence); - size_t index1 = 2*face_index+1; if (index1 >= edge_valence) index1 -= edge_valence; assert(index1 < edge_valence); - size_t index2 = 2*face_index+2; if (index2 >= edge_valence) index2 -= edge_valence; assert(index2 < edge_valence); - S += dest.ring[index1] = ((vtx + ring[index1]) + (ring[index0] + ring[index2])) * 0.25f; - } - - /* calculate new edge points */ - size_t num_creases = 0; - array_t crease_id; - - for (size_t i=0; i= face_valence) face_index -= face_valence; - const float edge_crease = crease_weight[face_index]; - dest.crease_weight[face_index] = max(edge_crease-1.0f,0.0f); - - size_t index = 2*face_index; - size_t prev_index = face_index == 0 ? edge_valence-1 : 2*face_index-1; - size_t next_index = 2*face_index+1; - - const Vertex_t v = vtx + ring[index]; - const Vertex_t f = dest.ring[prev_index] + dest.ring[next_index]; - S += ring[index]; - - /* fast path for regular edge points */ - if (likely(edge_crease <= 0.0f)) { - dest.ring[index] = (v+f) * 0.25f; - } - - /* slower path for hard edge rule */ - else { - crease_id[num_creases++] = face_index; - dest.ring[index] = v*0.5f; - - /* even slower path for blended edge rule */ - if (unlikely(edge_crease < 1.0f)) { - dest.ring[index] = lerp((v+f)*0.25f,v*0.5f,edge_crease); - } - } - } - - /* compute new vertex using smooth rule */ - const float inv_face_valence = 1.0f / (float)face_valence; - const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence; - dest.vtx = v_smooth; - - /* compute new vertex using vertex_crease_weight rule */ - if (unlikely(vertex_crease_weight > 0.0f)) - { - if (vertex_crease_weight >= 1.0f) { - dest.vtx = vtx; - } else { - dest.vtx = lerp(v_smooth,vtx,vertex_crease_weight); - } - return; - } - - /* no edge crease rule and dart rule */ - if (likely(num_creases <= 1)) - return; - - /* compute new vertex using crease rule */ - if (likely(num_creases == 2)) - { - /* update vertex using crease rule */ - const size_t crease0 = crease_id[0], crease1 = crease_id[1]; - const Vertex_t v_sharp = (Vertex_t)(ring[2*crease0] + 6.0f*vtx + ring[2*crease1]) * (1.0f / 8.0f); - dest.vtx = v_sharp; - - /* update crease_weights using chaikin rule */ - const float crease_weight0 = crease_weight[crease0], crease_weight1 = crease_weight[crease1]; - dest.crease_weight[crease0] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f); - dest.crease_weight[crease1] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f); - - /* interpolate between sharp and smooth rule */ - const float v_blend = 0.5f*(crease_weight0+crease_weight1); - if (unlikely(v_blend < 1.0f)) { - dest.vtx = lerp(v_smooth,v_sharp,v_blend); - } - } - - /* compute new vertex using corner rule */ - else { - dest.vtx = vtx; - } - } - - __forceinline bool isRegular1() const - { - if (border_index == -1) { - if (face_valence == 4) return true; - } else { - if (face_valence < 4) return true; - } - return false; - } - - __forceinline size_t numEdgeCreases() const - { - ssize_t numCreases = 0; - for (size_t i=0; i 0.0f; - } - return numCreases; - } - - enum Type { - TYPE_NONE = 0, //!< invalid type - TYPE_REGULAR = 1, //!< regular patch when ignoring creases - TYPE_REGULAR_CREASES = 2, //!< regular patch when considering creases - TYPE_GREGORY = 4, //!< gregory patch when ignoring creases - TYPE_GREGORY_CREASES = 8, //!< gregory patch when considering creases - TYPE_CREASES = 16 //!< patch has crease features - }; - - __forceinline Type type() const - { - /* check if there is an edge crease anywhere */ - const size_t numCreases = numEdgeCreases(); - const bool noInnerCreases = hasBorder() ? numCreases == 2 : numCreases == 0; - - Type crease_mask = (Type) (TYPE_REGULAR | TYPE_GREGORY); - if (noInnerCreases ) crease_mask = (Type) (crease_mask | TYPE_REGULAR_CREASES | TYPE_GREGORY_CREASES); - if (numCreases != 0) crease_mask = (Type) (crease_mask | TYPE_CREASES); - - /* calculate if this vertex is regular */ - bool hasBorder = border_index != -1; - if (face_valence == 2 && hasBorder) { - if (vertex_crease_weight == 0.0f ) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); - else if (vertex_crease_weight == float(inf)) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); - else return TYPE_CREASES; - } - else if (vertex_crease_weight != 0.0f) return TYPE_CREASES; - else if (face_valence == 3 && hasBorder) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); - else if (face_valence == 4 && !hasBorder) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); - else return (Type) (crease_mask & (TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); - } - - __forceinline bool isFinalResolution(float res) const { - return vertex_level <= res; - } - - /* computes the limit vertex */ - __forceinline Vertex getLimitVertex() const - { - /* return hard corner */ - if (unlikely(std::isinf(vertex_crease_weight))) - return vtx; - - /* border vertex rule */ - if (unlikely(border_index != -1)) - { - const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; - return (4.0f * vtx + (ring[border_index] + ring[second_border_index])) * 1.0f/6.0f; - } - - Vertex_t F( 0.0f ); - Vertex_t E( 0.0f ); - - assert(eval_start_index < face_valence); - - for (size_t i=0; i= face_valence) index -= face_valence; - F += ring[2*index+1]; - E += ring[2*index]; - } - - const float n = (float)face_valence; - return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n); - } - - /* gets limit tangent in the direction of egde vtx -> ring[0] */ - __forceinline Vertex getLimitTangent() const - { - if (unlikely(std::isinf(vertex_crease_weight))) - return ring[0] - vtx; - - /* border vertex rule */ - if (unlikely(border_index != -1)) - { - if (border_index != (int)edge_valence-2 ) { - return ring[0] - vtx; - } - else - { - const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; - return (ring[second_border_index] - ring[border_index]) * 0.5f; - } - } - - Vertex_t alpha( 0.0f ); - Vertex_t beta ( 0.0f ); - - const size_t n = face_valence; - - assert(eval_start_index < face_valence); - - Vertex_t q( 0.0f ); - for (size_t i=0; i= face_valence) index -= face_valence; - const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(index,n); - const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(index,n); - alpha += a * ring[2*index]; - beta += b * ring[2*index+1]; - } - - const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n); - return sigma * (alpha + beta); - } - - /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ - __forceinline Vertex getSecondLimitTangent() const - { - if (unlikely(std::isinf(vertex_crease_weight))) - return ring[2] - vtx; - - /* border vertex rule */ - if (unlikely(border_index != -1)) - { - if (border_index != 2) { - return ring[2] - vtx; - } - else { - const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; - return (ring[border_index] - ring[second_border_index]) * 0.5f; - } - } - - Vertex_t alpha( 0.0f ); - Vertex_t beta ( 0.0f ); - - const size_t n = face_valence; - - assert(eval_start_index < face_valence); - - for (size_t i=0; i= face_valence) index -= face_valence; - - size_t prev_index = index == 0 ? face_valence-1 : index-1; // need to be bit-wise exact in cosf eval - const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(prev_index,n); - const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(prev_index,n); - alpha += a * ring[2*index]; - beta += b * ring[2*index+1]; - } - - const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n); - return sigma* (alpha + beta); - } - - /* gets surface normal */ - const Vertex getNormal() const { - return cross(getLimitTangent(),getSecondLimitTangent()); - } - - /* returns center of the n-th quad in the 1-ring */ - __forceinline Vertex getQuadCenter(const size_t index) const - { - const Vertex_t &p0 = vtx; - const Vertex_t &p1 = ring[2*index+0]; - const Vertex_t &p2 = ring[2*index+1]; - const Vertex_t &p3 = index == face_valence-1 ? ring[0] : ring[2*index+2]; - const Vertex p = (p0+p1+p2+p3) * 0.25f; - return p; - } - - /* returns center of the n-th edge in the 1-ring */ - __forceinline Vertex getEdgeCenter(const size_t index) const { - return (vtx + ring[index*2]) * 0.5f; - } - - bool hasValidPositions() const - { - for (size_t i=0; i " << c.ring[i]; - if (i % 2 == 0) o << " crease = " << c.crease_weight[i/2]; - o << embree_endl; - } - return o; - } - }; - - typedef CatmullClark1RingT CatmullClark1Ring3fa; - - template - struct __aligned(64) GeneralCatmullClark1RingT - { - ALIGNED_STRUCT_(64); - - typedef CatmullClark1RingT CatmullClark1Ring; - - struct Face - { - __forceinline Face() {} - __forceinline Face (int size, float crease_weight) - : size(size), crease_weight(crease_weight) {} - - // FIXME: add member that returns total number of vertices - - int size; // number of vertices-2 of nth face in ring - float crease_weight; - }; - - Vertex vtx; - DynamicStackArray ring; - DynamicStackArray faces; - unsigned int face_valence; - unsigned int edge_valence; - int border_face; - float vertex_crease_weight; - float vertex_level; //!< maximum level of adjacent edges - float edge_level; // level of first edge - bool only_quads; // true if all faces are quads - unsigned int eval_start_face_index; - unsigned int eval_start_vertex_index; - unsigned int eval_unique_identifier; - - public: - GeneralCatmullClark1RingT() - : eval_start_face_index(0), eval_start_vertex_index(0), eval_unique_identifier(0) {} - - __forceinline bool isRegular() const - { - if (border_face == -1 && face_valence == 4) return true; - return false; - } - - __forceinline bool has_last_face() const { - return border_face != (int)face_valence-1; - } - - __forceinline bool has_second_face() const { - return (border_face == -1) || (border_face >= 2); - } - - bool hasValidPositions() const - { - for (size_t i=0; igetStartVertexIndex()*stride); - vertex_crease_weight = h->vertex_crease_weight; - HalfEdge* p = (HalfEdge*) h; - - unsigned int e=0, f=0; - unsigned min_vertex_index = (unsigned)-1; - unsigned min_vertex_index_face = (unsigned)-1; - unsigned min_vertex_index_vertex = (unsigned)-1; - edge_level = p->edge_level; - vertex_level = 0.0f; - do - { - HalfEdge* p_prev = p->prev(); - HalfEdge* p_next = p->next(); - const float crease_weight = p->edge_crease_weight; - assert(p->hasOpposite() || p->edge_crease_weight == float(inf)); - vertex_level = max(vertex_level,p->edge_level); - - /* find minimum start vertex */ - unsigned vertex_index = p_next->getStartVertexIndex(); - if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; } - - /* store first N-2 vertices of face */ - unsigned int vn = 0; - for (p = p_next; p!=p_prev; p=p->next()) { - ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride); - vn++; - } - faces[f++] = Face(vn,crease_weight); - only_quads &= (vn == 2); - - /* continue with next face */ - if (likely(p->hasOpposite())) - p = p->opposite(); - - /* if there is no opposite go the long way to the other side of the border */ - else - { - /* find minimum start vertex */ - unsigned vertex_index = p->getStartVertexIndex(); - if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; } - - /*! mark first border edge and store dummy vertex for face between the two border edges */ - border_face = f; - faces[f++] = Face(2,inf); - ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride); - ring[e++] = vtx; // dummy vertex - - /*! goto other side of border */ - p = (HalfEdge*) h; - while (p->hasOpposite()) - p = p->opposite()->next(); - } - - } while (p != h); - - edge_valence = e; - face_valence = f; - eval_unique_identifier = min_vertex_index; - eval_start_face_index = min_vertex_index_face; - eval_start_vertex_index = min_vertex_index_vertex; - - assert( hasValidPositions() ); - } - - __forceinline void subdivide(CatmullClark1Ring& dest) const - { - dest.edge_level = 0.5f*edge_level; - dest.vertex_level = 0.5f*vertex_level; - dest.face_valence = face_valence; - dest.edge_valence = 2*face_valence; - dest.border_index = border_face == -1 ? -1 : 2*border_face; // FIXME: - dest.vertex_crease_weight = max(0.0f,vertex_crease_weight-1.0f); - dest.eval_start_index = eval_start_face_index; - dest.eval_unique_identifier = eval_unique_identifier; - assert(dest.face_valence <= MAX_RING_FACE_VALENCE); - - /* calculate face points */ - Vertex_t S = Vertex_t(0.0f); - for (size_t face=0, v=eval_start_vertex_index; face crease_id; - Vertex_t C = Vertex_t(0.0f); - for (size_t face=0, j=eval_start_vertex_index; face 0.0f)) - { - if (vertex_crease_weight >= 1.0f) { - dest.vtx = vtx; - } else { - dest.vtx = lerp(vtx,v_smooth,vertex_crease_weight); - } - return; - } - - if (likely(num_creases <= 1)) - return; - - /* compute new vertex using crease rule */ - if (likely(num_creases == 2)) { - const Vertex_t v_sharp = (Vertex_t)(C + 6.0f * vtx) * (1.0f / 8.0f); - const float crease_weight0 = faces[crease_id[0]].crease_weight; - const float crease_weight1 = faces[crease_id[1]].crease_weight; - dest.vtx = v_sharp; - dest.crease_weight[crease_id[0]] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f); - dest.crease_weight[crease_id[1]] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f); - const float v_blend = 0.5f*(crease_weight0+crease_weight1); - if (unlikely(v_blend < 1.0f)) { - dest.vtx = lerp(v_sharp,v_smooth,v_blend); - } - } - - /* compute new vertex using corner rule */ - else { - dest.vtx = vtx; - } - } - - void convert(CatmullClark1Ring& dst) const - { - dst.edge_level = edge_level; - dst.vertex_level = vertex_level; - dst.vtx = vtx; - dst.face_valence = face_valence; - dst.edge_valence = 2*face_valence; - dst.border_index = border_face == -1 ? -1 : 2*border_face; - for (size_t i=0; i ring[0] */ - __forceinline Vertex getLimitTangent() const - { - CatmullClark1Ring cc_vtx; - - /* fast path for quad only rings */ - if (only_quads) - { - convert(cc_vtx); - return cc_vtx.getLimitTangent(); - } - - subdivide(cc_vtx); - return 2.0f * cc_vtx.getLimitTangent(); - } - - /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ - __forceinline Vertex getSecondLimitTangent() const - { - CatmullClark1Ring cc_vtx; - - /* fast path for quad only rings */ - if (only_quads) - { - convert(cc_vtx); - return cc_vtx.getSecondLimitTangent(); - } - - subdivide(cc_vtx); - return 2.0f * cc_vtx.getSecondLimitTangent(); - } - - - /* gets limit vertex */ - __forceinline Vertex getLimitVertex() const - { - CatmullClark1Ring cc_vtx; - - /* fast path for quad only rings */ - if (only_quads) - convert(cc_vtx); - else - subdivide(cc_vtx); - return cc_vtx.getLimitVertex(); - } - - friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClark1RingT &c) - { - o << "vtx " << c.vtx << " size = " << c.edge_valence << ", border_face = " << c.border_face << ", " << " face_valence = " << c.face_valence << - ", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", ring: " << embree_endl; - for (size_t v=0, f=0; f " << c.ring[i]; - if (i == v) o << " crease = " << c.faces[f].crease_weight; - o << embree_endl; - } - } - return o; - } - }; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h deleted file mode 100644 index b244af481c..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/default.h" -#include "../common/scene_curves.h" - -/* - - Implements Catmul Rom curves with control points p0, p1, p2, p3. At - t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1 - the curve goes through p2 with tangent (p3-p2)/2. - - */ - -namespace embree -{ - class CatmullRomBasis - { - public: - - template - static __forceinline Vec4 eval(const T& u) - { - const T t = u; - const T s = T(1.0f) - u; - const T n0 = - t * s * s; - const T n1 = 2.0f + t * t * (3.0f * t - 5.0f); - const T n2 = 2.0f + s * s * (3.0f * s - 5.0f); - const T n3 = - s * t * t; - return T(0.5f) * Vec4(n0, n1, n2, n3); - } - - template - static __forceinline Vec4 derivative(const T& u) - { - const T t = u; - const T s = 1.0f - u; - const T n0 = - s * s + 2.0f * s * t; - const T n1 = 2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t; - const T n2 = 2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s; - const T n3 = -2.0f * s * t + t * t; - return T(0.5f) * Vec4(n0, n1, n2, n3); - } - - template - static __forceinline Vec4 derivative2(const T& u) - { - const T t = u; - const T n0 = -3.0f * t + 2.0f; - const T n1 = 9.0f * t - 5.0f; - const T n2 = -9.0f * t + 4.0f; - const T n3 = 3.0f * t - 1.0f; - return Vec4(n0, n1, n2, n3); - } - }; - - struct PrecomputedCatmullRomBasis - { - enum { N = 16 }; - public: - PrecomputedCatmullRomBasis() {} - PrecomputedCatmullRomBasis(int shift); - - /* basis for bspline evaluation */ - public: - float c0[N+1][N+1]; - float c1[N+1][N+1]; - float c2[N+1][N+1]; - float c3[N+1][N+1]; - - /* basis for bspline derivative evaluation */ - public: - float d0[N+1][N+1]; - float d1[N+1][N+1]; - float d2[N+1][N+1]; - float d3[N+1][N+1]; - }; - extern PrecomputedCatmullRomBasis catmullrom_basis0; - extern PrecomputedCatmullRomBasis catmullrom_basis1; - - template - struct CatmullRomCurveT - { - Vertex v0,v1,v2,v3; - - __forceinline CatmullRomCurveT() {} - - __forceinline CatmullRomCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) - : v0(v0), v1(v1), v2(v2), v3(v3) {} - - __forceinline Vertex begin() const { - return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); - } - - __forceinline Vertex end() const { - return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); - } - - __forceinline Vertex center() const { - return 0.25f*(v0+v1+v2+v3); - } - - __forceinline BBox bounds() const { - return merge(BBox(v0),BBox(v1),BBox(v2),BBox(v3)); - } - - __forceinline friend CatmullRomCurveT operator -( const CatmullRomCurveT& a, const Vertex& b ) { - return CatmullRomCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b); - } - - __forceinline CatmullRomCurveT xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const - { - const Vec3ff q0(xfmVector(space,v0-p), v0.w); - const Vec3ff q1(xfmVector(space,v1-p), v1.w); - const Vec3ff q2(xfmVector(space,v2-p), v2.w); - const Vec3ff q3(xfmVector(space,v3-p), v3.w); - return CatmullRomCurveT(q0,q1,q2,q3); - } - - __forceinline Vertex eval(const float t) const - { - const Vec4 b = CatmullRomBasis::eval(t); - return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); - } - - __forceinline Vertex eval_du(const float t) const - { - const Vec4 b = CatmullRomBasis::derivative(t); - return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); - } - - __forceinline Vertex eval_dudu(const float t) const - { - const Vec4 b = CatmullRomBasis::derivative2(t); - return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); - } - - __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const - { - p = eval(t); - dp = eval_du(t); - ddp = eval_dudu(t); - } - - template - __forceinline Vec4vf veval(const vfloat& t) const - { - const Vec4vf b = CatmullRomBasis::eval(t); - return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf veval_du(const vfloat& t) const - { - const Vec4vf b = CatmullRomBasis::derivative(t); - return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf veval_dudu(const vfloat& t) const - { - const Vec4vf b = CatmullRomBasis::derivative2(t); - return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); - } - - template - __forceinline void veval(const vfloat& t, Vec4vf& p, Vec4vf& dp) const - { - p = veval(t); - dp = veval_du(t); - } - - template - __forceinline Vec4vf eval0(const int ofs, const int size) const - { - assert(size <= PrecomputedCatmullRomBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&catmullrom_basis0.c0[size][ofs]), Vec4vf(v0), - madd(vfloat::loadu(&catmullrom_basis0.c1[size][ofs]), Vec4vf(v1), - madd(vfloat::loadu(&catmullrom_basis0.c2[size][ofs]), Vec4vf(v2), - vfloat::loadu(&catmullrom_basis0.c3[size][ofs]) * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf eval1(const int ofs, const int size) const - { - assert(size <= PrecomputedCatmullRomBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&catmullrom_basis1.c0[size][ofs]), Vec4vf(v0), - madd(vfloat::loadu(&catmullrom_basis1.c1[size][ofs]), Vec4vf(v1), - madd(vfloat::loadu(&catmullrom_basis1.c2[size][ofs]), Vec4vf(v2), - vfloat::loadu(&catmullrom_basis1.c3[size][ofs]) * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf derivative0(const int ofs, const int size) const - { - assert(size <= PrecomputedCatmullRomBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&catmullrom_basis0.d0[size][ofs]), Vec4vf(v0), - madd(vfloat::loadu(&catmullrom_basis0.d1[size][ofs]), Vec4vf(v1), - madd(vfloat::loadu(&catmullrom_basis0.d2[size][ofs]), Vec4vf(v2), - vfloat::loadu(&catmullrom_basis0.d3[size][ofs]) * Vec4vf(v3)))); - } - - template - __forceinline Vec4vf derivative1(const int ofs, const int size) const - { - assert(size <= PrecomputedCatmullRomBasis::N); - assert(ofs <= size); - return madd(vfloat::loadu(&catmullrom_basis1.d0[size][ofs]), Vec4vf(v0), - madd(vfloat::loadu(&catmullrom_basis1.d1[size][ofs]), Vec4vf(v1), - madd(vfloat::loadu(&catmullrom_basis1.d2[size][ofs]), Vec4vf(v2), - vfloat::loadu(&catmullrom_basis1.d3[size][ofs]) * Vec4vf(v3)))); - } - - /* calculates bounds of catmull-rom curve geometry */ - __forceinline BBox3fa accurateRoundBounds() const - { - const int N = 7; - const float scale = 1.0f/(3.0f*(N-1)); - Vec4vfx pl(pos_inf), pu(neg_inf); - for (int i=0; i<=N; i+=VSIZEX) - { - vintx vi = vintx(i)+vintx(step); - vboolx valid = vi <= vintx(N); - const Vec4vfx p = eval0(i,N); - const Vec4vfx dp = derivative0(i,N); - const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); - const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); - pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min - pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min - } - const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); - const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); - const float r_min = reduce_min(pl.w); - const float r_max = reduce_max(pu.w); - const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); - return enlarge(BBox3fa(lower,upper),upper_r); - } - - /* calculates bounds when tessellated into N line segments */ - __forceinline BBox3fa accurateFlatBounds(int N) const - { - if (likely(N == 4)) - { - const Vec4vf4 pi = eval0<4>(0,4); - const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); - const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); - const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); - const Vec3ff pe = end(); - return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w)))); - } - else - { - Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); - for (int i=0; i<=N; i+=VSIZEX) - { - vboolx valid = vintx(i)+vintx(step) <= vintx(N); - const Vec4vfx pi = eval0(i,N); - - pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min - pl.y = select(valid,min(pl.y,pi.y),pl.y); - pl.z = select(valid,min(pl.z,pi.z),pl.z); - - pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min - pu.y = select(valid,max(pu.y,pi.y),pu.y); - pu.z = select(valid,max(pu.z,pi.z),pu.z); - - ru = select(valid,max(ru,abs(pi.w)),ru); - } - const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); - const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); - const Vec3fa upper_r(reduce_max(ru)); - return enlarge(BBox3fa(lower,upper),upper_r); - } - } - - friend __forceinline embree_ostream operator<<(embree_ostream cout, const CatmullRomCurveT& curve) { - return cout << "CatmullRomCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; - } - }; - - __forceinline CatmullRomCurveT enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CatmullRomCurveT& curve) - { - return CatmullRomCurveT(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), - enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), - enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), - enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); - } - - typedef CatmullRomCurveT CatmullRomCurve3fa; -} - diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h deleted file mode 100644 index 23f24c360c..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h +++ /dev/null @@ -1,226 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "patch.h" - -namespace embree -{ - namespace isa - { - template - struct FeatureAdaptiveEval - { - public: - - typedef PatchT Patch; - typedef typename Patch::Ref Ref; - typedef GeneralCatmullClarkPatchT GeneralCatmullClarkPatch; - typedef CatmullClark1RingT CatmullClarkRing; - typedef CatmullClarkPatchT CatmullClarkPatch; - typedef BSplinePatchT BSplinePatch; - typedef BezierPatchT BezierPatch; - typedef GregoryPatchT GregoryPatch; - typedef BilinearPatchT BilinearPatch; - typedef BezierCurveT BezierCurve; - - public: - - FeatureAdaptiveEval (const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, - Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) - : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) - { - switch (edge->patch_type) { - case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; - case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; -#if PATCH_USE_GREGORY == 2 - case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; -#endif - default: { - GeneralCatmullClarkPatch patch(edge,vertices,stride); - eval(patch,Vec2f(u,v),0); - break; - } - } - } - - FeatureAdaptiveEval (CatmullClarkPatch& patch, const float u, const float v, float dscale, size_t depth, - Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) - : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) - { - eval(patch,Vec2f(u,v),dscale,depth); - } - - void eval_general_quad(const GeneralCatmullClarkPatch& patch, array_t& patches, const Vec2f& uv, size_t depth) - { - float u = uv.x, v = uv.y; - if (v < 0.5f) { - if (u < 0.5f) { -#if PATCH_USE_GREGORY == 2 - BezierCurve borders[2]; patch.getLimitBorder(borders,0); - BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); - BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); - eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); -#else - eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1); -#endif - if (dPdu && dPdv) { - const Vertex dpdx = *dPdu, dpdy = *dPdv; - *dPdu = dpdx; *dPdv = dpdy; - } - } - else { -#if PATCH_USE_GREGORY == 2 - BezierCurve borders[2]; patch.getLimitBorder(borders,1); - BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); - BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); - eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); -#else - eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1); -#endif - if (dPdu && dPdv) { - const Vertex dpdx = *dPdu, dpdy = *dPdv; - *dPdu = -dpdy; *dPdv = dpdx; - } - } - } else { - if (u > 0.5f) { -#if PATCH_USE_GREGORY == 2 - BezierCurve borders[2]; patch.getLimitBorder(borders,2); - BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); - BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); - eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); -#else - eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1); -#endif - if (dPdu && dPdv) { - const Vertex dpdx = *dPdu, dpdy = *dPdv; - *dPdu = -dpdx; *dPdv = -dpdy; - } - } - else { -#if PATCH_USE_GREGORY == 2 - BezierCurve borders[2]; patch.getLimitBorder(borders,3); - BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); - BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); - eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); -#else - eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1); -#endif - if (dPdu && dPdv) { - const Vertex dpdx = *dPdu, dpdy = *dPdv; - *dPdu = dpdy; *dPdv = -dpdx; - } - } - } - } - - __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) - { - const int max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; -//#if PATCH_MIN_RESOLUTION -// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=(size_t)max_eval_depth; -//#else - return depth>=(size_t)max_eval_depth; -//#endif - } - - void eval(CatmullClarkPatch& patch, Vec2f uv, float dscale, size_t depth, - BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr) - { - while (true) - { - typename CatmullClarkPatch::Type ty = patch.type(); - - if (unlikely(final(patch,ty,depth))) - { - if (ty & CatmullClarkRing::TYPE_REGULAR) { - RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); - PATCH_DEBUG_SUBDIVISION(234423,c,c,-1); - return; - } else { - IrregularFillPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); - PATCH_DEBUG_SUBDIVISION(34534,c,-1,c); - return; - } - } - else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { - assert(depth > 0); - RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); - PATCH_DEBUG_SUBDIVISION(43524,c,c,-1); - return; - } -#if PATCH_USE_GREGORY == 2 - else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { - assert(depth > 0); - GregoryPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); - PATCH_DEBUG_SUBDIVISION(23498,c,-1,c); - return; - } -#endif - else - { - array_t patches; - patch.subdivide(patches); // FIXME: only have to generate one of the patches - - const float u = uv.x, v = uv.y; - if (v < 0.5f) { - if (u < 0.5f) { patch = patches[0]; uv = Vec2f(2.0f*u,2.0f*v); dscale *= 2.0f; } - else { patch = patches[1]; uv = Vec2f(2.0f*u-1.0f,2.0f*v); dscale *= 2.0f; } - } else { - if (u > 0.5f) { patch = patches[2]; uv = Vec2f(2.0f*u-1.0f,2.0f*v-1.0f); dscale *= 2.0f; } - else { patch = patches[3]; uv = Vec2f(2.0f*u,2.0f*v-1.0f); dscale *= 2.0f; } - } - depth++; - } - } - } - - void eval(const GeneralCatmullClarkPatch& patch, const Vec2f& uv, const size_t depth) - { - /* convert into standard quad patch if possible */ - if (likely(patch.isQuadPatch())) - { - CatmullClarkPatch qpatch; patch.init(qpatch); - return eval(qpatch,uv,1.0f,depth); - } - - /* subdivide patch */ - unsigned N; - array_t patches; - patch.subdivide(patches,N); // FIXME: only have to generate one of the patches - - /* parametrization for quads */ - if (N == 4) - eval_general_quad(patch,patches,uv,depth); - - /* parametrization for arbitrary polygons */ - else - { - const unsigned l = (unsigned) floor(0.5f*uv.x); const float u = 2.0f*frac(0.5f*uv.x)-0.5f; - const unsigned h = (unsigned) floor(0.5f*uv.y); const float v = 2.0f*frac(0.5f*uv.y)-0.5f; - const unsigned i = 4*h+l; assert(i= N) return; - -#if PATCH_USE_GREGORY == 2 - BezierCurve borders[2]; patch.getLimitBorder(borders,i); - BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); - BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); - eval(patches[i],Vec2f(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r); -#else - eval(patches[i],Vec2f(u,v),1.0f,depth+1); -#endif - } - } - - private: - Vertex* const P; - Vertex* const dPdu; - Vertex* const dPdv; - Vertex* const ddPdudu; - Vertex* const ddPdvdv; - Vertex* const ddPdudv; - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h deleted file mode 100644 index 76583b2e5d..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h +++ /dev/null @@ -1,359 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "patch.h" -#include "catmullclark_patch.h" -#include "bspline_patch.h" -#include "gregory_patch.h" -#include "tessellation.h" - -namespace embree -{ - namespace isa - { - struct FeatureAdaptiveEvalGrid - { - typedef CatmullClark1Ring3fa CatmullClarkRing; - typedef CatmullClarkPatch3fa CatmullClarkPatch; - typedef BilinearPatch3fa BilinearPatch; - typedef BSplinePatch3fa BSplinePatch; - typedef BezierPatch3fa BezierPatch; - typedef GregoryPatch3fa GregoryPatch; - - private: - const unsigned x0,x1; - const unsigned y0,y1; - const unsigned swidth,sheight; - const float rcp_swidth, rcp_sheight; - float* const Px; - float* const Py; - float* const Pz; - float* const U; - float* const V; - float* const Nx; - float* const Ny; - float* const Nz; - const unsigned dwidth; - //const unsigned dheight; - unsigned count; - - - public: - FeatureAdaptiveEvalGrid (const GeneralCatmullClarkPatch3fa& patch, unsigned subPatch, - const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, - float* Px, float* Py, float* Pz, float* U, float* V, - float* Nx, float* Ny, float* Nz, - const unsigned dwidth, const unsigned dheight) - : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), - Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0) - { - assert(swidth < (2<<20) && sheight < (2<<20)); - const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); - const BBox2f erange(Vec2f((float)x0,(float)y0),Vec2f((float)x1,(float)y1)); - - /* convert into standard quad patch if possible */ - if (likely(patch.isQuadPatch())) - { - CatmullClarkPatch3fa qpatch; patch.init(qpatch); - eval(qpatch, srange, erange, 0); - assert(count == (x1-x0+1)*(y1-y0+1)); - return; - } - - /* subdivide patch */ - unsigned N; - array_t patches; - patch.subdivide(patches,N); - - if (N == 4) - { - const Vec2f c = srange.center(); - const BBox2f srange0(srange.lower,c); - const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); - const BBox2f srange2(c,srange.upper); - const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); - -#if PATCH_USE_GREGORY == 2 - BezierCurve3fa borders[GeneralCatmullClarkPatch3fa::SIZE]; patch.getLimitBorder(borders); - BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r); - BezierCurve3fa border1l,border1r; borders[1].subdivide(border1l,border1r); - BezierCurve3fa border2l,border2r; borders[2].subdivide(border2l,border2r); - BezierCurve3fa border3l,border3r; borders[3].subdivide(border3l,border3r); - GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches); - eval(patches[0],srange0,intersect(srange0,erange),1,&border0l,nullptr,nullptr,&border3r); - eval(patches[1],srange1,intersect(srange1,erange),1,&border0r,&border1l,nullptr,nullptr); - eval(patches[2],srange2,intersect(srange2,erange),1,nullptr,&border1r,&border2l,nullptr); - eval(patches[3],srange3,intersect(srange3,erange),1,nullptr,nullptr,&border2r,&border3l); -#else - GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches); - eval(patches[0],srange0,intersect(srange0,erange),1); - eval(patches[1],srange1,intersect(srange1,erange),1); - eval(patches[2],srange2,intersect(srange2,erange),1); - eval(patches[3],srange3,intersect(srange3,erange),1); -#endif - } - else - { - assert(subPatch < N); - -#if PATCH_USE_GREGORY == 2 - BezierCurve3fa borders[2]; patch.getLimitBorder(borders,subPatch); - BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r); - BezierCurve3fa border2l,border2r; borders[1].subdivide(border2l,border2r); - eval(patches[subPatch], srange, erange, 1, &border0l, nullptr, nullptr, &border2r); -#else - eval(patches[subPatch], srange, erange, 1); -#endif - - } - assert(count == (x1-x0+1)*(y1-y0+1)); - } - - FeatureAdaptiveEvalGrid (const CatmullClarkPatch3fa& patch, - const BBox2f& srange, const BBox2f& erange, const unsigned depth, - const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, - float* Px, float* Py, float* Pz, float* U, float* V, - float* Nx, float* Ny, float* Nz, - const unsigned dwidth, const unsigned dheight) - : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), - Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0) - { - eval(patch,srange,erange,depth); - } - - template - void evalLocalGrid(const Patch& patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) - { - const float scale_x = rcp(srange.upper.x-srange.lower.x); - const float scale_y = rcp(srange.upper.y-srange.lower.y); - count += (lx1-lx0)*(ly1-ly0); - -#if 0 - for (unsigned iy=ly0; iy=max_eval_depth; -//#else - return depth>=max_eval_depth; -//#endif - } - - void eval(const CatmullClarkPatch3fa& patch, const BBox2f& srange, const BBox2f& erange, const unsigned depth, - const BezierCurve3fa* border0 = nullptr, const BezierCurve3fa* border1 = nullptr, const BezierCurve3fa* border2 = nullptr, const BezierCurve3fa* border3 = nullptr) - { - if (erange.empty()) - return; - - int lx0 = (int) ceilf(erange.lower.x); - int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); - int ly0 = (int) ceilf(erange.lower.y); - int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); - if (lx0 >= lx1 || ly0 >= ly1) return; - - CatmullClarkPatch::Type ty = patch.type(); - - if (unlikely(final(patch,ty,depth))) - { - if (ty & CatmullClarkRing::TYPE_REGULAR) { - RegularPatch rpatch(patch,border0,border1,border2,border3); - evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1); - return; - } else { - IrregularFillPatch ipatch(patch,border0,border1,border2,border3); - evalLocalGrid(ipatch,srange,lx0,lx1,ly0,ly1); - return; - } - } - else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { - assert(depth > 0); - RegularPatch rpatch(patch,border0,border1,border2,border3); - evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1); - return; - } -#if PATCH_USE_GREGORY == 2 - else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { - assert(depth > 0); - GregoryPatch gpatch(patch,border0,border1,border2,border3); - evalLocalGrid(gpatch,srange,lx0,lx1,ly0,ly1); - } -#endif - else - { - array_t patches; - patch.subdivide(patches); - - const Vec2f c = srange.center(); - const BBox2f srange0(srange.lower,c); - const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); - const BBox2f srange2(c,srange.upper); - const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); - - eval(patches[0],srange0,intersect(srange0,erange),depth+1); - eval(patches[1],srange1,intersect(srange1,erange),depth+1); - eval(patches[2],srange2,intersect(srange2,erange),depth+1); - eval(patches[3],srange3,intersect(srange3,erange),depth+1); - } - } - }; - - template - bool stitch_col(const Patch& patch, int subPatch, - const bool right, const unsigned y0, const unsigned y1, const int fine_y, const int coarse_y, - float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dx0, const unsigned dwidth, const unsigned dheight) - { - assert(coarse_y <= fine_y); - if (likely(fine_y == coarse_y)) - return false; - - const unsigned y0s = stitch(y0,fine_y,coarse_y); - const unsigned y1s = stitch(y1,fine_y,coarse_y); - const unsigned M = y1s-y0s+1 + VSIZEX; - - dynamic_large_stack_array(float,px,M,64*sizeof(float)); - dynamic_large_stack_array(float,py,M,64*sizeof(float)); - dynamic_large_stack_array(float,pz,M,64*sizeof(float)); - dynamic_large_stack_array(float,u,M,64*sizeof(float)); - dynamic_large_stack_array(float,v,M,64*sizeof(float)); - dynamic_large_stack_array(float,nx,M,64*sizeof(float)); - dynamic_large_stack_array(float,ny,M,64*sizeof(float)); - dynamic_large_stack_array(float,nz,M,64*sizeof(float)); - const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); - Eval(patch,subPatch, right,right, y0s,y1s, 2,coarse_y+1, px,py,pz,u,v, - has_Nxyz ? (float*)nx : nullptr,has_Nxyz ? (float*)ny : nullptr ,has_Nxyz ? (float*)nz : nullptr, 1,4097); - - for (unsigned y=y0; y<=y1; y++) - { - const unsigned ys = stitch(y,fine_y,coarse_y)-y0s; - Px[(y-y0)*dwidth+dx0] = px[ys]; - Py[(y-y0)*dwidth+dx0] = py[ys]; - Pz[(y-y0)*dwidth+dx0] = pz[ys]; - U [(y-y0)*dwidth+dx0] = u[ys]; - V [(y-y0)*dwidth+dx0] = v[ys]; - if (unlikely(has_Nxyz)) { - Nx[(y-y0)*dwidth+dx0] = nx[ys]; - Ny[(y-y0)*dwidth+dx0] = ny[ys]; - Nz[(y-y0)*dwidth+dx0] = nz[ys]; - } - } - return true; - } - - template - bool stitch_row(const Patch& patch, int subPatch, - const bool bottom, const unsigned x0, const unsigned x1, const int fine_x, const int coarse_x, - float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dy0, const unsigned dwidth, const unsigned dheight) - { - assert(coarse_x <= fine_x); - if (likely(fine_x == coarse_x)) - return false; - - const unsigned x0s = stitch(x0,fine_x,coarse_x); - const unsigned x1s = stitch(x1,fine_x,coarse_x); - const unsigned M = x1s-x0s+1 + VSIZEX; - - dynamic_large_stack_array(float,px,M,32*sizeof(float)); - dynamic_large_stack_array(float,py,M,32*sizeof(float)); - dynamic_large_stack_array(float,pz,M,32*sizeof(float)); - dynamic_large_stack_array(float,u,M,32*sizeof(float)); - dynamic_large_stack_array(float,v,M,32*sizeof(float)); - dynamic_large_stack_array(float,nx,M,32*sizeof(float)); - dynamic_large_stack_array(float,ny,M,32*sizeof(float)); - dynamic_large_stack_array(float,nz,M,32*sizeof(float)); - const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); - Eval(patch,subPatch, x0s,x1s, bottom,bottom, coarse_x+1,2, px,py,pz,u,v, - has_Nxyz ? (float*)nx :nullptr, has_Nxyz ? (float*)ny : nullptr , has_Nxyz ? (float*)nz : nullptr, 4097,1); - - for (unsigned x=x0; x<=x1; x++) - { - const unsigned xs = stitch(x,fine_x,coarse_x)-x0s; - Px[dy0*dwidth+x-x0] = px[xs]; - Py[dy0*dwidth+x-x0] = py[xs]; - Pz[dy0*dwidth+x-x0] = pz[xs]; - U [dy0*dwidth+x-x0] = u[xs]; - V [dy0*dwidth+x-x0] = v[xs]; - if (unlikely(has_Nxyz)) { - Nx[dy0*dwidth+x-x0] = nx[xs]; - Ny[dy0*dwidth+x-x0] = ny[xs]; - Nz[dy0*dwidth+x-x0] = nz[xs]; - } - } - return true; - } - - template - void feature_adaptive_eval_grid (const Patch& patch, unsigned subPatch, const float levels[4], - const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, - float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dwidth, const unsigned dheight) - { - bool sl = false, sr = false, st = false, sb = false; - if (levels) { - sl = x0 == 0 && stitch_col(patch,subPatch,0,y0,y1,sheight-1,int(levels[3]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0 ,dwidth,dheight); - sr = x1 == swidth-1 && stitch_col(patch,subPatch,1,y0,y1,sheight-1,int(levels[1]), Px,Py,Pz,U,V,Nx,Ny,Nz, x1-x0,dwidth,dheight); - st = y0 == 0 && stitch_row(patch,subPatch,0,x0,x1,swidth-1,int(levels[0]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0 ,dwidth,dheight); - sb = y1 == sheight-1 && stitch_row(patch,subPatch,1,x0,x1,swidth-1,int(levels[2]), Px,Py,Pz,U,V,Nx,Ny,Nz, y1-y0,dwidth,dheight); - } - const unsigned ofs = st*dwidth+sl; - Eval(patch,subPatch,x0+sl,x1-sr,y0+st,y1-sb, swidth,sheight, Px+ofs,Py+ofs,Pz+ofs,U+ofs,V+ofs,Nx?Nx+ofs:nullptr,Ny?Ny+ofs:nullptr,Nz?Nz+ofs:nullptr, dwidth,dheight); - } - } -} - diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h deleted file mode 100644 index fa3216730f..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "patch.h" - -namespace embree -{ - namespace isa - { - template - struct FeatureAdaptiveEvalSimd - { - public: - - typedef PatchT Patch; - typedef typename Patch::Ref Ref; - typedef GeneralCatmullClarkPatchT GeneralCatmullClarkPatch; - typedef CatmullClark1RingT CatmullClarkRing; - typedef CatmullClarkPatchT CatmullClarkPatch; - typedef BSplinePatchT BSplinePatch; - typedef BezierPatchT BezierPatch; - typedef GregoryPatchT GregoryPatch; - typedef BilinearPatchT BilinearPatch; - typedef BezierCurveT BezierCurve; - - FeatureAdaptiveEvalSimd (const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid, const vfloat& u, const vfloat& v, - float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) - : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) - { - switch (edge->patch_type) { - case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; - case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; -#if PATCH_USE_GREGORY == 2 - case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatchT(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; -#endif - default: { - GeneralCatmullClarkPatch patch(edge,vertices,stride); - eval_direct(valid,patch,Vec2(u,v),0); - break; - } - } - } - - FeatureAdaptiveEvalSimd (const CatmullClarkPatch& patch, const vbool& valid, const vfloat& u, const vfloat& v, float dscale, size_t depth, - float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) - : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) - { - eval_direct(valid,patch,Vec2(u,v),dscale,depth); - } - - template - __forceinline void eval_quad_direct(const vbool& valid, array_t& patches, const Vec2& uv, float dscale, size_t depth) - { - const vfloat u = uv.x, v = uv.y; - const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; - const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; - const vbool u0v0_mask = valid & u0_mask & v0_mask; - const vbool u0v1_mask = valid & u0_mask & v1_mask; - const vbool u1v0_mask = valid & u1_mask & v0_mask; - const vbool u1v1_mask = valid & u1_mask & v1_mask; - if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2(2.0f*u,2.0f*v),2.0f*dscale,depth+1); - if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1); - if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1); - if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1); - } - - template - __forceinline void eval_general_quad_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, array_t& patches, const Vec2& uv, float dscale, size_t depth) - { -#if PATCH_USE_GREGORY == 2 - BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders); - BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); - BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r); - BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r); - BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r); -#endif - GeneralCatmullClarkPatch::fix_quad_ring_order(patches); - const vfloat u = uv.x, v = uv.y; - const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; - const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; - const vbool u0v0_mask = valid & u0_mask & v0_mask; - const vbool u0v1_mask = valid & u0_mask & v1_mask; - const vbool u1v0_mask = valid & u1_mask & v0_mask; - const vbool u1v1_mask = valid & u1_mask & v1_mask; -#if PATCH_USE_GREGORY == 2 - if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2(2.0f*u,2.0f*v),2.0f*dscale,depth+1,&border0l,nullptr,nullptr,&border3r); - if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1,&border0r,&border1l,nullptr,nullptr); - if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,&border1r,&border2l,nullptr); - if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,nullptr,&border2r,&border3l); -#else - if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2(2.0f*u,2.0f*v),2.0f*dscale,depth+1); - if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1); - if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1); - if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1); -#endif - } - - __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) - { - const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; -//#if PATCH_MIN_RESOLUTION -// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth; -//#else - return depth>=max_eval_depth; -//#endif - } - - void eval_direct(const vbool& valid, const CatmullClarkPatch& patch, const Vec2& uv, float dscale, size_t depth, - BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr) - { - typename CatmullClarkPatch::Type ty = patch.type(); - - if (unlikely(final(patch,ty,depth))) - { - if (ty & CatmullClarkRing::TYPE_REGULAR) { - RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); - } else { - IrregularFillPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); - } - } - else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { - assert(depth > 0); RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); - } -#if PATCH_USE_GREGORY == 2 - else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { - assert(depth > 0); GregoryPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); - } -#endif - else - { - array_t patches; - patch.subdivide(patches); // FIXME: only have to generate one of the patches - eval_quad_direct(valid,patches,uv,dscale,depth); - } - } - - void eval_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, const Vec2& uv, const size_t depth) - { - /* convert into standard quad patch if possible */ - if (likely(patch.isQuadPatch())) { - CatmullClarkPatch qpatch; patch.init(qpatch); - return eval_direct(valid,qpatch,uv,1.0f,depth); - } - - /* subdivide patch */ - unsigned Nc; - array_t patches; - patch.subdivide(patches,Nc); // FIXME: only have to generate one of the patches - - /* parametrization for quads */ - if (Nc == 4) - eval_general_quad_direct(valid,patch,patches,uv,1.0f,depth); - - /* parametrization for arbitrary polygons */ - else - { - const vint l = (vint)floor(0.5f*uv.x); const vfloat u = 2.0f*frac(0.5f*uv.x)-0.5f; - const vint h = (vint)floor(0.5f*uv.y); const vfloat v = 2.0f*frac(0.5f*uv.y)-0.5f; - const vint i = (h<<2)+l; assert(all(valid,i(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r); -#else - eval_direct(valid,patches[i],Vec2(u,v),1.0f,depth+1); -#endif - }); - } - } - - private: - float* const P; - float* const dPdu; - float* const dPdv; - float* const ddPdudu; - float* const ddPdvdv; - float* const ddPdudv; - const size_t dstride; - const size_t N; - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h deleted file mode 100644 index 2a7c4b1f2c..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h +++ /dev/null @@ -1,893 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "catmullclark_patch.h" -#include "bezier_patch.h" -#include "bezier_curve.h" -#include "catmullclark_coefficients.h" - -namespace embree -{ - template - class __aligned(64) GregoryPatchT - { - typedef CatmullClarkPatchT CatmullClarkPatch; - typedef GeneralCatmullClarkPatchT GeneralCatmullClarkPatch; - typedef CatmullClark1RingT CatmullClark1Ring; - typedef BezierCurveT BezierCurve; - - public: - Vertex v[4][4]; - Vertex f[2][2]; - - __forceinline GregoryPatchT() {} - - __forceinline GregoryPatchT(const CatmullClarkPatch& patch) { - init(patch); - } - - __forceinline GregoryPatchT(const CatmullClarkPatch& patch, - const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) - { - init_crackfix(patch,border0,border1,border2,border3); - } - - __forceinline GregoryPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { - init(CatmullClarkPatch(edge,vertices,stride)); - } - - __forceinline Vertex& p0() { return v[0][0]; } - __forceinline Vertex& p1() { return v[0][3]; } - __forceinline Vertex& p2() { return v[3][3]; } - __forceinline Vertex& p3() { return v[3][0]; } - - __forceinline Vertex& e0_p() { return v[0][1]; } - __forceinline Vertex& e0_m() { return v[1][0]; } - __forceinline Vertex& e1_p() { return v[1][3]; } - __forceinline Vertex& e1_m() { return v[0][2]; } - __forceinline Vertex& e2_p() { return v[3][2]; } - __forceinline Vertex& e2_m() { return v[2][3]; } - __forceinline Vertex& e3_p() { return v[2][0]; } - __forceinline Vertex& e3_m() { return v[3][1]; } - - __forceinline Vertex& f0_p() { return v[1][1]; } - __forceinline Vertex& f1_p() { return v[1][2]; } - __forceinline Vertex& f2_p() { return v[2][2]; } - __forceinline Vertex& f3_p() { return v[2][1]; } - __forceinline Vertex& f0_m() { return f[0][0]; } - __forceinline Vertex& f1_m() { return f[0][1]; } - __forceinline Vertex& f2_m() { return f[1][1]; } - __forceinline Vertex& f3_m() { return f[1][0]; } - - __forceinline const Vertex& p0() const { return v[0][0]; } - __forceinline const Vertex& p1() const { return v[0][3]; } - __forceinline const Vertex& p2() const { return v[3][3]; } - __forceinline const Vertex& p3() const { return v[3][0]; } - - __forceinline const Vertex& e0_p() const { return v[0][1]; } - __forceinline const Vertex& e0_m() const { return v[1][0]; } - __forceinline const Vertex& e1_p() const { return v[1][3]; } - __forceinline const Vertex& e1_m() const { return v[0][2]; } - __forceinline const Vertex& e2_p() const { return v[3][2]; } - __forceinline const Vertex& e2_m() const { return v[2][3]; } - __forceinline const Vertex& e3_p() const { return v[2][0]; } - __forceinline const Vertex& e3_m() const { return v[3][1]; } - - __forceinline const Vertex& f0_p() const { return v[1][1]; } - __forceinline const Vertex& f1_p() const { return v[1][2]; } - __forceinline const Vertex& f2_p() const { return v[2][2]; } - __forceinline const Vertex& f3_p() const { return v[2][1]; } - __forceinline const Vertex& f0_m() const { return f[0][0]; } - __forceinline const Vertex& f1_m() const { return f[0][1]; } - __forceinline const Vertex& f2_m() const { return f[1][1]; } - __forceinline const Vertex& f3_m() const { return f[1][0]; } - - __forceinline Vertex initCornerVertex(const CatmullClarkPatch& irreg_patch, const size_t index) { - return irreg_patch.ring[index].getLimitVertex(); - } - - __forceinline Vertex initPositiveEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { - return madd(1.0f/3.0f,irreg_patch.ring[index].getLimitTangent(),p_vtx); - } - - __forceinline Vertex initNegativeEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { - return madd(1.0f/3.0f,irreg_patch.ring[index].getSecondLimitTangent(),p_vtx); - } - - __forceinline Vertex initPositiveEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) - { - CatmullClark1Ring3fa r0,r1,r2; - irreg_patch.ring[index].subdivide(r0); - r0.subdivide(r1); - r1.subdivide(r2); - return madd(8.0f/3.0f,r2.getLimitTangent(),p_vtx); - } - - __forceinline Vertex initNegativeEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) - { - CatmullClark1Ring3fa r0,r1,r2; - irreg_patch.ring[index].subdivide(r0); - r0.subdivide(r1); - r1.subdivide(r2); - return madd(8.0f/3.0f,r2.getSecondLimitTangent(),p_vtx); - } - - void initFaceVertex(const CatmullClarkPatch& irreg_patch, - const size_t index, - const Vertex& p_vtx, - const Vertex& e0_p_vtx, - const Vertex& e1_m_vtx, - const unsigned int face_valence_p1, - const Vertex& e0_m_vtx, - const Vertex& e3_p_vtx, - const unsigned int face_valence_p3, - Vertex& f_p_vtx, - Vertex& f_m_vtx) - { - const unsigned int face_valence = irreg_patch.ring[index].face_valence; - const unsigned int edge_valence = irreg_patch.ring[index].edge_valence; - const unsigned int border_index = irreg_patch.ring[index].border_index; - - const Vertex& vtx = irreg_patch.ring[index].vtx; - const Vertex e_i = irreg_patch.ring[index].getEdgeCenter(0); - const Vertex c_i_m_1 = irreg_patch.ring[index].getQuadCenter(0); - const Vertex e_i_m_1 = irreg_patch.ring[index].getEdgeCenter(1); - - Vertex c_i, e_i_p_1; - const bool hasHardEdge0 = - std::isinf(irreg_patch.ring[index].vertex_crease_weight) && - std::isinf(irreg_patch.ring[index].crease_weight[0]); - - if (unlikely((border_index == edge_valence-2) || hasHardEdge0)) - { - /* mirror quad center and edge mid-point */ - c_i = madd(2.0f, e_i - c_i_m_1, c_i_m_1); - e_i_p_1 = madd(2.0f, vtx - e_i_m_1, e_i_m_1); - } - else - { - c_i = irreg_patch.ring[index].getQuadCenter( face_valence-1 ); - e_i_p_1 = irreg_patch.ring[index].getEdgeCenter( face_valence-1 ); - } - - Vertex c_i_m_2, e_i_m_2; - const bool hasHardEdge1 = - std::isinf(irreg_patch.ring[index].vertex_crease_weight) && - std::isinf(irreg_patch.ring[index].crease_weight[1]); - - if (unlikely(border_index == 2 || hasHardEdge1)) - { - /* mirror quad center and edge mid-point */ - c_i_m_2 = madd(2.0f, e_i_m_1 - c_i_m_1, c_i_m_1); - e_i_m_2 = madd(2.0f, vtx - e_i, + e_i); - } - else - { - c_i_m_2 = irreg_patch.ring[index].getQuadCenter( 1 ); - e_i_m_2 = irreg_patch.ring[index].getEdgeCenter( 2 ); - } - - const float d = 3.0f; - //const float c = cosf(2.0f*M_PI/(float)face_valence); - //const float c_e_p = cosf(2.0f*M_PI/(float)face_valence_p1); - //const float c_e_m = cosf(2.0f*M_PI/(float)face_valence_p3); - - const float c = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); - const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1); - const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3); - - const Vertex r_e_p = 1.0f/3.0f * (e_i_m_1 - e_i_p_1) + 2.0f/3.0f * (c_i_m_1 - c_i); - const Vertex r_e_m = 1.0f/3.0f * (e_i - e_i_m_2) + 2.0f/3.0f * (c_i_m_1 - c_i_m_2); - - f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); - f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); - } - - __noinline void init(const CatmullClarkPatch& patch) - { - assert( patch.ring[0].hasValidPositions() ); - assert( patch.ring[1].hasValidPositions() ); - assert( patch.ring[2].hasValidPositions() ); - assert( patch.ring[3].hasValidPositions() ); - - p0() = initCornerVertex(patch,0); - p1() = initCornerVertex(patch,1); - p2() = initCornerVertex(patch,2); - p3() = initCornerVertex(patch,3); - - e0_p() = initPositiveEdgeVertex(patch,0, p0()); - e1_p() = initPositiveEdgeVertex(patch,1, p1()); - e2_p() = initPositiveEdgeVertex(patch,2, p2()); - e3_p() = initPositiveEdgeVertex(patch,3, p3()); - - e0_m() = initNegativeEdgeVertex(patch,0, p0()); - e1_m() = initNegativeEdgeVertex(patch,1, p1()); - e2_m() = initNegativeEdgeVertex(patch,2, p2()); - e3_m() = initNegativeEdgeVertex(patch,3, p3()); - - const unsigned int face_valence_p0 = patch.ring[0].face_valence; - const unsigned int face_valence_p1 = patch.ring[1].face_valence; - const unsigned int face_valence_p2 = patch.ring[2].face_valence; - const unsigned int face_valence_p3 = patch.ring[3].face_valence; - - initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); - initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() ); - initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() ); - initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() ); - - } - - __noinline void init_crackfix(const CatmullClarkPatch& patch, - const BezierCurve* border0, - const BezierCurve* border1, - const BezierCurve* border2, - const BezierCurve* border3) - { - assert( patch.ring[0].hasValidPositions() ); - assert( patch.ring[1].hasValidPositions() ); - assert( patch.ring[2].hasValidPositions() ); - assert( patch.ring[3].hasValidPositions() ); - - p0() = initCornerVertex(patch,0); - p1() = initCornerVertex(patch,1); - p2() = initCornerVertex(patch,2); - p3() = initCornerVertex(patch,3); - - e0_p() = initPositiveEdgeVertex(patch,0, p0()); - e1_p() = initPositiveEdgeVertex(patch,1, p1()); - e2_p() = initPositiveEdgeVertex(patch,2, p2()); - e3_p() = initPositiveEdgeVertex(patch,3, p3()); - - e0_m() = initNegativeEdgeVertex(patch,0, p0()); - e1_m() = initNegativeEdgeVertex(patch,1, p1()); - e2_m() = initNegativeEdgeVertex(patch,2, p2()); - e3_m() = initNegativeEdgeVertex(patch,3, p3()); - - if (unlikely(border0 != nullptr)) - { - p0() = border0->v0; - e0_p() = border0->v1; - e1_m() = border0->v2; - p1() = border0->v3; - } - - if (unlikely(border1 != nullptr)) - { - p1() = border1->v0; - e1_p() = border1->v1; - e2_m() = border1->v2; - p2() = border1->v3; - } - - if (unlikely(border2 != nullptr)) - { - p2() = border2->v0; - e2_p() = border2->v1; - e3_m() = border2->v2; - p3() = border2->v3; - } - - if (unlikely(border3 != nullptr)) - { - p3() = border3->v0; - e3_p() = border3->v1; - e0_m() = border3->v2; - p0() = border3->v3; - } - - const unsigned int face_valence_p0 = patch.ring[0].face_valence; - const unsigned int face_valence_p1 = patch.ring[1].face_valence; - const unsigned int face_valence_p2 = patch.ring[2].face_valence; - const unsigned int face_valence_p3 = patch.ring[3].face_valence; - - initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); - initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() ); - initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() ); - initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() ); - } - - - void computeGregoryPatchFacePoints(const unsigned int face_valence, - const Vertex& r_e_p, - const Vertex& r_e_m, - const Vertex& p_vtx, - const Vertex& e0_p_vtx, - const Vertex& e1_m_vtx, - const unsigned int face_valence_p1, - const Vertex& e0_m_vtx, - const Vertex& e3_p_vtx, - const unsigned int face_valence_p3, - Vertex& f_p_vtx, - Vertex& f_m_vtx, - const float d = 3.0f) - { - //const float c = cosf(2.0*M_PI/(float)face_valence); - //const float c_e_p = cosf(2.0*M_PI/(float)face_valence_p1); - //const float c_e_m = cosf(2.0*M_PI/(float)face_valence_p3); - - const float c = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); - const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1); - const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3); - - - f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); - f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); - f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); - f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); - } - - __noinline void init(const GeneralCatmullClarkPatch& patch) - { - assert(patch.size() == 4); -#if 0 - CatmullClarkPatch qpatch; patch.init(qpatch); - init(qpatch); -#else - const float face_valence_p0 = patch.ring[0].face_valence; - const float face_valence_p1 = patch.ring[1].face_valence; - const float face_valence_p2 = patch.ring[2].face_valence; - const float face_valence_p3 = patch.ring[3].face_valence; - - Vertex p0_r_p, p0_r_m; - patch.ring[0].computeGregoryPatchEdgePoints( p0(), e0_p(), e0_m(), p0_r_p, p0_r_m ); - - Vertex p1_r_p, p1_r_m; - patch.ring[1].computeGregoryPatchEdgePoints( p1(), e1_p(), e1_m(), p1_r_p, p1_r_m ); - - Vertex p2_r_p, p2_r_m; - patch.ring[2].computeGregoryPatchEdgePoints( p2(), e2_p(), e2_m(), p2_r_p, p2_r_m ); - - Vertex p3_r_p, p3_r_m; - patch.ring[3].computeGregoryPatchEdgePoints( p3(), e3_p(), e3_m(), p3_r_p, p3_r_m ); - - computeGregoryPatchFacePoints(face_valence_p0, p0_r_p, p0_r_m, p0(), e0_p(), e1_m(), face_valence_p1, e0_m(), e3_p(), face_valence_p3, f0_p(), f0_m() ); - computeGregoryPatchFacePoints(face_valence_p1, p1_r_p, p1_r_m, p1(), e1_p(), e2_m(), face_valence_p2, e1_m(), e0_p(), face_valence_p0, f1_p(), f1_m() ); - computeGregoryPatchFacePoints(face_valence_p2, p2_r_p, p2_r_m, p2(), e2_p(), e3_m(), face_valence_p3, e2_m(), e1_p(), face_valence_p1, f2_p(), f2_m() ); - computeGregoryPatchFacePoints(face_valence_p3, p3_r_p, p3_r_m, p3(), e3_p(), e0_m(), face_valence_p0, e3_m(), e2_p(), face_valence_p3, f3_p(), f3_m() ); - -#endif - } - - - __forceinline void convert_to_bezier() - { - f0_p() = (f0_p() + f0_m()) * 0.5f; - f1_p() = (f1_p() + f1_m()) * 0.5f; - f2_p() = (f2_p() + f2_m()) * 0.5f; - f3_p() = (f3_p() + f3_m()) * 0.5f; - f0_m() = Vertex( zero ); - f1_m() = Vertex( zero ); - f2_m() = Vertex( zero ); - f3_m() = Vertex( zero ); - } - - static __forceinline void computeInnerVertices(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv, - Vertex_t& matrix_11, Vertex_t& matrix_12, Vertex_t& matrix_22, Vertex_t& matrix_21) - { - if (unlikely(uu == 0.0f || uu == 1.0f || vv == 0.0f || vv == 1.0f)) - { - matrix_11 = matrix[1][1]; - matrix_12 = matrix[1][2]; - matrix_22 = matrix[2][2]; - matrix_21 = matrix[2][1]; - } - else - { - const Vertex_t f0_p = matrix[1][1]; - const Vertex_t f1_p = matrix[1][2]; - const Vertex_t f2_p = matrix[2][2]; - const Vertex_t f3_p = matrix[2][1]; - - const Vertex_t f0_m = f_m[0][0]; - const Vertex_t f1_m = f_m[0][1]; - const Vertex_t f2_m = f_m[1][1]; - const Vertex_t f3_m = f_m[1][0]; - - matrix_11 = ( uu * f0_p + vv * f0_m)*rcp(uu+vv); - matrix_12 = ((1.0f-uu) * f1_m + vv * f1_p)*rcp(1.0f-uu+vv); - matrix_22 = ((1.0f-uu) * f2_p + (1.0f-vv) * f2_m)*rcp(2.0f-uu-vv); - matrix_21 = ( uu * f3_m + (1.0f-vv) * f3_p)*rcp(1.0f+uu-vv); - } - } - - template - static __forceinline void computeInnerVertices(const Vertex v[4][4], const Vertex f[2][2], - size_t i, const vfloat& uu, const vfloat& vv, vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) - { - const auto m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); - - const vfloat f0_p = v[1][1][i]; - const vfloat f1_p = v[1][2][i]; - const vfloat f2_p = v[2][2][i]; - const vfloat f3_p = v[2][1][i]; - - const vfloat f0_m = f[0][0][i]; - const vfloat f1_m = f[0][1][i]; - const vfloat f2_m = f[1][1][i]; - const vfloat f3_m = f[1][0][i]; - - const vfloat one_minus_uu = vfloat(1.0f) - uu; - const vfloat one_minus_vv = vfloat(1.0f) - vv; - - const vfloat f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); - const vfloat f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); - const vfloat f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); - const vfloat f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); - - matrix_11 = select(m_border,f0_p,f0_i); - matrix_12 = select(m_border,f1_p,f1_i); - matrix_22 = select(m_border,f2_p,f2_i); - matrix_21 = select(m_border,f3_p,f3_i); - } - - static __forceinline Vertex eval(const Vertex matrix[4][4], const Vertex f[2][2], const float& uu, const float& vv) - { - Vertex_t v_11, v_12, v_22, v_21; - computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); - - const Vec4 Bu = BezierBasis::eval(uu); - const Vec4 Bv = BezierBasis::eval(vv); - - return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), - madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), - madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), - Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); - } - - static __forceinline Vertex eval_du(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative - { - Vertex_t v_11, v_12, v_22, v_21; - computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); - - const Vec4 Bu = BezierBasis::derivative(uu); - const Vec4 Bv = BezierBasis::eval(vv); - - return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), - madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), - madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), - Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); - } - - static __forceinline Vertex eval_dv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative - { - Vertex_t v_11, v_12, v_22, v_21; - computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); - - const Vec4 Bu = BezierBasis::eval(uu); - const Vec4 Bv = BezierBasis::derivative(vv); - - return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), - madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), - madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), - Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); - } - - static __forceinline Vertex eval_dudu(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative - { - Vertex_t v_11, v_12, v_22, v_21; - computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); - - const Vec4 Bu = BezierBasis::derivative2(uu); - const Vec4 Bv = BezierBasis::eval(vv); - - return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), - madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), - madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), - Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); - } - - static __forceinline Vertex eval_dvdv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative - { - Vertex_t v_11, v_12, v_22, v_21; - computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); - - const Vec4 Bu = BezierBasis::eval(uu); - const Vec4 Bv = BezierBasis::derivative2(vv); - - return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), - madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), - madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), - Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); - } - - static __forceinline Vertex eval_dudv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative - { - Vertex_t v_11, v_12, v_22, v_21; - computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); - - const Vec4 Bu = BezierBasis::derivative(uu); - const Vec4 Bv = BezierBasis::derivative(vv); - - return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), - madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), - madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), - Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); - } - - __forceinline Vertex eval(const float uu, const float vv) const { - return eval(v,f,uu,vv); - } - - __forceinline Vertex eval_du( const float uu, const float vv) const { - return eval_du(v,f,uu,vv); - } - - __forceinline Vertex eval_dv( const float uu, const float vv) const { - return eval_dv(v,f,uu,vv); - } - - __forceinline Vertex eval_dudu( const float uu, const float vv) const { - return eval_dudu(v,f,uu,vv); - } - - __forceinline Vertex eval_dvdv( const float uu, const float vv) const { - return eval_dvdv(v,f,uu,vv); - } - - __forceinline Vertex eval_dudv( const float uu, const float vv) const { - return eval_dudv(v,f,uu,vv); - } - - static __forceinline Vertex normal(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv) // FIXME: why not using basis functions - { - /* interpolate inner vertices */ - Vertex_t matrix_11, matrix_12, matrix_22, matrix_21; - computeInnerVertices(matrix,f_m,uu,vv,matrix_11, matrix_12, matrix_22, matrix_21); - - /* tangentU */ - const Vertex_t col0 = deCasteljau(vv, (Vertex_t)matrix[0][0], (Vertex_t)matrix[1][0], (Vertex_t)matrix[2][0], (Vertex_t)matrix[3][0]); - const Vertex_t col1 = deCasteljau(vv, (Vertex_t)matrix[0][1], (Vertex_t)matrix_11 , (Vertex_t)matrix_21 , (Vertex_t)matrix[3][1]); - const Vertex_t col2 = deCasteljau(vv, (Vertex_t)matrix[0][2], (Vertex_t)matrix_12 , (Vertex_t)matrix_22 , (Vertex_t)matrix[3][2]); - const Vertex_t col3 = deCasteljau(vv, (Vertex_t)matrix[0][3], (Vertex_t)matrix[1][3], (Vertex_t)matrix[2][3], (Vertex_t)matrix[3][3]); - - const Vertex_t tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); - - /* tangentV */ - const Vertex_t row0 = deCasteljau(uu, (Vertex_t)matrix[0][0], (Vertex_t)matrix[0][1], (Vertex_t)matrix[0][2], (Vertex_t)matrix[0][3]); - const Vertex_t row1 = deCasteljau(uu, (Vertex_t)matrix[1][0], (Vertex_t)matrix_11 , (Vertex_t)matrix_12 , (Vertex_t)matrix[1][3]); - const Vertex_t row2 = deCasteljau(uu, (Vertex_t)matrix[2][0], (Vertex_t)matrix_21 , (Vertex_t)matrix_22 , (Vertex_t)matrix[2][3]); - const Vertex_t row3 = deCasteljau(uu, (Vertex_t)matrix[3][0], (Vertex_t)matrix[3][1], (Vertex_t)matrix[3][2], (Vertex_t)matrix[3][3]); - - const Vertex_t tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); - - /* normal = tangentU x tangentV */ - const Vertex_t n = cross(tangentU,tangentV); - - return n; - } - - __forceinline Vertex normal( const float uu, const float vv) const { - return normal(v,f,uu,vv); - } - - __forceinline void eval(const float u, const float v, - Vertex* P, Vertex* dPdu, Vertex* dPdv, - Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, - const float dscale = 1.0f) const - { - if (P) { - *P = eval(u,v); - } - if (dPdu) { - assert(dPdu); *dPdu = eval_du(u,v)*dscale; - assert(dPdv); *dPdv = eval_dv(u,v)*dscale; - } - if (ddPdudu) { - assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); - assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); - assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); - } - } - - template - static __forceinline vfloat eval(const Vertex v[4][4], const Vertex f[2][2], - const size_t i, const vfloat& uu, const vfloat& vv, const Vec4& u_n, const Vec4& v_n, - vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) - { - const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i])))); - const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(matrix_11 ),madd(v_n[2],vfloat(matrix_21 ),v_n[3] * vfloat(v[3][1][i])))); - const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(matrix_12 ),madd(v_n[2],vfloat(matrix_22 ),v_n[3] * vfloat(v[3][2][i])))); - const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i])))); - return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); - } - - template - static __forceinline void eval(const Vertex v[4][4], const Vertex f[2][2], - const vbool& valid, const vfloat& uu, const vfloat& vv, - float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, - const float dscale, const size_t dstride, const size_t N) - { - if (P) { - const Vec4 u_n = BezierBasis::eval(uu); - const Vec4 v_n = BezierBasis::eval(vv); - for (size_t i=0; i u_n = BezierBasis::derivative(uu); - const Vec4 v_n = BezierBasis::eval(vv); - for (size_t i=0; i u_n = BezierBasis::eval(uu); - const Vec4 v_n = BezierBasis::derivative(vv); - for (size_t i=0; i u_n = BezierBasis::derivative2(uu); - const Vec4 v_n = BezierBasis::eval(vv); - for (size_t i=0; i u_n = BezierBasis::eval(uu); - const Vec4 v_n = BezierBasis::derivative2(vv); - for (size_t i=0; i u_n = BezierBasis::derivative(uu); - const Vec4 v_n = BezierBasis::derivative(vv); - for (size_t i=0; i - __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, - float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, - const float dscale, const size_t dstride, const size_t N) const { - eval(v,f,valid,uu,vv,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); - } - - template - static __forceinline Vec3 eval_t(const Vertex matrix[4][4], const Vec3 f[2][2], const T& uu, const T& vv) - { - typedef typename T::Bool M; - const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); - - const Vec3 f0_p = Vec3(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); - const Vec3 f1_p = Vec3(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); - const Vec3 f2_p = Vec3(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); - const Vec3 f3_p = Vec3(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); - - const Vec3 f0_m = f[0][0]; - const Vec3 f1_m = f[0][1]; - const Vec3 f2_m = f[1][1]; - const Vec3 f3_m = f[1][0]; - - const T one_minus_uu = T(1.0f) - uu; - const T one_minus_vv = T(1.0f) - vv; - - const Vec3 f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); - const Vec3 f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); - const Vec3 f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); - const Vec3 f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); - - const Vec3 F0( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) ); - const Vec3 F1( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) ); - const Vec3 F2( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) ); - const Vec3 F3( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) ); - - const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu; - const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv; - const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu); - const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv); - const T B2_u = 3.0f * (uu * one_minus_uu * uu); - const T B2_v = 3.0f * (vv * one_minus_vv * vv); - const T B3_u = uu * uu * uu; - const T B3_v = vv * vv * vv; - - const T x = madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u * matrix[0][3].x))), - madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,F0.x ,madd(B2_u,F1.x ,B3_u * matrix[1][3].x))), - madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,F3.x ,madd(B2_u,F2.x ,B3_u * matrix[2][3].x))), - B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u * matrix[3][3].x)))))); - - const T y = madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u * matrix[0][3].y))), - madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,F0.y ,madd(B2_u,F1.y ,B3_u * matrix[1][3].y))), - madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,F3.y ,madd(B2_u,F2.y ,B3_u * matrix[2][3].y))), - B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u * matrix[3][3].y)))))); - - const T z = madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u * matrix[0][3].z))), - madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,F0.z ,madd(B2_u,F1.z ,B3_u * matrix[1][3].z))), - madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,F3.z ,madd(B2_u,F2.z ,B3_u * matrix[2][3].z))), - B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u * matrix[3][3].z)))))); - - return Vec3(x,y,z); - } - - template - __forceinline Vec3 eval(const T& uu, const T& vv) const - { - Vec3 ff[2][2]; - ff[0][0] = Vec3(f[0][0]); - ff[0][1] = Vec3(f[0][1]); - ff[1][1] = Vec3(f[1][1]); - ff[1][0] = Vec3(f[1][0]); - return eval_t(v,ff,uu,vv); - } - - template - static __forceinline Vec3 normal_t(const Vertex matrix[4][4], const Vec3 f[2][2], const T& uu, const T& vv) - { - typedef typename T::Bool M; - - const Vec3 f0_p = Vec3(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); - const Vec3 f1_p = Vec3(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); - const Vec3 f2_p = Vec3(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); - const Vec3 f3_p = Vec3(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); - - const Vec3 f0_m = f[0][0]; - const Vec3 f1_m = f[0][1]; - const Vec3 f2_m = f[1][1]; - const Vec3 f3_m = f[1][0]; - - const T one_minus_uu = T(1.0f) - uu; - const T one_minus_vv = T(1.0f) - vv; - - const Vec3 f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); - const Vec3 f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); - const Vec3 f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); - const Vec3 f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); - -#if 1 - const M m_corner0 = (uu == 0.0f) & (vv == 0.0f); - const M m_corner1 = (uu == 1.0f) & (vv == 0.0f); - const M m_corner2 = (uu == 1.0f) & (vv == 1.0f); - const M m_corner3 = (uu == 0.0f) & (vv == 1.0f); - const Vec3 matrix_11( select(m_corner0,f0_p.x,f0_i.x), select(m_corner0,f0_p.y,f0_i.y), select(m_corner0,f0_p.z,f0_i.z) ); - const Vec3 matrix_12( select(m_corner1,f1_p.x,f1_i.x), select(m_corner1,f1_p.y,f1_i.y), select(m_corner1,f1_p.z,f1_i.z) ); - const Vec3 matrix_22( select(m_corner2,f2_p.x,f2_i.x), select(m_corner2,f2_p.y,f2_i.y), select(m_corner2,f2_p.z,f2_i.z) ); - const Vec3 matrix_21( select(m_corner3,f3_p.x,f3_i.x), select(m_corner3,f3_p.y,f3_i.y), select(m_corner3,f3_p.z,f3_i.z) ); -#else - const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); - const Vec3 matrix_11( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) ); - const Vec3 matrix_12( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) ); - const Vec3 matrix_22( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) ); - const Vec3 matrix_21( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) ); -#endif - - const Vec3 matrix_00 = Vec3(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z); - const Vec3 matrix_10 = Vec3(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z); - const Vec3 matrix_20 = Vec3(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z); - const Vec3 matrix_30 = Vec3(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z); - - const Vec3 matrix_01 = Vec3(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z); - const Vec3 matrix_02 = Vec3(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z); - const Vec3 matrix_03 = Vec3(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z); - - const Vec3 matrix_31 = Vec3(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z); - const Vec3 matrix_32 = Vec3(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z); - const Vec3 matrix_33 = Vec3(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z); - - const Vec3 matrix_13 = Vec3(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z); - const Vec3 matrix_23 = Vec3(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z); - - /* tangentU */ - const Vec3 col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30); - const Vec3 col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31); - const Vec3 col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32); - const Vec3 col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33); - - const Vec3 tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); - - /* tangentV */ - const Vec3 row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03); - const Vec3 row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13); - const Vec3 row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23); - const Vec3 row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33); - - const Vec3 tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); - - /* normal = tangentU x tangentV */ - const Vec3 n = cross(tangentU,tangentV); - return n; - } - - template - __forceinline Vec3 normal(const T& uu, const T& vv) const - { - Vec3 ff[2][2]; - ff[0][0] = Vec3(f[0][0]); - ff[0][1] = Vec3(f[0][1]); - ff[1][1] = Vec3(f[1][1]); - ff[1][0] = Vec3(f[1][0]); - return normal_t(v,ff,uu,vv); - } - - __forceinline BBox bounds() const - { - const Vertex *const cv = &v[0][0]; - BBox bounds (cv[0]); - for (size_t i=1; i<16; i++) - bounds.extend( cv[i] ); - bounds.extend(f[0][0]); - bounds.extend(f[1][0]); - bounds.extend(f[1][1]); - bounds.extend(f[1][1]); - return bounds; - } - - friend embree_ostream operator<<(embree_ostream o, const GregoryPatchT& p) - { - for (size_t y=0; y<4; y++) - for (size_t x=0; x<4; x++) - o << "v[" << y << "][" << x << "] " << p.v[y][x] << embree_endl; - - for (size_t y=0; y<2; y++) - for (size_t x=0; x<2; x++) - o << "f[" << y << "][" << x << "] " << p.f[y][x] << embree_endl; - return o; - } - }; - - typedef GregoryPatchT GregoryPatch3fa; - - template - __forceinline BezierPatchT::BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride) - { - CatmullClarkPatchT patch(edge,vertices,stride); - GregoryPatchT gpatch(patch); - gpatch.convert_to_bezier(); - for (size_t y=0; y<4; y++) - for (size_t x=0; x<4; x++) - matrix[y][x] = (Vertex_t)gpatch.v[y][x]; - } - - template - __forceinline BezierPatchT::BezierPatchT(const CatmullClarkPatchT& patch) - { - GregoryPatchT gpatch(patch); - gpatch.convert_to_bezier(); - for (size_t y=0; y<4; y++) - for (size_t x=0; x<4; x++) - matrix[y][x] = (Vertex_t)gpatch.v[y][x]; - } - - template - __forceinline BezierPatchT::BezierPatchT(const CatmullClarkPatchT& patch, - const BezierCurveT* border0, - const BezierCurveT* border1, - const BezierCurveT* border2, - const BezierCurveT* border3) - { - GregoryPatchT gpatch(patch,border0,border1,border2,border3); - gpatch.convert_to_bezier(); - for (size_t y=0; y<4; y++) - for (size_t x=0; x<4; x++) - matrix[y][x] = (Vertex_t)gpatch.v[y][x]; - } -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h deleted file mode 100644 index 85effd02cf..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "gregory_patch.h" - -namespace embree -{ - class __aligned(64) DenseGregoryPatch3fa - { - typedef Vec3fa Vec3fa_4x4[4][4]; - public: - - __forceinline DenseGregoryPatch3fa (const GregoryPatch3fa& patch) - { - for (size_t y=0; y<4; y++) - for (size_t x=0; x<4; x++) - matrix[y][x] = Vec3ff(patch.v[y][x], 0.0f); - - matrix[0][0].w = patch.f[0][0].x; - matrix[0][1].w = patch.f[0][0].y; - matrix[0][2].w = patch.f[0][0].z; - matrix[0][3].w = 0.0f; - - matrix[1][0].w = patch.f[0][1].x; - matrix[1][1].w = patch.f[0][1].y; - matrix[1][2].w = patch.f[0][1].z; - matrix[1][3].w = 0.0f; - - matrix[2][0].w = patch.f[1][1].x; - matrix[2][1].w = patch.f[1][1].y; - matrix[2][2].w = patch.f[1][1].z; - matrix[2][3].w = 0.0f; - - matrix[3][0].w = patch.f[1][0].x; - matrix[3][1].w = patch.f[1][0].y; - matrix[3][2].w = patch.f[1][0].z; - matrix[3][3].w = 0.0f; - } - - __forceinline void extract_f_m(Vec3fa f_m[2][2]) const - { - f_m[0][0] = Vec3fa( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); - f_m[0][1] = Vec3fa( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); - f_m[1][1] = Vec3fa( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); - f_m[1][0] = Vec3fa( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); - } - - __forceinline Vec3fa eval(const float uu, const float vv) const - { - __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); - return GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); - } - - __forceinline Vec3fa normal(const float uu, const float vv) const - { - __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); - return GregoryPatch3fa::normal(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); - } - - template - __forceinline Vec3 eval(const T &uu, const T &vv) const - { - Vec3 f_m[2][2]; - f_m[0][0] = Vec3( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); - f_m[0][1] = Vec3( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); - f_m[1][1] = Vec3( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); - f_m[1][0] = Vec3( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); - return GregoryPatch3fa::eval_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); - } - - template - __forceinline Vec3 normal(const T &uu, const T &vv) const - { - Vec3 f_m[2][2]; - f_m[0][0] = Vec3( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); - f_m[0][1] = Vec3( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); - f_m[1][1] = Vec3( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); - f_m[1][0] = Vec3( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); - return GregoryPatch3fa::normal_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); - } - - __forceinline void eval(const float u, const float v, - Vec3fa* P, Vec3fa* dPdu, Vec3fa* dPdv, Vec3fa* ddPdudu, Vec3fa* ddPdvdv, Vec3fa* ddPdudv, - const float dscale = 1.0f) const - { - __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); - if (P) { - *P = GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,u,v); - } - if (dPdu) { - assert(dPdu); *dPdu = GregoryPatch3fa::eval_du(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; - assert(dPdv); *dPdv = GregoryPatch3fa::eval_dv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; - } - if (ddPdudu) { - assert(ddPdudu); *ddPdudu = GregoryPatch3fa::eval_dudu(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); - assert(ddPdvdv); *ddPdvdv = GregoryPatch3fa::eval_dvdv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); - assert(ddPdudv); *ddPdudv = GregoryPatch3fa::eval_dudv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); - } - } - - template - __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, float* P, float* dPdu, float* dPdv, const float dscale, const size_t dstride, const size_t N) const - { - __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); - GregoryPatch3fa::eval(matrix,f_m,valid,uu,vv,P,dPdu,dPdv,dscale,dstride,N); - } - - private: - Vec3ff matrix[4][4]; // f_p/m points are stored in 4th component - }; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h b/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h deleted file mode 100644 index 4fd741c879..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/default.h" - -namespace embree -{ - struct __aligned(16) GridRange - { - unsigned int u_start; - unsigned int u_end; - unsigned int v_start; - unsigned int v_end; - - __forceinline GridRange() {} - - __forceinline GridRange(unsigned int u_start, unsigned int u_end, unsigned int v_start, unsigned int v_end) - : u_start(u_start), u_end(u_end), v_start(v_start), v_end(v_end) {} - - __forceinline unsigned int width() const { - return u_end-u_start+1; - } - - __forceinline unsigned int height() const { - return v_end-v_start+1; - } - - __forceinline bool hasLeafSize() const - { - const unsigned int u_size = u_end-u_start+1; - const unsigned int v_size = v_end-v_start+1; - assert(u_size >= 1); - assert(v_size >= 1); - return u_size <= 3 && v_size <= 3; - } - - static __forceinline unsigned int split(unsigned int start,unsigned int end) - { - const unsigned int center = (start+end)/2; - assert (center > start); - assert (center < end); - return center; - } - - __forceinline void split(GridRange& r0, GridRange& r1) const - { - assert( hasLeafSize() == false ); - const unsigned int u_size = u_end-u_start+1; - const unsigned int v_size = v_end-v_start+1; - r0 = *this; - r1 = *this; - - if (u_size >= v_size) - { - const unsigned int u_mid = split(u_start,u_end); - r0.u_end = u_mid; - r1.u_start = u_mid; - } - else - { - const unsigned int v_mid = split(v_start,v_end); - r0.v_end = v_mid; - r1.v_start = v_mid; - } - } - - __forceinline unsigned int splitIntoSubRanges(GridRange r[4]) const - { - assert( !hasLeafSize() ); - unsigned int children = 0; - GridRange first,second; - split(first,second); - - if (first.hasLeafSize()) { - r[0] = first; - children++; - } - else { - first.split(r[0],r[1]); - children += 2; - } - - if (second.hasLeafSize()) { - r[children] = second; - children++; - } - else { - second.split(r[children+0],r[children+1]); - children += 2; - } - return children; - } - }; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h b/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h deleted file mode 100644 index fb350ca71f..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "catmullclark_coefficients.h" - -namespace embree -{ - class __aligned(32) HalfEdge - { - friend class SubdivMesh; - public: - - enum PatchType : char { - BILINEAR_PATCH = 0, //!< a bilinear patch - REGULAR_QUAD_PATCH = 1, //!< a regular quad patch can be represented as a B-Spline - IRREGULAR_QUAD_PATCH = 2, //!< an irregular quad patch can be represented as a Gregory patch - COMPLEX_PATCH = 3 //!< these patches need subdivision and cannot be processed by the above fast code paths - }; - - enum VertexType : char { - REGULAR_VERTEX = 0, //!< regular vertex - NON_MANIFOLD_EDGE_VERTEX = 1, //!< vertex of a non-manifold edge - }; - - __forceinline friend PatchType max( const PatchType& ty0, const PatchType& ty1) { - return (PatchType) max((int)ty0,(int)ty1); - } - - struct Edge - { - /*! edge constructor */ - __forceinline Edge(const uint32_t v0, const uint32_t v1) - : v0(v0), v1(v1) {} - - /*! create an 64 bit identifier that is unique for the not oriented edge */ - __forceinline operator uint64_t() const - { - uint32_t p0 = v0, p1 = v1; - if (p0next(); } - __forceinline const HalfEdge* rotate() const { return opposite()->next(); } - - __forceinline unsigned int getStartVertexIndex() const { return vtx_index; } - __forceinline unsigned int getEndVertexIndex () const { return next()->vtx_index; } - __forceinline Edge getEdge () const { return Edge(getStartVertexIndex(),getEndVertexIndex()); } - - - /*! tests if the start vertex of the edge is regular */ - __forceinline PatchType vertexType() const - { - const HalfEdge* p = this; - size_t face_valence = 0; - bool hasBorder = false; - - do - { - /* we need subdivision to handle edge creases */ - if (p->hasOpposite() && p->edge_crease_weight > 0.0f) - return COMPLEX_PATCH; - - face_valence++; - - /* test for quad */ - const HalfEdge* pp = p; - pp = pp->next(); if (pp == p) return COMPLEX_PATCH; - pp = pp->next(); if (pp == p) return COMPLEX_PATCH; - pp = pp->next(); if (pp == p) return COMPLEX_PATCH; - pp = pp->next(); if (pp != p) return COMPLEX_PATCH; - - /* continue with next face */ - p = p->prev(); - if (likely(p->hasOpposite())) - p = p->opposite(); - - /* if there is no opposite go the long way to the other side of the border */ - else - { - face_valence++; - hasBorder = true; - p = this; - while (p->hasOpposite()) - p = p->rotate(); - } - } while (p != this); - - /* calculate vertex type */ - if (face_valence == 2 && hasBorder) { - if (vertex_crease_weight == 0.0f ) return REGULAR_QUAD_PATCH; - else if (vertex_crease_weight == float(inf)) return REGULAR_QUAD_PATCH; - else return COMPLEX_PATCH; - } - else if (vertex_crease_weight != 0.0f) return COMPLEX_PATCH; - else if (face_valence == 3 && hasBorder) return REGULAR_QUAD_PATCH; - else if (face_valence == 4 && !hasBorder) return REGULAR_QUAD_PATCH; - else return IRREGULAR_QUAD_PATCH; - } - - /*! tests if this edge is part of a bilinear patch */ - __forceinline bool bilinearVertex() const { - return vertex_crease_weight == float(inf) && edge_crease_weight == float(inf); - } - - /*! calculates the type of the patch */ - __forceinline PatchType patchType() const - { - const HalfEdge* p = this; - PatchType ret = REGULAR_QUAD_PATCH; - bool bilinear = true; - - ret = max(ret,p->vertexType()); - bilinear &= p->bilinearVertex(); - if ((p = p->next()) == this) return COMPLEX_PATCH; - - ret = max(ret,p->vertexType()); - bilinear &= p->bilinearVertex(); - if ((p = p->next()) == this) return COMPLEX_PATCH; - - ret = max(ret,p->vertexType()); - bilinear &= p->bilinearVertex(); - if ((p = p->next()) == this) return COMPLEX_PATCH; - - ret = max(ret,p->vertexType()); - bilinear &= p->bilinearVertex(); - if ((p = p->next()) != this) return COMPLEX_PATCH; - - if (bilinear) return BILINEAR_PATCH; - return ret; - } - - /*! tests if the face is a regular b-spline face */ - __forceinline bool isRegularFace() const { - return patch_type == REGULAR_QUAD_PATCH; - } - - /*! tests if the face can be diced (using bspline or gregory patch) */ - __forceinline bool isGregoryFace() const { - return patch_type == IRREGULAR_QUAD_PATCH || patch_type == REGULAR_QUAD_PATCH; - } - - /*! tests if the base vertex of this half edge is a corner vertex */ - __forceinline bool isCorner() const { - return !hasOpposite() && !prev()->hasOpposite(); - } - - /*! tests if the vertex is attached to any border */ - __forceinline bool vertexHasBorder() const - { - const HalfEdge* p = this; - do { - if (!p->hasOpposite()) return true; - p = p->rotate(); - } while (p != this); - return false; - } - - /*! tests if the face this half edge belongs to has some border */ - __forceinline bool faceHasBorder() const - { - const HalfEdge* p = this; - do { - if (p->vertexHasBorder()) return true; - p = p->next(); - } while (p != this); - return false; - } - - /*! calculates conservative bounds of a catmull clark subdivision face */ - __forceinline BBox3fa bounds(const BufferView& vertices) const - { - BBox3fa bounds = this->get1RingBounds(vertices); - for (const HalfEdge* p=this->next(); p!=this; p=p->next()) - bounds.extend(p->get1RingBounds(vertices)); - return bounds; - } - - /*! tests if this is a valid patch */ - __forceinline bool valid(const BufferView& vertices) const - { - size_t N = 1; - if (!this->validRing(vertices)) return false; - for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++) { - if (!p->validRing(vertices)) return false; - } - return N >= 3 && N <= MAX_PATCH_VALENCE; - } - - /*! counts number of polygon edges */ - __forceinline unsigned int numEdges() const - { - unsigned int N = 1; - for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++); - return N; - } - - /*! calculates face and edge valence */ - __forceinline void calculateFaceValenceAndEdgeValence(size_t& faceValence, size_t& edgeValence) const - { - faceValence = 0; - edgeValence = 0; - - const HalfEdge* p = this; - do - { - /* calculate bounds of current face */ - unsigned int numEdges = p->numEdges(); - assert(numEdges >= 3); - edgeValence += numEdges-2; - - faceValence++; - p = p->prev(); - - /* continue with next face */ - if (likely(p->hasOpposite())) - p = p->opposite(); - - /* if there is no opposite go the long way to the other side of the border */ - else { - faceValence++; - edgeValence++; - p = this; - while (p->hasOpposite()) - p = p->opposite()->next(); - } - - } while (p != this); - } - - /*! stream output */ - friend __forceinline std::ostream &operator<<(std::ostream &o, const HalfEdge &h) - { - return o << "{ " << - "vertex = " << h.vtx_index << ", " << //" -> " << h.next()->vtx_index << ", " << - "prev = " << h.prev_half_edge_ofs << ", " << - "next = " << h.next_half_edge_ofs << ", " << - "opposite = " << h.opposite_half_edge_ofs << ", " << - "edge_crease = " << h.edge_crease_weight << ", " << - "vertex_crease = " << h.vertex_crease_weight << ", " << - //"edge_level = " << h.edge_level << - " }"; - } - - private: - - /*! calculates the bounds of the face associated with the half-edge */ - __forceinline BBox3fa getFaceBounds(const BufferView& vertices) const - { - BBox3fa b = vertices[getStartVertexIndex()]; - for (const HalfEdge* p = next(); p!=this; p=p->next()) { - b.extend(vertices[p->getStartVertexIndex()]); - } - return b; - } - - /*! calculates the bounds of the 1-ring associated with the vertex of the half-edge */ - __forceinline BBox3fa get1RingBounds(const BufferView& vertices) const - { - BBox3fa bounds = empty; - const HalfEdge* p = this; - do - { - /* calculate bounds of current face */ - bounds.extend(p->getFaceBounds(vertices)); - p = p->prev(); - - /* continue with next face */ - if (likely(p->hasOpposite())) - p = p->opposite(); - - /* if there is no opposite go the long way to the other side of the border */ - else { - p = this; - while (p->hasOpposite()) - p = p->opposite()->next(); - } - - } while (p != this); - - return bounds; - } - - /*! tests if this is a valid face */ - __forceinline bool validFace(const BufferView& vertices, size_t& N) const - { - const Vec3fa v = vertices[getStartVertexIndex()]; - if (!isvalid(v)) return false; - size_t n = 1; - for (const HalfEdge* p = next(); p!=this; p=p->next(), n++) { - const Vec3fa v = vertices[p->getStartVertexIndex()]; - if (!isvalid(v)) return false; - } - N += n-2; - return n >= 3 && n <= MAX_PATCH_VALENCE; - } - - /*! tests if this is a valid ring */ - __forceinline bool validRing(const BufferView& vertices) const - { - size_t faceValence = 0; - size_t edgeValence = 0; - - const HalfEdge* p = this; - do - { - /* calculate bounds of current face */ - if (!p->validFace(vertices,edgeValence)) - return false; - - faceValence++; - p = p->prev(); - - /* continue with next face */ - if (likely(p->hasOpposite())) - p = p->opposite(); - - /* if there is no opposite go the long way to the other side of the border */ - else { - faceValence++; - edgeValence++; - p = this; - while (p->hasOpposite()) - p = p->opposite()->next(); - } - - } while (p != this); - - return faceValence <= MAX_RING_FACE_VALENCE && edgeValence <= MAX_RING_EDGE_VALENCE; - } - - private: - unsigned int vtx_index; //!< index of edge start vertex - int next_half_edge_ofs; //!< relative offset to next half edge of face - int prev_half_edge_ofs; //!< relative offset to previous half edge of face - int opposite_half_edge_ofs; //!< relative offset to opposite half edge - - public: - float edge_crease_weight; //!< crease weight attached to edge - float vertex_crease_weight; //!< crease weight attached to start vertex - float edge_level; //!< subdivision factor for edge - PatchType patch_type; //!< stores type of subdiv patch - VertexType vertex_type; //!< stores type of the start vertex - char align[2]; - }; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h deleted file mode 100644 index 9fab79cf0c..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../common/default.h" -#include "bezier_curve.h" - -namespace embree -{ - template - struct HermiteCurveT : BezierCurveT - { - __forceinline HermiteCurveT() {} - - __forceinline HermiteCurveT(const BezierCurveT& curve) - : BezierCurveT(curve) {} - - __forceinline HermiteCurveT(const Vertex& v0, const Vertex& t0, const Vertex& v1, const Vertex& t1) - : BezierCurveT(v0,madd(1.0f/3.0f,t0,v0),nmadd(1.0f/3.0f,t1,v1),v1) {} - - __forceinline HermiteCurveT xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const - { - const Vec3ff q0(xfmVector(space,this->v0-p), this->v0.w); - const Vec3ff q1(xfmVector(space,this->v1-p), this->v1.w); - const Vec3ff q2(xfmVector(space,this->v2-p), this->v2.w); - const Vec3ff q3(xfmVector(space,this->v3-p), this->v3.w); - return BezierCurveT(q0,q1,q2,q3); - } - }; - - __forceinline HermiteCurveT enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const HermiteCurveT& curve) { - return HermiteCurveT(enlargeRadiusToMinWidth(context,geom,ray_org,BezierCurveT(curve))); - } - - typedef HermiteCurveT HermiteCurve3fa; -} - diff --git a/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h deleted file mode 100644 index f4a854af7f..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h +++ /dev/null @@ -1,403 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "bezier_curve.h" - -namespace embree -{ - namespace isa - { - template - struct TensorLinearQuadraticBezierSurface - { - QuadraticBezierCurve L; - QuadraticBezierCurve R; - - __forceinline TensorLinearQuadraticBezierSurface() {} - - __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface& curve) - : L(curve.L), R(curve.R) {} - - __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) { - L = other.L; R = other.R; return *this; - } - - __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve& L, const QuadraticBezierCurve& R) - : L(L), R(R) {} - - __forceinline BBox bounds() const { - return merge(L.bounds(),R.bounds()); - } - }; - - template<> - struct TensorLinearQuadraticBezierSurface - { - QuadraticBezierCurve LR; - - __forceinline TensorLinearQuadraticBezierSurface() {} - - __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface& curve) - : LR(curve.LR) {} - - __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) { - LR = other.LR; return *this; - } - - __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve& LR) - : LR(LR) {} - - __forceinline BBox bounds() const - { - const BBox b = LR.bounds(); - const BBox bl(Vec2fa(b.lower),Vec2fa(b.upper)); - const BBox br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); - return merge(bl,br); - } - }; - - template - struct TensorLinearCubicBezierSurface - { - CubicBezierCurve L; - CubicBezierCurve R; - - __forceinline TensorLinearCubicBezierSurface() {} - - __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve) - : L(curve.L), R(curve.R) {} - - __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) { - L = other.L; R = other.R; return *this; - } - - __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve& L, const CubicBezierCurve& R) - : L(L), R(R) {} - - template class SourceCurve> - __forceinline static TensorLinearCubicBezierSurface fromCenterAndNormalCurve(const SourceCurve& center, const SourceCurve& normal) - { - SourceCurve vcurve = center; - SourceCurve ncurve = normal; - - /* here we construct a patch which follows the curve l(t) = - * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */ - - const Vec3ff p0 = vcurve.eval(0.0f); - const Vec3ff dp0 = vcurve.eval_du(0.0f); - const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); - - const Vec3fa n0 = ncurve.eval(0.0f); - const Vec3fa dn0 = ncurve.eval_du(0.0f); - - const Vec3ff p1 = vcurve.eval(1.0f); - const Vec3ff dp1 = vcurve.eval_du(1.0f); - const Vec3ff ddp1 = vcurve.eval_dudu(1.0f); - - const Vec3fa n1 = ncurve.eval(1.0f); - const Vec3fa dn1 = ncurve.eval_du(1.0f); - - const Vec3fa bt0 = cross(n0,dp0); - const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0); - - const Vec3fa bt1 = cross(n1,dp1); - const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1); - - const Vec3fa k0 = normalize(bt0); - const Vec3fa dk0 = dnormalize(bt0,dbt0); - - const Vec3fa k1 = normalize(bt1); - const Vec3fa dk1 = dnormalize(bt1,dbt1); - - const Vec3fa l0 = p0 - p0.w*k0; - const Vec3fa dl0 = dp0 - (dp0.w*k0 + p0.w*dk0); - - const Vec3fa r0 = p0 + p0.w*k0; - const Vec3fa dr0 = dp0 + (dp0.w*k0 + p0.w*dk0); - - const Vec3fa l1 = p1 - p1.w*k1; - const Vec3fa dl1 = dp1 - (dp1.w*k1 + p1.w*dk1); - - const Vec3fa r1 = p1 + p1.w*k1; - const Vec3fa dr1 = dp1 + (dp1.w*k1 + p1.w*dk1); - - const float scale = 1.0f/3.0f; - CubicBezierCurve L(l0,l0+scale*dl0,l1-scale*dl1,l1); - CubicBezierCurve R(r0,r0+scale*dr0,r1-scale*dr1,r1); - return TensorLinearCubicBezierSurface(L,R); - } - - __forceinline BBox bounds() const { - return merge(L.bounds(),R.bounds()); - } - - __forceinline BBox3fa accurateBounds() const { - return merge(L.accurateBounds(),R.accurateBounds()); - } - - __forceinline CubicBezierCurve reduce_v() const { - return merge(CubicBezierCurve>(L),CubicBezierCurve>(R)); - } - - __forceinline LinearBezierCurve reduce_u() const { - return LinearBezierCurve(L.bounds(),R.bounds()); - } - - __forceinline TensorLinearCubicBezierSurface xfm(const V& dx) const { - return TensorLinearCubicBezierSurface(L.xfm(dx),R.xfm(dx)); - } - - __forceinline TensorLinearCubicBezierSurface vxfm(const V& dx) const { - return TensorLinearCubicBezierSurface(L.vxfm(dx),R.vxfm(dx)); - } - - __forceinline TensorLinearCubicBezierSurface xfm(const V& dx, const V& p) const { - return TensorLinearCubicBezierSurface(L.xfm(dx,p),R.xfm(dx,p)); - } - - __forceinline TensorLinearCubicBezierSurface xfm(const LinearSpace3fa& space) const { - return TensorLinearCubicBezierSurface(L.xfm(space),R.xfm(space)); - } - - __forceinline TensorLinearCubicBezierSurface xfm(const LinearSpace3fa& space, const Vec3fa& p) const { - return TensorLinearCubicBezierSurface(L.xfm(space,p),R.xfm(space,p)); - } - - __forceinline TensorLinearCubicBezierSurface xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const { - return TensorLinearCubicBezierSurface(L.xfm(space,p,s),R.xfm(space,p,s)); - } - - __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const { - return TensorLinearCubicBezierSurface(L.clip(u),R.clip(u)); - } - - __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const { - return TensorLinearCubicBezierSurface(clerp(L,R,V(v.lower)),clerp(L,R,V(v.upper))); - } - - __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const { - return clip_v(v).clip_u(u); - } - - __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const - { - CubicBezierCurve L0,L1; L.split(L0,L1,u); - CubicBezierCurve R0,R1; R.split(R0,R1,u); - new (&left ) TensorLinearCubicBezierSurface(L0,R0); - new (&right) TensorLinearCubicBezierSurface(L1,R1); - } - - __forceinline TensorLinearCubicBezierSurface vsplit_u(vboolx& valid, const BBox1f& u) const { - valid = true; clear(valid,VSIZEX-1); - return TensorLinearCubicBezierSurface(L.split(u),R.split(u)); - } - - __forceinline V eval(const float u, const float v) const { - return clerp(L,R,V(v)).eval(u); - } - - __forceinline V eval_du(const float u, const float v) const { - return clerp(L,R,V(v)).eval_dt(u); - } - - __forceinline V eval_dv(const float u, const float v) const { - return (R-L).eval(u); - } - - __forceinline void eval(const float u, const float v, V& p, V& dpdu, V& dpdv) const - { - V p0, dp0du; L.eval(u,p0,dp0du); - V p1, dp1du; R.eval(u,p1,dp1du); - p = lerp(p0,p1,v); - dpdu = lerp(dp0du,dp1du,v); - dpdv = p1-p0; - } - - __forceinline TensorLinearQuadraticBezierSurface derivative_u() const { - return TensorLinearQuadraticBezierSurface(L.derivative(),R.derivative()); - } - - __forceinline CubicBezierCurve derivative_v() const { - return R-L; - } - - __forceinline V axis_u() const { - return (L.end()-L.begin())+(R.end()-R.begin()); - } - - __forceinline V axis_v() const { - return (R.begin()-L.begin())+(R.end()-L.end()); - } - - friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) - { - return cout << "TensorLinearCubicBezierSurface" << embree_endl - << "{" << embree_endl - << " L = " << a.L << ", " << embree_endl - << " R = " << a.R << embree_endl - << "}"; - } - - friend __forceinline TensorLinearCubicBezierSurface clerp(const TensorLinearCubicBezierSurface& a, const TensorLinearCubicBezierSurface& b, const float t) { - return TensorLinearCubicBezierSurface(clerp(a.L,b.L,V(t)), clerp(a.R,b.R,V(t))); - } - }; - - template<> - struct TensorLinearCubicBezierSurface - { - CubicBezierCurve LR; - - __forceinline TensorLinearCubicBezierSurface() {} - - __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve) - : LR(curve.LR) {} - - __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) { - LR = other.LR; return *this; - } - - __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve& LR) - : LR(LR) {} - - __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve& L, const CubicBezierCurve& R) - : LR(shuffle<0,1,0,1>(vfloat4(L.v0),vfloat4(R.v0)),shuffle<0,1,0,1>(vfloat4(L.v1),vfloat4(R.v1)),shuffle<0,1,0,1>(vfloat4(L.v2),vfloat4(R.v2)),shuffle<0,1,0,1>(vfloat4(L.v3),vfloat4(R.v3))) {} - - __forceinline CubicBezierCurve getL() const { - return CubicBezierCurve(Vec2fa(LR.v0),Vec2fa(LR.v1),Vec2fa(LR.v2),Vec2fa(LR.v3)); - } - - __forceinline CubicBezierCurve getR() const { - return CubicBezierCurve(Vec2fa(shuffle<2,3,2,3>(LR.v0)),Vec2fa(shuffle<2,3,2,3>(LR.v1)),Vec2fa(shuffle<2,3,2,3>(LR.v2)),Vec2fa(shuffle<2,3,2,3>(LR.v3))); - } - - __forceinline BBox bounds() const - { - const BBox b = LR.bounds(); - const BBox bl(Vec2fa(b.lower),Vec2fa(b.upper)); - const BBox br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); - return merge(bl,br); - } - - __forceinline BBox1f bounds(const Vec2fa& axis) const - { - const CubicBezierCurve LRx = LR; - const CubicBezierCurve LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); - const CubicBezierCurve LRa = cmadd(shuffle<0>(vfloat4(axis)),LRx,shuffle<1>(vfloat4(axis))*LRy); - const BBox Lb = LRa.bounds(); - const BBox Rb(shuffle<3>(Lb.lower),shuffle<3>(Lb.upper)); - const BBox b = merge(Lb,Rb); - return BBox1f(b.lower[0],b.upper[0]); - } - - __forceinline TensorLinearCubicBezierSurface xfm(const Vec2fa& dx) const - { - const CubicBezierCurve LRx = LR; - const CubicBezierCurve LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); - const CubicBezierCurve LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); - return TensorLinearCubicBezierSurface(CubicBezierCurve(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), - CubicBezierCurve(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); - } - - __forceinline TensorLinearCubicBezierSurface xfm(const Vec2fa& dx, const Vec2fa& p) const - { - const vfloat4 pxyxy = shuffle<0,1,0,1>(vfloat4(p)); - const CubicBezierCurve LRx = LR-pxyxy; - const CubicBezierCurve LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); - const CubicBezierCurve LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); - return TensorLinearCubicBezierSurface(CubicBezierCurve(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), - CubicBezierCurve(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); - } - - __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const { - return TensorLinearCubicBezierSurface(LR.clip(u)); - } - - __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const - { - const CubicBezierCurve LL(shuffle<0,1,0,1>(LR.v0),shuffle<0,1,0,1>(LR.v1),shuffle<0,1,0,1>(LR.v2),shuffle<0,1,0,1>(LR.v3)); - const CubicBezierCurve RR(shuffle<2,3,2,3>(LR.v0),shuffle<2,3,2,3>(LR.v1),shuffle<2,3,2,3>(LR.v2),shuffle<2,3,2,3>(LR.v3)); - return TensorLinearCubicBezierSurface(clerp(LL,RR,vfloat4(v.lower,v.lower,v.upper,v.upper))); - } - - __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const { - return clip_v(v).clip_u(u); - } - - __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const - { - CubicBezierCurve LR0,LR1; LR.split(LR0,LR1,u); - new (&left ) TensorLinearCubicBezierSurface(LR0); - new (&right) TensorLinearCubicBezierSurface(LR1); - } - - __forceinline TensorLinearCubicBezierSurface vsplit_u(vboolx& valid, const BBox1f& u) const { - valid = true; clear(valid,VSIZEX-1); - return TensorLinearCubicBezierSurface(getL().split(u),getR().split(u)); - } - - __forceinline Vec2fa eval(const float u, const float v) const - { - const vfloat4 p = LR.eval(u); - return Vec2fa(lerp(shuffle<0,1,0,1>(p),shuffle<2,3,2,3>(p),v)); - } - - __forceinline Vec2fa eval_du(const float u, const float v) const - { - const vfloat4 dpdu = LR.eval_dt(u); - return Vec2fa(lerp(shuffle<0,1,0,1>(dpdu),shuffle<2,3,2,3>(dpdu),v)); - } - - __forceinline Vec2fa eval_dv(const float u, const float v) const - { - const vfloat4 p = LR.eval(u); - return Vec2fa(shuffle<2,3,2,3>(p)-shuffle<0,1,0,1>(p)); - } - - __forceinline void eval(const float u, const float v, Vec2fa& p, Vec2fa& dpdu, Vec2fa& dpdv) const - { - vfloat4 p0, dp0du; LR.eval(u,p0,dp0du); - p = Vec2fa(lerp(shuffle<0,1,0,1>(p0),shuffle<2,3,2,3>(p0),v)); - dpdu = Vec2fa(lerp(shuffle<0,1,0,1>(dp0du),shuffle<2,3,2,3>(dp0du),v)); - dpdv = Vec2fa(shuffle<2,3,2,3>(p0)-shuffle<0,1,0,1>(p0)); - } - - __forceinline TensorLinearQuadraticBezierSurface derivative_u() const { - return TensorLinearQuadraticBezierSurface(LR.derivative()); - } - - __forceinline CubicBezierCurve derivative_v() const { - return getR()-getL(); - } - - __forceinline Vec2fa axis_u() const - { - const CubicBezierCurve L = getL(); - const CubicBezierCurve R = getR(); - return (L.end()-L.begin())+(R.end()-R.begin()); - } - - __forceinline Vec2fa axis_v() const - { - const CubicBezierCurve L = getL(); - const CubicBezierCurve R = getR(); - return (R.begin()-L.begin())+(R.end()-L.end()); - } - - friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) - { - return cout << "TensorLinearCubicBezierSurface" << embree_endl - << "{" << embree_endl - << " L = " << a.getL() << ", " << embree_endl - << " R = " << a.getR() << embree_endl - << "}"; - } - }; - - typedef TensorLinearCubicBezierSurface TensorLinearCubicBezierSurface1f; - typedef TensorLinearCubicBezierSurface TensorLinearCubicBezierSurface2fa; - typedef TensorLinearCubicBezierSurface TensorLinearCubicBezierSurface3fa; - } -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch.h b/thirdparty/embree-aarch64/kernels/subdiv/patch.h deleted file mode 100644 index d58241b96d..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/patch.h +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "catmullclark_patch.h" -#include "bilinear_patch.h" -#include "bspline_patch.h" -#include "bezier_patch.h" -#include "gregory_patch.h" -#include "tessellation_cache.h" - -#if 1 -#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z) -#else -#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z) \ - { \ - size_t hex = (size_t)ptr; \ - for (size_t i=0; i<4; i++) hex = hex ^ (hex >> 8); \ - const float c = (float)(((hex >> 0) ^ (hex >> 4) ^ (hex >> 8) ^ (hex >> 12) ^ (hex >> 16))&0xf)/15.0f; \ - if (P) *P = Vertex(0.5f+0.5f*x,0.5f+0.5f*y,0.5f+0.5f*z,0.0f); \ - } -#endif - -#define PATCH_MAX_CACHE_DEPTH 2 -//#define PATCH_MIN_RESOLUTION 1 // FIXME: not yet completely implemented -#define PATCH_MAX_EVAL_DEPTH_IRREGULAR 10 // maximum evaluation depth at irregular vertices (has to be larger or equal than PATCH_MAX_CACHE_DEPTH) -#define PATCH_MAX_EVAL_DEPTH_CREASE 10 // maximum evaluation depth at crease features (has to be larger or equal than PATCH_MAX_CACHE_DEPTH) -#define PATCH_USE_GREGORY 1 // 0 = no gregory, 1 = fill, 2 = as early as possible - -#if PATCH_USE_GREGORY==2 -#define PATCH_USE_BEZIER_PATCH 1 // enable use of bezier instead of b-spline patches -#else -#define PATCH_USE_BEZIER_PATCH 0 // enable use of bezier instead of b-spline patches -#endif - -#if PATCH_USE_BEZIER_PATCH -# define RegularPatch BezierPatch -# define RegularPatchT BezierPatchT -#else -# define RegularPatch BSplinePatch -# define RegularPatchT BSplinePatchT -#endif - -#if PATCH_USE_GREGORY -#define IrregularFillPatch GregoryPatch -#define IrregularFillPatchT GregoryPatchT -#else -#define IrregularFillPatch BilinearPatch -#define IrregularFillPatchT BilinearPatchT -#endif - -namespace embree -{ - template - struct __aligned(64) PatchT - { - public: - - typedef GeneralCatmullClarkPatchT GeneralCatmullClarkPatch; - typedef CatmullClarkPatchT CatmullClarkPatch; - typedef CatmullClark1RingT CatmullClarkRing; - typedef BezierCurveT BezierCurve; - - enum Type { - INVALID_PATCH = 0, - BILINEAR_PATCH = 1, - BSPLINE_PATCH = 2, - BEZIER_PATCH = 3, - GREGORY_PATCH = 4, - SUBDIVIDED_GENERAL_PATCH = 7, - SUBDIVIDED_QUAD_PATCH = 8, - EVAL_PATCH = 9, - }; - - struct Ref - { - __forceinline Ref(void* p = nullptr) - : ptr((size_t)p) {} - - __forceinline operator bool() const { return ptr != 0; } - __forceinline operator size_t() const { return ptr; } - - __forceinline Ref (Type ty, void* in) - : ptr(((size_t)in)+ty) { assert((((size_t)in) & 0xF) == 0); } - - __forceinline Type type () const { return (Type)(ptr & 0xF); } - __forceinline void* object() const { return (void*) (ptr & ~0xF); } - - size_t ptr; - }; - - struct EvalPatch - { - /* creates EvalPatch from a CatmullClarkPatch */ - template - __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch) - { - size_t ofs = 0, bytes = patch.bytes(); - void* ptr = alloc(bytes); - patch.serialize(ptr,ofs); - assert(ofs == bytes); - return Ref(EVAL_PATCH, ptr); - } - }; - - struct BilinearPatch - { - /* creates BilinearPatch from a CatmullClarkPatch */ - template - __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, - const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { - return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(patch)); - } - - __forceinline BilinearPatch (const CatmullClarkPatch& patch) - : patch(patch) {} - - /* creates BilinearPatch from 4 vertices */ - template - __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { - return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(edge,vertices,stride)); - } - - __forceinline BilinearPatch (const HalfEdge* edge, const char* vertices, size_t stride) - : patch(edge,vertices,stride) {} - - public: - BilinearPatchT patch; - }; - - struct BSplinePatch - { - /* creates BSplinePatch from a half edge */ - template - __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { - return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(edge,vertices,stride)); - } - - __forceinline BSplinePatch (const HalfEdge* edge, const char* vertices, size_t stride) - : patch(edge,vertices,stride) {} - - /* creates BSplinePatch from a CatmullClarkPatch */ - template - __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, - const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { - return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(patch,border0,border1,border2,border3)); - } - - __forceinline BSplinePatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) - : patch(patch,border0,border1,border2,border3) {} - - public: - BSplinePatchT patch; - }; - - struct BezierPatch - { - /* creates BezierPatch from a half edge */ - template - __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { - return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(edge,vertices,stride)); - } - - __forceinline BezierPatch (const HalfEdge* edge, const char* vertices, size_t stride) - : patch(edge,vertices,stride) {} - - /* creates Bezier from a CatmullClarkPatch */ - template - __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, - const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { - return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(patch,border0,border1,border2,border3)); - } - - __forceinline BezierPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) - : patch(patch,border0,border1,border2,border3) {} - - public: - BezierPatchT patch; - }; - - struct GregoryPatch - { - /* creates GregoryPatch from half edge */ - template - __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { - return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(edge,vertices,stride)); - } - - __forceinline GregoryPatch (const HalfEdge* edge, const char* vertices, size_t stride) - : patch(CatmullClarkPatch(edge,vertices,stride)) {} - - /* creates GregoryPatch from CatmullClarkPatch */ - template - __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, - const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { - return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(patch,border0,border1,border2,border3)); - } - - __forceinline GregoryPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) - : patch(patch,border0,border1,border2,border3) {} - - public: - GregoryPatchT patch; - }; - - struct SubdividedQuadPatch - { - template - __noinline static Ref create(const Allocator& alloc, Ref children[4]) { - return Ref(SUBDIVIDED_QUAD_PATCH, new (alloc(sizeof(SubdividedQuadPatch))) SubdividedQuadPatch(children)); - } - - __forceinline SubdividedQuadPatch(Ref children[4]) { - for (size_t i=0; i<4; i++) child[i] = children[i]; - } - - public: - Ref child[4]; - }; - - struct SubdividedGeneralPatch - { - template - __noinline static Ref create(const Allocator& alloc, Ref* children, const unsigned N) { - return Ref(SUBDIVIDED_GENERAL_PATCH, new (alloc(sizeof(SubdividedGeneralPatch))) SubdividedGeneralPatch(children,N)); - } - - __forceinline SubdividedGeneralPatch(Ref* children, const unsigned N) : N(N) { - for (unsigned i=0; i - __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) - { - if (PATCH_MAX_CACHE_DEPTH == 0) - return nullptr; - - Ref child(0); - switch (edge->patch_type) { - case HalfEdge::BILINEAR_PATCH: child = BilinearPatch::create(alloc,edge,vertices,stride); break; - case HalfEdge::REGULAR_QUAD_PATCH: child = RegularPatch::create(alloc,edge,vertices,stride); break; -#if PATCH_USE_GREGORY == 2 - case HalfEdge::IRREGULAR_QUAD_PATCH: child = GregoryPatch::create(alloc,edge,vertices,stride); break; -#endif - default: { - GeneralCatmullClarkPatch patch(edge,vertices,stride); - child = PatchT::create(alloc,patch,edge,vertices,stride,0); - } - } - return child; - } - - template - __noinline static Ref create(const Allocator& alloc, GeneralCatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth) - { - /* convert into standard quad patch if possible */ - if (likely(patch.isQuadPatch())) - { - CatmullClarkPatch qpatch; patch.init(qpatch); - return PatchT::create(alloc,qpatch,edge,vertices,stride,depth); - } - - /* do only cache up to some depth */ - if (depth >= PATCH_MAX_CACHE_DEPTH) - return nullptr; - - /* subdivide patch */ - unsigned N; - array_t patches; - patch.subdivide(patches,N); - - if (N == 4) - { - Ref child[4]; -#if PATCH_USE_GREGORY == 2 - BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders); - BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); - BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r); - BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r); - BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r); - GeneralCatmullClarkPatch::fix_quad_ring_order(patches); - child[0] = PatchT::create(alloc,patches[0],edge,vertices,stride,depth+1,&border0l,nullptr,nullptr,&border3r); - child[1] = PatchT::create(alloc,patches[1],edge,vertices,stride,depth+1,&border0r,&border1l,nullptr,nullptr); - child[2] = PatchT::create(alloc,patches[2],edge,vertices,stride,depth+1,nullptr,&border1r,&border2l,nullptr); - child[3] = PatchT::create(alloc,patches[3],edge,vertices,stride,depth+1,nullptr,nullptr,&border2r,&border3l); -#else - GeneralCatmullClarkPatch::fix_quad_ring_order(patches); - for (size_t i=0; i<4; i++) - child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); -#endif - return SubdividedQuadPatch::create(alloc,child); - } - else - { - assert(N=max_eval_depth; -//#else - return depth>=max_eval_depth; -//#endif - } - - template - __noinline static Ref create(const Allocator& alloc, CatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth, - const BezierCurve* border0 = nullptr, const BezierCurve* border1 = nullptr, const BezierCurve* border2 = nullptr, const BezierCurve* border3 = nullptr) - { - const typename CatmullClarkPatch::Type ty = patch.type(); - if (unlikely(final(patch,ty,depth))) { - if (ty & CatmullClarkRing::TYPE_REGULAR) return RegularPatch::create(alloc,patch,border0,border1,border2,border3); - else return IrregularFillPatch::create(alloc,patch,border0,border1,border2,border3); - } - else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { - assert(depth > 0); return RegularPatch::create(alloc,patch,border0,border1,border2,border3); - } -#if PATCH_USE_GREGORY == 2 - else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { - assert(depth > 0); return GregoryPatch::create(alloc,patch,border0,border1,border2,border3); - } -#endif - else if (depth >= PATCH_MAX_CACHE_DEPTH) { - return EvalPatch::create(alloc,patch); - } - - else - { - Ref child[4]; - array_t patches; - patch.subdivide(patches); - - for (size_t i=0; i<4; i++) - child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); - return SubdividedQuadPatch::create(alloc,child); - } - } - }; - - typedef PatchT Patch3fa; -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h deleted file mode 100644 index 482d015fa3..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "patch.h" -#include "feature_adaptive_eval.h" - -namespace embree -{ - namespace isa - { - template - struct PatchEval - { - public: - - typedef PatchT Patch; - typedef typename Patch::Ref Ref; - typedef CatmullClarkPatchT CatmullClarkPatch; - - PatchEval (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, - const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, - Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) - : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) - { - /* conservative time for the very first allocation */ - auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); - - Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { - auto alloc = [&](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); }; - return Patch::create(alloc,edge,vertices,stride); - },true); - - auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); - const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime); - - if (patch && allAllocationsValid && eval(patch,u,v,1.0f,0)) { - SharedLazyTessellationCache::unlock(); - return; - } - SharedLazyTessellationCache::unlock(); - FeatureAdaptiveEval(edge,vertices,stride,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); - PATCH_DEBUG_SUBDIVISION(edge,c,-1,-1); - } - - __forceinline bool eval_quad(const typename Patch::SubdividedQuadPatch* This, const float u, const float v, const float dscale, const size_t depth) - { - if (v < 0.5f) { - if (u < 0.5f) return eval(This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1); - else return eval(This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1); - } else { - if (u > 0.5f) return eval(This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1); - else return eval(This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1); - } - } - - bool eval_general(const typename Patch::SubdividedGeneralPatch* This, const float U, const float V, const size_t depth) - { - const unsigned l = (unsigned) floor(0.5f*U); const float u = 2.0f*frac(0.5f*U)-0.5f; - const unsigned h = (unsigned) floor(0.5f*V); const float v = 2.0f*frac(0.5f*V)-0.5f; - const unsigned i = 4*h+l; assert(iN); - return eval(This->child[i],u,v,1.0f,depth+1); - } - - bool eval(Ref This, const float& u, const float& v, const float dscale, const size_t depth) - { - if (!This) return false; - //PRINT(depth); - //PRINT2(u,v); - - switch (This.type()) - { - case Patch::BILINEAR_PATCH: { - //PRINT("bilinear"); - ((typename Patch::BilinearPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); - PATCH_DEBUG_SUBDIVISION(This,-1,c,c); - return true; - } - case Patch::BSPLINE_PATCH: { - //PRINT("bspline"); - ((typename Patch::BSplinePatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); - PATCH_DEBUG_SUBDIVISION(This,-1,c,-1); - return true; - } - case Patch::BEZIER_PATCH: { - //PRINT("bezier"); - ((typename Patch::BezierPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); - PATCH_DEBUG_SUBDIVISION(This,-1,c,-1); - return true; - } - case Patch::GREGORY_PATCH: { - //PRINT("gregory"); - ((typename Patch::GregoryPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); - PATCH_DEBUG_SUBDIVISION(This,-1,-1,c); - return true; - } - case Patch::SUBDIVIDED_QUAD_PATCH: { - //PRINT("subdivided quad"); - return eval_quad(((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth); - } - case Patch::SUBDIVIDED_GENERAL_PATCH: { - //PRINT("general_patch"); - assert(dscale == 1.0f); - return eval_general(((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); - } - case Patch::EVAL_PATCH: { - //PRINT("eval_patch"); - CatmullClarkPatch patch; patch.deserialize(This.object()); - FeatureAdaptiveEval(patch,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); - return true; - } - default: - assert(false); - return false; - } - } - - private: - Vertex* const P; - Vertex* const dPdu; - Vertex* const dPdv; - Vertex* const ddPdudu; - Vertex* const ddPdvdv; - Vertex* const ddPdudv; - }; - } -} - diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h deleted file mode 100644 index c05db55f4c..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "patch.h" -#include "feature_adaptive_eval_grid.h" - -namespace embree -{ - namespace isa - { - struct PatchEvalGrid - { - typedef Patch3fa Patch; - typedef Patch::Ref Ref; - typedef GeneralCatmullClarkPatch3fa GeneralCatmullClarkPatch; - typedef CatmullClarkPatch3fa CatmullClarkPatch; - typedef BSplinePatch3fa BSplinePatch; - typedef BezierPatch3fa BezierPatch; - typedef GregoryPatch3fa GregoryPatch; - typedef BilinearPatch3fa BilinearPatch; - - private: - const unsigned x0,x1; - const unsigned y0,y1; - const unsigned swidth,sheight; - const float rcp_swidth, rcp_sheight; - float* const Px; - float* const Py; - float* const Pz; - float* const U; - float* const V; - float* const Nx; - float* const Ny; - float* const Nz; - const unsigned dwidth,dheight; - unsigned count; - - public: - - PatchEvalGrid (Ref patch, unsigned subPatch, - const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, - float* Px, float* Py, float* Pz, float* U, float* V, - float* Nx, float* Ny, float* Nz, - const unsigned dwidth, const unsigned dheight) - : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), - Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), dheight(dheight), count(0) - { - assert(swidth < (2<<20) && sheight < (2<<20)); - const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); - const BBox2f erange(Vec2f(float(x0),float(y0)),Vec2f((float)x1,(float)y1)); - bool done MAYBE_UNUSED = eval(patch,subPatch,srange,erange); - assert(done); - assert(count == (x1-x0+1)*(y1-y0+1)); - } - - template - __forceinline void evalLocalGrid(const Patch* patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) - { - const float scale_x = rcp(srange.upper.x-srange.lower.x); - const float scale_y = rcp(srange.upper.y-srange.lower.y); - count += (lx1-lx0)*(ly1-ly0); - -#if 0 - for (unsigned iy=ly0; iypatch.eval(lu,lv); - const float u = float(ix)*rcp_swidth; - const float v = float(iy)*rcp_sheight; - const int ofs = (iy-y0)*dwidth+(ix-x0); - Px[ofs] = p.x; - Py[ofs] = p.y; - Pz[ofs] = p.z; - U[ofs] = u; - V[ofs] = v; - } - } -#else - foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) { - const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); - const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); - const Vec3vfx p = patch->patch.eval(lu,lv); - Vec3vfx n = zero; - if (unlikely(Nx != nullptr)) n = normalize_safe(patch->patch.normal(lu,lv)); - const vfloatx u = vfloatx(ix)*rcp_swidth; - const vfloatx v = vfloatx(iy)*rcp_sheight; - const vintx ofs = (iy-y0)*dwidth+(ix-x0); - if (likely(all(valid)) && all(iy==iy[0])) { - const unsigned ofs2 = ofs[0]; - vfloatx::storeu(Px+ofs2,p.x); - vfloatx::storeu(Py+ofs2,p.y); - vfloatx::storeu(Pz+ofs2,p.z); - vfloatx::storeu(U+ofs2,u); - vfloatx::storeu(V+ofs2,v); - if (unlikely(Nx != nullptr)) { - vfloatx::storeu(Nx+ofs2,n.x); - vfloatx::storeu(Ny+ofs2,n.y); - vfloatx::storeu(Nz+ofs2,n.z); - } - } else { - foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) { - const unsigned ofs2 = ofs[j]-j; - vfloatx::storeu(valid,Px+ofs2,p.x); - vfloatx::storeu(valid,Py+ofs2,p.y); - vfloatx::storeu(valid,Pz+ofs2,p.z); - vfloatx::storeu(valid,U+ofs2,u); - vfloatx::storeu(valid,V+ofs2,v); - if (unlikely(Nx != nullptr)) { - vfloatx::storeu(valid,Nx+ofs2,n.x); - vfloatx::storeu(valid,Ny+ofs2,n.y); - vfloatx::storeu(valid,Nz+ofs2,n.z); - } - }); - } - }); -#endif - } - - bool eval(Ref This, const BBox2f& srange, const BBox2f& erange, const unsigned depth) - { - if (erange.empty()) - return true; - - const int lx0 = (int) ceilf(erange.lower.x); - const int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); - const int ly0 = (int) ceilf(erange.lower.y); - const int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); - if (lx0 >= lx1 || ly0 >= ly1) - return true; - - if (!This) - return false; - - switch (This.type()) - { - case Patch::BILINEAR_PATCH: { - evalLocalGrid((Patch::BilinearPatch*)This.object(),srange,lx0,lx1,ly0,ly1); - return true; - } - case Patch::BSPLINE_PATCH: { - evalLocalGrid((Patch::BSplinePatch*)This.object(),srange,lx0,lx1,ly0,ly1); - return true; - } - case Patch::BEZIER_PATCH: { - evalLocalGrid((Patch::BezierPatch*)This.object(),srange,lx0,lx1,ly0,ly1); - return true; - } - case Patch::GREGORY_PATCH: { - evalLocalGrid((Patch::GregoryPatch*)This.object(),srange,lx0,lx1,ly0,ly1); - return true; - } - case Patch::SUBDIVIDED_QUAD_PATCH: - { - const Vec2f c = srange.center(); - const BBox2f srange0(srange.lower,c); - const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); - const BBox2f srange2(c,srange.upper); - const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); - - Patch::SubdividedQuadPatch* patch = (Patch::SubdividedQuadPatch*)This.object(); - eval(patch->child[0],srange0,intersect(srange0,erange),depth+1); - eval(patch->child[1],srange1,intersect(srange1,erange),depth+1); - eval(patch->child[2],srange2,intersect(srange2,erange),depth+1); - eval(patch->child[3],srange3,intersect(srange3,erange),depth+1); - return true; - } - case Patch::EVAL_PATCH: { - CatmullClarkPatch patch; patch.deserialize(This.object()); - FeatureAdaptiveEvalGrid(patch,srange,erange,depth,x0,x1,y0,y1,swidth,sheight,Px,Py,Pz,U,V,Nx,Ny,Nz,dwidth,dheight); - count += (lx1-lx0)*(ly1-ly0); - return true; - } - default: - assert(false); - return false; - } - } - - bool eval(Ref This, unsigned subPatch, const BBox2f& srange, const BBox2f& erange) - { - if (!This) - return false; - - switch (This.type()) - { - case Patch::SUBDIVIDED_GENERAL_PATCH: { - Patch::SubdividedGeneralPatch* patch = (Patch::SubdividedGeneralPatch*)This.object(); - assert(subPatch < patch->N); - return eval(patch->child[subPatch],srange,erange,1); - } - default: - assert(subPatch == 0); - return eval(This,srange,erange,0); - } - } - }; - - __forceinline unsigned patch_eval_subdivision_count (const HalfEdge* h) - { - const unsigned N = h->numEdges(); - if (N == 4) return 1; - else return N; - } - - template - inline void patch_eval_subdivision (const HalfEdge* h, Tessellator tessellator) - { - const unsigned N = h->numEdges(); - int neighborSubdiv[GeneralCatmullClarkPatch3fa::SIZE]; // FIXME: use array_t - float levels[GeneralCatmullClarkPatch3fa::SIZE]; - for (unsigned i=0; ihasOpposite() ? h->opposite()->numEdges() != 4 : 0; - levels[i] = h->edge_level; - h = h->next(); - } - if (N == 4) - { - const Vec2f uv[4] = { Vec2f(0.0f,0.0f), Vec2f(1.0f,0.0f), Vec2f(1.0f,1.0f), Vec2f(0.0f,1.0f) }; - tessellator(uv,neighborSubdiv,levels,0); - } - else - { - for (unsigned i=0; i 16"); - const int h = (i >> 2) & 3, l = i & 3; - const Vec2f subPatchID((float)l,(float)h); - const Vec2f uv[4] = { 2.0f*subPatchID + (0.5f+Vec2f(0.0f,0.0f)), - 2.0f*subPatchID + (0.5f+Vec2f(1.0f,0.0f)), - 2.0f*subPatchID + (0.5f+Vec2f(1.0f,1.0f)), - 2.0f*subPatchID + (0.5f+Vec2f(0.0f,1.0f)) }; - const int neighborSubdiv1[4] = { 0,0,0,0 }; - const float levels1[4] = { 0.5f*levels[(i+0)%N], 0.5f*levels[(i+0)%N], 0.5f*levels[(i+N-1)%N], 0.5f*levels[(i+N-1)%N] }; - tessellator(uv,neighborSubdiv1,levels1,i); - } - } - } - } -} - diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h deleted file mode 100644 index 28016d9e20..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "patch.h" -#include "feature_adaptive_eval_simd.h" - -namespace embree -{ - namespace isa - { - template - struct PatchEvalSimd - { - public: - - typedef PatchT Patch; - typedef typename Patch::Ref Ref; - typedef CatmullClarkPatchT CatmullClarkPatch; - - PatchEvalSimd (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, - const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid0, const vfloat& u, const vfloat& v, - float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) - : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) - { - /* conservative time for the very first allocation */ - auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); - - Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { - auto alloc = [](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); }; - return Patch::create(alloc,edge,vertices,stride); - }, true); - - auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); - const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime); - - patch = allAllocationsValid ? patch : nullptr; - - /* use cached data structure for calculations */ - const vbool valid1 = patch ? eval(valid0,patch,u,v,1.0f,0) : vbool(false); - SharedLazyTessellationCache::unlock(); - const vbool valid2 = valid0 & !valid1; - if (any(valid2)) { - FeatureAdaptiveEvalSimd(edge,vertices,stride,valid2,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N); - } - } - - vbool eval_quad(const vbool& valid, const typename Patch::SubdividedQuadPatch* This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) - { - vbool ret = false; - const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; - const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; - const vbool u0v0_mask = valid & u0_mask & v0_mask; - const vbool u0v1_mask = valid & u0_mask & v1_mask; - const vbool u1v0_mask = valid & u1_mask & v0_mask; - const vbool u1v1_mask = valid & u1_mask & v1_mask; - if (any(u0v0_mask)) ret |= eval(u0v0_mask,This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1); - if (any(u1v0_mask)) ret |= eval(u1v0_mask,This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1); - if (any(u1v1_mask)) ret |= eval(u1v1_mask,This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1); - if (any(u0v1_mask)) ret |= eval(u0v1_mask,This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1); - return ret; - } - - vbool eval_general(const vbool& valid, const typename Patch::SubdividedGeneralPatch* patch, const vfloat& U, const vfloat& V, const size_t depth) - { - vbool ret = false; - const vint l = (vint)floor(0.5f*U); const vfloat u = 2.0f*frac(0.5f*U)-0.5f; - const vint h = (vint)floor(0.5f*V); const vfloat v = 2.0f*frac(0.5f*V)-0.5f; - const vint i = (h<<2)+l; assert(all(valid,iN)); - foreach_unique(valid,i,[&](const vbool& valid, const int i) { - ret |= eval(valid,patch->child[i],u,v,1.0f,depth+1); - }); - return ret; - } - - vbool eval(const vbool& valid, Ref This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) - { - if (!This) return false; - switch (This.type()) - { - case Patch::BILINEAR_PATCH: { - ((typename Patch::BilinearPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); - return valid; - } - case Patch::BSPLINE_PATCH: { - ((typename Patch::BSplinePatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); - return valid; - } - case Patch::BEZIER_PATCH: { - ((typename Patch::BezierPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); - return valid; - } - case Patch::GREGORY_PATCH: { - ((typename Patch::GregoryPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); - return valid; - } - case Patch::SUBDIVIDED_QUAD_PATCH: { - return eval_quad(valid,((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth); - } - case Patch::SUBDIVIDED_GENERAL_PATCH: { - assert(dscale == 1.0f); - return eval_general(valid,((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); - } - case Patch::EVAL_PATCH: { - CatmullClarkPatch patch; patch.deserialize(This.object()); - FeatureAdaptiveEvalSimd(patch,valid,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N); - return valid; - } - default: - assert(false); - return false; - } - } - - private: - float* const P; - float* const dPdu; - float* const dPdv; - float* const ddPdudu; - float* const ddPdvdv; - float* const ddPdudv; - const size_t dstride; - const size_t N; - }; - } -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h b/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h deleted file mode 100644 index d5bc403cca..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../geometry/primitive.h" -#include "bspline_patch.h" -#include "bezier_patch.h" -#include "gregory_patch.h" -#include "gregory_patch_dense.h" -#include "tessellation.h" -#include "tessellation_cache.h" -#include "gridrange.h" -#include "patch_eval_grid.h" -#include "feature_adaptive_eval_grid.h" -#include "../common/scene_subdiv_mesh.h" - -namespace embree -{ - struct __aligned(64) SubdivPatch1Base - { - public: - - enum Type { - INVALID_PATCH = 0, - BSPLINE_PATCH = 1, - BEZIER_PATCH = 2, - GREGORY_PATCH = 3, - EVAL_PATCH = 5, - BILINEAR_PATCH = 6, - }; - - enum Flags { - TRANSITION_PATCH = 16, - }; - - /*! Default constructor. */ - __forceinline SubdivPatch1Base () {} - - SubdivPatch1Base (const unsigned int gID, - const unsigned int pID, - const unsigned int subPatch, - const SubdivMesh *const mesh, - const size_t time, - const Vec2f uv[4], - const float edge_level[4], - const int subdiv[4], - const int simd_width); - - __forceinline bool needsStitching() const { - return flags & TRANSITION_PATCH; - } - - __forceinline Vec2f getUV(const size_t i) const { - return Vec2f((float)u[i],(float)v[i]) * (8.0f/0x10000); - } - - static void computeEdgeLevels(const float edge_level[4], const int subdiv[4], float level[4]); - static Vec2i computeGridSize(const float level[4]); - bool updateEdgeLevels(const float edge_level[4], const int subdiv[4], const SubdivMesh *const mesh, const int simd_width); - - public: - - __forceinline size_t getGridBytes() const { - const size_t grid_size_xyzuv = (grid_size_simd_blocks * VSIZEX) * 4; - return 64*((grid_size_xyzuv+15) / 16); - } - - __forceinline void write_lock() { mtx.lock(); } - __forceinline void write_unlock() { mtx.unlock(); } - __forceinline bool try_write_lock() { return mtx.try_lock(); } - //__forceinline bool try_read_lock() { return mtx.try_read_lock(); } - - __forceinline void resetRootRef() { - //assert( mtx.hasInitialState() ); - root_ref = SharedLazyTessellationCache::Tag(); - } - - __forceinline SharedLazyTessellationCache::CacheEntry& entry() { - return (SharedLazyTessellationCache::CacheEntry&) root_ref; - } - - public: - __forceinline unsigned int geomID() const { - return geom; - } - - __forceinline unsigned int primID() const { - return prim; - } - - public: - SharedLazyTessellationCache::Tag root_ref; - SpinLock mtx; - - unsigned short u[4]; //!< 16bit discretized u,v coordinates - unsigned short v[4]; - float level[4]; - - unsigned char flags; - unsigned char type; - unsigned short grid_u_res; - unsigned int geom; //!< geometry ID of the subdivision mesh this patch belongs to - unsigned int prim; //!< primitive ID of this subdivision patch - unsigned short grid_v_res; - - unsigned short grid_size_simd_blocks; - unsigned int time_; - - struct PatchHalfEdge { - const HalfEdge* edge; - unsigned subPatch; - }; - - Vec3fa patch_v[4][4]; - - const HalfEdge *edge() const { return ((PatchHalfEdge*)patch_v)->edge; } - unsigned time() const { return time_; } - unsigned subPatch() const { return ((PatchHalfEdge*)patch_v)->subPatch; } - - void set_edge(const HalfEdge *h) const { ((PatchHalfEdge*)patch_v)->edge = h; } - void set_subPatch(const unsigned s) const { ((PatchHalfEdge*)patch_v)->subPatch = s; } - }; - - namespace isa - { - Vec3fa patchEval(const SubdivPatch1Base& patch, const float uu, const float vv); - Vec3fa patchNormal(const SubdivPatch1Base& patch, const float uu, const float vv); - - template - Vec3 patchEval(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); - - template - Vec3 patchNormal(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); - - - /* eval grid over patch and stich edges when required */ - void evalGrid(const SubdivPatch1Base& patch, - const unsigned x0, const unsigned x1, - const unsigned y0, const unsigned y1, - const unsigned swidth, const unsigned sheight, - float *__restrict__ const grid_x, - float *__restrict__ const grid_y, - float *__restrict__ const grid_z, - float *__restrict__ const grid_u, - float *__restrict__ const grid_v, - const SubdivMesh* const geom); - - /* eval grid over patch and stich edges when required */ - BBox3fa evalGridBounds(const SubdivPatch1Base& patch, - const unsigned x0, const unsigned x1, - const unsigned y0, const unsigned y1, - const unsigned swidth, const unsigned sheight, - const SubdivMesh* const geom); - } -} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h b/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h deleted file mode 100644 index bda1e2d559..0000000000 --- a/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -namespace embree -{ - /* adjust discret tessellation level for feature-adaptive pre-subdivision */ - __forceinline float adjustTessellationLevel(float l, const size_t sublevel) - { - for (size_t i=0; i= 2); - - const float inv_low_rate = rcp((float)(low_rate-1)); - const unsigned int dy = low_rate - 1; - const unsigned int dx = high_rate - 1; - - int p = 2*dy-dx; - - unsigned int offset = 0; - unsigned int y = 0; - float value = 0.0f; - for(unsigned int x=0;x data; - }; - - static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; } - - struct CacheEntry - { - Tag tag; - SpinLock mutex; - }; - - private: - - float *data; - bool hugepages; - size_t size; - size_t maxBlocks; - ThreadWorkState *threadWorkState; - - __aligned(64) std::atomic localTime; - __aligned(64) std::atomic next_block; - __aligned(64) SpinLock reset_state; - __aligned(64) SpinLock linkedlist_mtx; - __aligned(64) std::atomic switch_block_threshold; - __aligned(64) std::atomic numRenderThreads; - - - public: - - - SharedLazyTessellationCache(); - ~SharedLazyTessellationCache(); - - void getNextRenderThreadWorkState(); - - __forceinline size_t maxAllocSize() const { - return switch_block_threshold; - } - - __forceinline size_t getCurrentIndex() { return localTime.load(); } - __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); } - - __forceinline size_t getTime(const size_t globalTime) { - return localTime.load()+NUM_CACHE_SEGMENTS*globalTime; - } - - - __forceinline size_t lockThread (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); } - __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); } - - __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; } - - static __forceinline void lock () { sharedLazyTessellationCache.lockThread(threadState()); } - static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); } - static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); } - static __forceinline size_t getState() { return threadState()->counter.load(); } - static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); } - - static __forceinline size_t getTCacheTime(const size_t globalTime) { - return sharedLazyTessellationCache.getTime(globalTime); - } - - /* per thread lock */ - __forceinline void lockThreadLoop (ThreadWorkState *const t_state) - { - while(1) - { - size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1); - if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD)) - { - /* lock failed wait until sync phase is over */ - sharedLazyTessellationCache.unlockThread(t_state,-1); - sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0); - } - else - break; - } - } - - static __forceinline void* lookup(CacheEntry& entry, size_t globalTime) - { - const int64_t subdiv_patch_root_ref = entry.tag.get(); - CACHE_STATS(SharedTessellationCacheStats::cache_accesses++); - - if (likely(subdiv_patch_root_ref != 0)) - { - const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr(); - const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref); - - if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) )) - { - CACHE_STATS(SharedTessellationCacheStats::cache_hits++); - return (void*) subdiv_patch_root; - } - } - CACHE_STATS(SharedTessellationCacheStats::cache_misses++); - return nullptr; - } - - template - static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor()) - { - ThreadWorkState *t_state = SharedLazyTessellationCache::threadState(); - - while (true) - { - sharedLazyTessellationCache.lockThreadLoop(t_state); - void* patch = SharedLazyTessellationCache::lookup(entry,globalTime); - if (patch) return (decltype(constructor())) patch; - - if (entry.mutex.try_lock()) - { - if (!validTag(entry.tag,globalTime)) - { - auto timeBefore = sharedLazyTessellationCache.getTime(globalTime); - auto ret = constructor(); // thread is locked here! - assert(ret); - /* this should never return nullptr */ - auto timeAfter = sharedLazyTessellationCache.getTime(globalTime); - auto time = before ? timeBefore : timeAfter; - __memory_barrier(); - entry.tag = SharedLazyTessellationCache::Tag(ret,time); - __memory_barrier(); - entry.mutex.unlock(); - return ret; - } - entry.mutex.unlock(); - } - SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state); - } - } - - __forceinline bool validCacheIndex(const size_t i, const size_t globalTime) - { -#if FORCE_SIMPLE_FLUSH == 1 - return i == getTime(globalTime); -#else - return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime); -#endif - } - - static __forceinline bool validTime(const size_t oldtime, const size_t newTime) - { - return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime; - } - - - static __forceinline bool validTag(const Tag& tag, size_t globalTime) - { - const int64_t subdiv_patch_root_ref = tag.get(); - if (subdiv_patch_root_ref == 0) return false; - const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref); - return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime); - } - - void waitForUsersLessEqual(ThreadWorkState *const t_state, - const unsigned int users); - - __forceinline size_t alloc(const size_t blocks) - { - if (unlikely(blocks >= switch_block_threshold)) - throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment"); - - assert(blocks < switch_block_threshold); - size_t index = next_block.fetch_add(blocks); - if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1; - return index; - } - - static __forceinline void* malloc(const size_t bytes) - { - size_t block_index = -1; - ThreadWorkState *const t_state = threadState(); - while (true) - { - block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE); - if (block_index == (size_t)-1) - { - sharedLazyTessellationCache.unlockThread(t_state); - sharedLazyTessellationCache.allocNextSegment(); - sharedLazyTessellationCache.lockThread(t_state); - continue; - } - break; - } - return sharedLazyTessellationCache.getBlockPtr(block_index); - } - - __forceinline void *getBlockPtr(const size_t block_index) - { - assert(block_index < maxBlocks); - assert(data); - assert(block_index*16 <= size); - return (void*)&data[block_index*16]; - } - - __forceinline void* getDataPtr() { return data; } - __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; } - __forceinline size_t getMaxBlocks() { return maxBlocks; } - __forceinline size_t getSize() { return size; } - - void allocNextSegment(); - void realloc(const size_t newSize); - - void reset(); - - static SharedLazyTessellationCache sharedLazyTessellationCache; - }; -} diff --git a/thirdparty/embree-aarch64/patches/godot-changes.patch b/thirdparty/embree-aarch64/patches/godot-changes.patch deleted file mode 100644 index 86fbf226d2..0000000000 --- a/thirdparty/embree-aarch64/patches/godot-changes.patch +++ /dev/null @@ -1,630 +0,0 @@ -diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h -index 76c6b740aa..51d296fb16 100644 ---- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h -+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h -@@ -27,7 +27,10 @@ namespace embree - func(r.begin()); - }); - if (!TaskScheduler::wait()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - } - #elif defined(TASKING_GCD) && defined(BUILD_IOS) - -@@ -55,13 +58,19 @@ namespace embree - func(i); - },context); - if (context.is_group_execution_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - }); - if (tbb::task::self().is_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - #endif - - #elif defined(TASKING_PPL) -@@ -81,7 +90,10 @@ namespace embree - #if defined(TASKING_INTERNAL) - TaskScheduler::spawn(first,last,minStepSize,func); - if (!TaskScheduler::wait()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - - #elif defined(TASKING_GCD) && defined(BUILD_IOS) - -@@ -109,13 +121,19 @@ namespace embree - func(range(r.begin(),r.end())); - },context); - if (context.is_group_execution_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - #else - tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { - func(range(r.begin(),r.end())); - }); - if (tbb::task::self().is_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - #endif - - #elif defined(TASKING_PPL) -@@ -147,13 +165,19 @@ namespace embree - func(i); - },tbb::simple_partitioner(),context); - if (context.is_group_execution_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },tbb::simple_partitioner()); - if (tbb::task::self().is_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - #endif - } - -@@ -168,13 +192,19 @@ namespace embree - func(i); - },ap,context); - if (context.is_group_execution_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },ap); - if (tbb::task::self().is_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task cancelled"); -+ abort(); -+ // -- GODOT end -- - #endif - } - -diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h -index d444b6a2e4..0daf94e50e 100644 ---- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h -+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h -@@ -58,15 +58,19 @@ namespace embree - const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, - [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, - reduction,context); -- if (context.is_group_execution_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // if (context.is_group_execution_cancelled()) -+ // throw std::runtime_error("task cancelled"); -+ // -- GODOT end -- - return v; - #else - const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, - [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, - reduction); -- if (tbb::task::self().is_cancelled()) -- throw std::runtime_error("task cancelled"); -+ // -- GODOT start -- -+ // if (tbb::task::self().is_cancelled()) -+ // throw std::runtime_error("task cancelled"); -+ // -- GODOT end -- - return v; - #endif - #else // TASKING_PPL -diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp -index 7e7b9faef8..98dc80ad59 100644 ---- a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp -+++ b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp -@@ -39,7 +39,10 @@ namespace embree - std::vector str; str.reserve(64); - while (cin->peek() != EOF && !isSeparator(cin->peek())) { - int c = cin->get(); -- if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); -+ // -- GODOT start -- -+ // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); -+ if (!isValidChar(c)) abort(); -+ // -- GODOT end -- - str.push_back((char)c); - } - str.push_back(0); -diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp -index 4e8928242e..12f143f131 100644 ---- a/thirdparty/embree-aarch64/common/sys/alloc.cpp -+++ b/thirdparty/embree-aarch64/common/sys/alloc.cpp -@@ -21,7 +21,10 @@ namespace embree - void* ptr = _mm_malloc(size,align); - - if (size != 0 && ptr == nullptr) -- throw std::bad_alloc(); -+ // -- GODOT start -- -+ // throw std::bad_alloc(); -+ abort(); -+ // -- GODOT end -- - - return ptr; - } -@@ -128,7 +131,10 @@ namespace embree - /* fall back to 4k pages */ - int flags = MEM_COMMIT | MEM_RESERVE; - char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); -- if (ptr == nullptr) throw std::bad_alloc(); -+ // -- GODOT start -- -+ // if (ptr == nullptr) throw std::bad_alloc(); -+ if (ptr == nullptr) abort(); -+ // -- GODOT end -- - hugepages = false; - return ptr; - } -@@ -145,7 +151,10 @@ namespace embree - return bytesOld; - - if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) -- throw std::bad_alloc(); -+ // -- GODOT start -- -+ // throw std::bad_alloc(); -+ abort(); -+ // -- GODOT end -- - - return bytesNew; - } -@@ -156,7 +165,10 @@ namespace embree - return; - - if (!VirtualFree(ptr,0,MEM_RELEASE)) -- throw std::bad_alloc(); -+ // -- GODOT start -- -+ // throw std::bad_alloc(); -+ abort(); -+ // -- GODOT end -- - } - - void os_advise(void *ptr, size_t bytes) -@@ -260,7 +272,10 @@ namespace embree - - /* fallback to 4k pages */ - void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); -- if (ptr == MAP_FAILED) throw std::bad_alloc(); -+ // -- GODOT start -- -+ // if (ptr == MAP_FAILED) throw std::bad_alloc(); -+ if (ptr == MAP_FAILED) abort(); -+ // -- GODOT end -- - hugepages = false; - - /* advise huge page hint for THP */ -@@ -277,7 +292,10 @@ namespace embree - return bytesOld; - - if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) -- throw std::bad_alloc(); -+ // -- GODOT start -- -+ // throw std::bad_alloc(); -+ abort(); -+ // -- GODOT end -- - - return bytesNew; - } -@@ -291,7 +309,10 @@ namespace embree - const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; - bytes = (bytes+pageSize-1) & ~(pageSize-1); - if (munmap(ptr,bytes) == -1) -- throw std::bad_alloc(); -+ // -- GODOT start -- -+ // throw std::bad_alloc(); -+ abort(); -+ // -- GODOT end -- - } - - /* hint for transparent huge pages (THP) */ -diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h -index 7914eb7a52..737f14aa6e 100644 ---- a/thirdparty/embree-aarch64/common/sys/platform.h -+++ b/thirdparty/embree-aarch64/common/sys/platform.h -@@ -174,11 +174,19 @@ - #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl - - #if defined(DEBUG) // only report file and line in debug mode -+ // -- GODOT start -- -+ // #define THROW_RUNTIME_ERROR(str) -+ // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); - #define THROW_RUNTIME_ERROR(str) \ -- throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); -+ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); -+ // -- GODOT end -- - #else -+ // -- GODOT start -- -+ // #define THROW_RUNTIME_ERROR(str) -+ // throw std::runtime_error(str); - #define THROW_RUNTIME_ERROR(str) \ -- throw std::runtime_error(str); -+ abort(); -+ // -- GODOT end -- - #endif - - #define FATAL(x) THROW_RUNTIME_ERROR(x) -diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp -index 98d7fb9249..ebf656d1a0 100644 ---- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp -+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp -@@ -48,13 +48,15 @@ namespace embree - { - Task* prevTask = thread.task; - thread.task = this; -- try { -- if (thread.scheduler->cancellingException == nullptr) -+ // -- GODOT start -- -+ // try { -+ // if (thread.scheduler->cancellingException == nullptr) - closure->execute(); -- } catch (...) { -- if (thread.scheduler->cancellingException == nullptr) -- thread.scheduler->cancellingException = std::current_exception(); -- } -+ // } catch (...) { -+ // if (thread.scheduler->cancellingException == nullptr) -+ // thread.scheduler->cancellingException = std::current_exception(); -+ // } -+ // -- GODOT end -- - thread.task = prevTask; - add_dependencies(-1); - } -@@ -297,8 +299,11 @@ namespace embree - size_t threadIndex = allocThreadIndex(); - condition.wait(mutex, [&] () { return hasRootTask.load(); }); - mutex.unlock(); -- std::exception_ptr except = thread_loop(threadIndex); -- if (except != nullptr) std::rethrow_exception(except); -+ // -- GODOT start -- -+ // std::exception_ptr except = thread_loop(threadIndex); -+ // if (except != nullptr) std::rethrow_exception(except); -+ thread_loop(threadIndex); -+ // -- GODOT end -- - } - - void TaskScheduler::reset() { -@@ -330,7 +335,10 @@ namespace embree - return thread->scheduler->cancellingException == nullptr; - } - -- std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) -+// -- GODOT start -- -+// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) -+ void TaskScheduler::thread_loop(size_t threadIndex) -+// -- GODOT end -- - { - /* allocate thread structure */ - std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation -@@ -353,9 +361,10 @@ namespace embree - swapThread(oldThread); - - /* remember exception to throw */ -- std::exception_ptr except = nullptr; -- if (cancellingException != nullptr) except = cancellingException; -- -+ // -- GODOT start -- -+ // std::exception_ptr except = nullptr; -+ // if (cancellingException != nullptr) except = cancellingException; -+ // -- GODOT end -- - /* wait for all threads to terminate */ - threadCounter--; - #if defined(__WIN32__) -@@ -373,7 +382,10 @@ namespace embree - yield(); - #endif - } -- return except; -+ // -- GODOT start -- -+ // return except; -+ return; -+ // -- GODOT end -- - } - - bool TaskScheduler::steal_from_other_threads(Thread& thread) -diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h -index c2a9391aea..8bd70b2b8c 100644 ---- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h -+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h -@@ -123,7 +123,10 @@ namespace embree - { - size_t ofs = bytes + ((align - stackPtr) & (align-1)); - if (stackPtr + ofs > CLOSURE_STACK_SIZE) -- throw std::runtime_error("closure stack overflow"); -+ // -- GODOT start -- -+ // throw std::runtime_error("closure stack overflow"); -+ abort(); -+ // -- GODOT end -- - stackPtr += ofs; - return &stack[stackPtr-bytes]; - } -@@ -132,7 +135,10 @@ namespace embree - __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) - { - if (right >= TASK_STACK_SIZE) -- throw std::runtime_error("task stack overflow"); -+ // -- GODOT start -- -+ // throw std::runtime_error("task stack overflow"); -+ abort(); -+ // -- GODOT end -- - - /* allocate new task on right side of stack */ - size_t oldStackPtr = stackPtr; -@@ -239,7 +245,10 @@ namespace embree - void wait_for_threads(size_t threadCount); - - /*! thread loop for all worker threads */ -- std::exception_ptr thread_loop(size_t threadIndex); -+ // -- GODOT start -- -+ // std::exception_ptr thread_loop(size_t threadIndex); -+ void thread_loop(size_t threadIndex); -+ // -- GODOT end -- - - /*! steals a task from a different thread */ - bool steal_from_other_threads(Thread& thread); -diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp -index 20cdd2d320..aa56035026 100644 ---- a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp -+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp -@@ -150,7 +150,10 @@ namespace embree - } - } - else { -- throw std::runtime_error("not supported node type in bvh_statistics"); -+ // -- GODOT start -- -+ // throw std::runtime_error("not supported node type in bvh_statistics"); -+ abort(); -+ // -- GODOT end -- - } - return s; - } -diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp -index ee5c37b238..625fbf6d4f 100644 ---- a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp -+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp -@@ -230,7 +230,10 @@ RTC_NAMESPACE_BEGIN; - if (quality != RTC_BUILD_QUALITY_LOW && - quality != RTC_BUILD_QUALITY_MEDIUM && - quality != RTC_BUILD_QUALITY_HIGH) -- throw std::runtime_error("invalid build quality"); -+ // -- GODOT start -- -+ // throw std::runtime_error("invalid build quality"); -+ abort(); -+ // -- GODOT end -- - scene->setBuildQuality(quality); - RTC_CATCH_END2(scene); - } -@@ -1383,7 +1386,10 @@ RTC_NAMESPACE_BEGIN; - quality != RTC_BUILD_QUALITY_MEDIUM && - quality != RTC_BUILD_QUALITY_HIGH && - quality != RTC_BUILD_QUALITY_REFIT) -- throw std::runtime_error("invalid build quality"); -+ // -- GODOT start -- -+ // throw std::runtime_error("invalid build quality"); -+ abort(); -+ // -- GODOT end -- - geometry->setBuildQuality(quality); - RTC_CATCH_END2(geometry); - } -diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h -index 6583d12d57..4b070e122b 100644 ---- a/thirdparty/embree-aarch64/kernels/common/rtcore.h -+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.h -@@ -25,52 +25,58 @@ namespace embree - #endif - - /*! Macros used in the rtcore API implementation */ --#define RTC_CATCH_BEGIN try { -+// -- GODOT start -- -+// #define RTC_CATCH_BEGIN try { -+#define RTC_CATCH_BEGIN - --#define RTC_CATCH_END(device) \ -- } catch (std::bad_alloc&) { \ -- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -- } catch (rtcore_error& e) { \ -- Device::process_error(device,e.error,e.what()); \ -- } catch (std::exception& e) { \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -- } catch (...) { \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -- } -+// #define RTC_CATCH_END(device) \ -+// } catch (std::bad_alloc&) { \ -+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -+// } catch (rtcore_error& e) { \ -+// Device::process_error(device,e.error,e.what()); \ -+// } catch (std::exception& e) { \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -+// } catch (...) { \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -+// } -+#define RTC_CATCH_END(device) - --#define RTC_CATCH_END2(scene) \ -- } catch (std::bad_alloc&) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -- } catch (rtcore_error& e) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,e.error,e.what()); \ -- } catch (std::exception& e) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -- } catch (...) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -- } -+// #define RTC_CATCH_END2(scene) \ -+// } catch (std::bad_alloc&) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -+// } catch (rtcore_error& e) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,e.error,e.what()); \ -+// } catch (std::exception& e) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -+// } catch (...) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -+// } -+#define RTC_CATCH_END2(scene) - --#define RTC_CATCH_END2_FALSE(scene) \ -- } catch (std::bad_alloc&) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -- return false; \ -- } catch (rtcore_error& e) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,e.error,e.what()); \ -- return false; \ -- } catch (std::exception& e) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -- return false; \ -- } catch (...) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -- return false; \ -- } -+// #define RTC_CATCH_END2_FALSE(scene) \ -+// } catch (std::bad_alloc&) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -+// return false; \ -+// } catch (rtcore_error& e) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,e.error,e.what()); \ -+// return false; \ -+// } catch (std::exception& e) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -+// return false; \ -+// } catch (...) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -+// return false; \ -+// } -+#define RTC_CATCH_END2_FALSE(scene) return false; -+// -- GODOT end -- - - #define RTC_VERIFY_HANDLE(handle) \ - if (handle == nullptr) { \ -@@ -97,28 +103,38 @@ namespace embree - #define RTC_TRACE(x) - #endif - -- /*! used to throw embree API errors */ -- struct rtcore_error : public std::exception -- { -- __forceinline rtcore_error(RTCError error, const std::string& str) -- : error(error), str(str) {} -- -- ~rtcore_error() throw() {} -- -- const char* what () const throw () { -- return str.c_str(); -- } -- -- RTCError error; -- std::string str; -- }; -+// -- GODOT begin -- -+// /*! used to throw embree API errors */ -+// struct rtcore_error : public std::exception -+// { -+// __forceinline rtcore_error(RTCError error, const std::string& str) -+// : error(error), str(str) {} -+// -+// ~rtcore_error() throw() {} -+// -+// const char* what () const throw () { -+// return str.c_str(); -+// } -+// -+// RTCError error; -+// std::string str; -+// }; -+// -- GODOT end -- - - #if defined(DEBUG) // only report file and line in debug mode -+ // -- GODOT begin -- -+ // #define throw_RTCError(error,str) \ -+ // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); - #define throw_RTCError(error,str) \ -- throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); -+ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); -+ // -- GODOT end -- - #else -+ // -- GODOT begin -- -+ // #define throw_RTCError(error,str) \ -+ // throw rtcore_error(error,str); - #define throw_RTCError(error,str) \ -- throw rtcore_error(error,str); -+ abort(); -+ // -- GODOT end -- - #endif - - #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ -diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp -index e75aa968f9..1e23aeb415 100644 ---- a/thirdparty/embree-aarch64/kernels/common/scene.cpp -+++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp -@@ -800,16 +800,18 @@ namespace embree - } - - /* initiate build */ -- try { -+ // -- GODOT start -- -+ // try { - scheduler->spawn_root([&]() { commit_task(); Lock lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); -- } -- catch (...) { -- accels_clear(); -- updateInterface(); -- Lock lock(schedulerMutex); -- this->scheduler = nullptr; -- throw; -- } -+ // } -+ // catch (...) { -+ // accels_clear(); -+ // updateInterface(); -+ // Lock lock(schedulerMutex); -+ // this->scheduler = nullptr; -+ // throw; -+ // } -+ // -- GODOT end -- - } - - #endif -- cgit v1.2.3