23 files changed, 2296 insertions, 24 deletions
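A note on the ConfigFile.xml hunk below: the added documentation states that mutating calls only touch the in-memory state until an explicit save. A minimal C++ sketch of that flow, for reference only (the function name, section/key names and the save path here are made up for illustration and are not part of this patch):

    #include "core/io/config_file.h"

    void config_file_example() {
        Ref<ConfigFile> config;
        config.instantiate();
        // set_value(), clear() and erase_section() only modify the in-memory copy...
        config->set_value("player", "name", "Anonymous");
        // ...nothing reaches the filesystem until save(), save_encrypted() or
        // save_encrypted_pass() is called.
        Error err = config->save("user://example.cfg");
        if (err != OK) {
            // Handle the failed write here.
        }
    }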
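A note on the debugger hunks below: EditorDebuggerNode/EditorDebuggerServer now start from a full URI instead of a protocol plus an editor-settings port, so one string selects the transport (tcp:// or ws://) and the bind host/port. Roughly what the new --debug-server command-line flag ends up calling (a sketch based on the hunks that follow, not code from this patch; the wrapper function name is invented, the example URI comes from the new help text):

    #include "editor/debugger/editor_debugger_node.h"

    void start_debug_server_example() {
        // Equivalent to launching the editor with: --debug-server tcp://127.0.0.1:6007
        // The scheme picks the EditorDebuggerServer implementation; host and port pick the bind address.
        Error err = EditorDebuggerNode::get_singleton()->start("tcp://127.0.0.1:6007");
        if (err != OK) {
            // URIs without "://", with a host that is neither a valid IP nor "*",
            // or that fail String::parse_url() are rejected with ERR_INVALID_PARAMETER.
        }
    }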
diff --git a/doc/classes/ConfigFile.xml b/doc/classes/ConfigFile.xml
index 249e2a8f80..ce976e3d8b 100644
--- a/doc/classes/ConfigFile.xml
+++ b/doc/classes/ConfigFile.xml
@@ -84,6 +84,7 @@
         }
         [/csharp]
         [/codeblocks]
+        Any operation that mutates the ConfigFile, such as [method set_value], [method clear], or [method erase_section], only changes what is loaded in memory. If you want to write the change to a file, you have to save the changes with [method save], [method save_encrypted], or [method save_encrypted_pass].
         Keep in mind that section and property names can't contain spaces. Anything after a space will be ignored on save and on load.
         ConfigFiles can also contain manually written comment lines starting with a semicolon ([code];[/code]). Those lines will be ignored when parsing the file. Note that comments will be lost when saving the ConfigFile. This can still be useful for dedicated server configuration files, which are typically never overwritten without explicit user action.
         [b]Note:[/b] The file extension given to a ConfigFile does not have any impact on its formatting or behavior. By convention, the [code].cfg[/code] extension is used here, but any other extension such as [code].ini[/code] is also valid. Since neither [code].cfg[/code] nor [code].ini[/code] are standardized, Godot's ConfigFile formatting may differ from files written by other programs.
@@ -94,6 +95,7 @@
     <method name="clear">
         <return type="void" />
         <description>
+            Removes the entire contents of the config.
         </description>
     </method>
     <method name="erase_section">
diff --git a/editor/debugger/editor_debugger_node.cpp b/editor/debugger/editor_debugger_node.cpp
index 07c02eb022..be84e8dec5 100644
--- a/editor/debugger/editor_debugger_node.cpp
+++ b/editor/debugger/editor_debugger_node.cpp
@@ -183,16 +183,16 @@ ScriptEditorDebugger *EditorDebuggerNode::get_default_debugger() const {
     return Object::cast_to<ScriptEditorDebugger>(tabs->get_tab_control(0));
 }
 
-Error EditorDebuggerNode::start(const String &p_protocol) {
+Error EditorDebuggerNode::start(const String &p_uri) {
     stop();
+    ERR_FAIL_COND_V(p_uri.find("://") < 0, ERR_INVALID_PARAMETER);
     if (EDITOR_GET("run/output/always_open_output_on_play")) {
         EditorNode::get_singleton()->make_bottom_panel_item_visible(EditorNode::get_log());
     } else {
         EditorNode::get_singleton()->make_bottom_panel_item_visible(this);
     }
-
-    server = Ref<EditorDebuggerServer>(EditorDebuggerServer::create(p_protocol));
-    const Error err = server->start();
+    server = Ref<EditorDebuggerServer>(EditorDebuggerServer::create(p_uri.substr(0, p_uri.find("://") + 3)));
+    const Error err = server->start(p_uri);
     if (err != OK) {
         return err;
     }
diff --git a/editor/debugger/editor_debugger_node.h b/editor/debugger/editor_debugger_node.h
index 39a95326be..4d9e846834 100644
--- a/editor/debugger/editor_debugger_node.h
+++ b/editor/debugger/editor_debugger_node.h
@@ -188,7 +188,7 @@ public:
     void set_camera_override(CameraOverride p_override);
     CameraOverride get_camera_override();
 
-    Error start(const String &p_protocol = "tcp://");
+    Error start(const String &p_uri = "tcp://");
 
     void stop();
diff --git a/editor/debugger/editor_debugger_server.cpp b/editor/debugger/editor_debugger_server.cpp
index e8524e0702..8c3833af50 100644
--- a/editor/debugger/editor_debugger_server.cpp
+++ b/editor/debugger/editor_debugger_server.cpp
@@ -45,7 +45,7 @@ private:
 public:
     static EditorDebuggerServer *create(const String &p_protocol);
     virtual void poll() {}
-    virtual Error start();
+    virtual Error start(const String &p_uri);
     virtual void stop();
     virtual bool is_active() const;
     virtual bool is_connection_available() const;
@@ -63,11 +63,18 @@ EditorDebuggerServerTCP::EditorDebuggerServerTCP() {
     server.instantiate();
 }
 
-Error EditorDebuggerServerTCP::start() {
-    int remote_port = (int)EditorSettings::get_singleton()->get("network/debug/remote_port");
-    const Error err = server->listen(remote_port);
+Error EditorDebuggerServerTCP::start(const String &p_uri) {
+    int bind_port = (int)EditorSettings::get_singleton()->get("network/debug/remote_port");
+    String bind_host = (String)EditorSettings::get_singleton()->get("network/debug/remote_host");
+    if (!p_uri.is_empty() && p_uri != "tcp://") {
+        String scheme, path;
+        Error err = p_uri.parse_url(scheme, bind_host, bind_port, path);
+        ERR_FAIL_COND_V(err != OK, ERR_INVALID_PARAMETER);
+        ERR_FAIL_COND_V(!bind_host.is_valid_ip_address() && bind_host != "*", ERR_INVALID_PARAMETER);
+    }
+    const Error err = server->listen(bind_port, bind_host);
     if (err != OK) {
-        EditorNode::get_log()->add_message(String("Error listening on port ") + itos(remote_port), EditorLog::MSG_TYPE_ERROR);
+        EditorNode::get_log()->add_message(String("Error listening on port ") + itos(bind_port), EditorLog::MSG_TYPE_ERROR);
         return err;
     }
     return err;
diff --git a/editor/debugger/editor_debugger_server.h b/editor/debugger/editor_debugger_server.h
index 6216d0df3d..844d1a9e5a 100644
--- a/editor/debugger/editor_debugger_server.h
+++ b/editor/debugger/editor_debugger_server.h
@@ -48,7 +48,7 @@ public:
     static void register_protocol_handler(const String &p_protocol, CreateServerFunc p_func);
     static EditorDebuggerServer *create(const String &p_protocol);
     virtual void poll() = 0;
-    virtual Error start() = 0;
+    virtual Error start(const String &p_uri = "") = 0;
     virtual void stop() = 0;
     virtual bool is_active() const = 0;
    virtual bool is_connection_available() const = 0;
diff --git a/editor/editor_inspector.cpp b/editor/editor_inspector.cpp
index 298c1ed917..6bbf0f965e 100644
--- a/editor/editor_inspector.cpp
+++ b/editor/editor_inspector.cpp
@@ -1266,9 +1266,13 @@ void EditorInspectorSection::_notification(int p_what) {
             }
             header_height += get_theme_constant(SNAME("vseparation"), SNAME("Tree"));
 
+            Rect2 header_rect = Rect2(Vector2(), Vector2(get_size().width, header_height));
             Color c = bg_color;
             c.a *= 0.4;
-            draw_rect(Rect2(Vector2(), Vector2(get_size().width, header_height)), c);
+            if (foldable && header_rect.has_point(get_local_mouse_position())) {
+                c = c.lightened(Input::get_singleton()->is_mouse_button_pressed(MOUSE_BUTTON_LEFT) ? -0.05 : 0.2);
+            }
+            draw_rect(header_rect, c);
 
             const int arrow_margin = 2;
             const int arrow_width = arrow.is_valid() ? arrow->get_width() : 0;
@@ -1315,12 +1319,14 @@ void EditorInspectorSection::_notification(int p_what) {
             if (dropping) {
                 dropping_unfold_timer->start();
             }
+            update();
         } break;
 
         case NOTIFICATION_MOUSE_EXIT: {
             if (dropping) {
                 dropping_unfold_timer->stop();
             }
+            update();
         } break;
     }
 }
@@ -1395,6 +1401,8 @@ void EditorInspectorSection::gui_input(const Ref<InputEvent> &p_event) {
         } else {
             fold();
         }
+    } else if (mb.is_valid() && !mb->is_pressed()) {
+        update();
     }
 }
diff --git a/main/main.cpp b/main/main.cpp
index fe6df43364..4b035fa511 100644
--- a/main/main.cpp
+++ b/main/main.cpp
@@ -145,6 +145,7 @@ static bool auto_quit = false;
 static OS::ProcessID allow_focus_steal_pid = 0;
 #ifdef TOOLS_ENABLED
 static bool auto_build_solutions = false;
+static String debug_server_uri;
 #endif
 
 // Display
@@ -286,6 +287,7 @@ void Main::print_help(const char *p_binary) {
 #ifdef TOOLS_ENABLED
     OS::get_singleton()->print(" -e, --editor Start the editor instead of running the scene.\n");
     OS::get_singleton()->print(" -p, --project-manager Start the project manager, even if a project is auto-detected.\n");
+    OS::get_singleton()->print(" --debug-server <uri> Start the editor debug server (<protocol>://<host/IP>[:<port>], e.g. tcp://127.0.0.1:6007)\n");
 #endif
     OS::get_singleton()->print(" -q, --quit Quit after the first iteration.\n");
     OS::get_singleton()->print(" -l, --language <locale> Use a specific locale (<locale> being a two-letter code).\n");
@@ -875,6 +877,18 @@ Error Main::setup(const char *execpath, int argc, char *argv[], bool p_second_ph
         } else if (I->get() == "-p" || I->get() == "--project-manager") { // starts project manager
             project_manager = true;
+        } else if (I->get() == "--debug-server") {
+            if (I->next()) {
+                debug_server_uri = I->next()->get();
+                if (debug_server_uri.find("://") == -1) { // wrong address
+                    OS::get_singleton()->print("Invalid debug server uri. 
It should be of the form <protocol>://<bind_address>:<port>.\n"); + goto error; + } + N = I->next()->next(); + } else { + OS::get_singleton()->print("Missing remote debug server uri, aborting.\n"); + goto error; + } } else if (I->get() == "--build-solutions") { // Build the scripting solution such C# auto_build_solutions = true; @@ -2347,6 +2361,9 @@ bool Main::start() { } } DisplayServer::get_singleton()->set_context(DisplayServer::CONTEXT_EDITOR); + if (!debug_server_uri.is_empty()) { + EditorDebuggerNode::get_singleton()->start(debug_server_uri); + } } #endif if (!editor) { diff --git a/modules/mono/csharp_script.cpp b/modules/mono/csharp_script.cpp index fab950019f..17846eb281 100644 --- a/modules/mono/csharp_script.cpp +++ b/modules/mono/csharp_script.cpp @@ -43,8 +43,10 @@ #include "core/os/thread.h" #ifdef TOOLS_ENABLED +#include "core/os/keyboard.h" #include "editor/bindings_generator.h" #include "editor/editor_node.h" +#include "editor/editor_settings.h" #include "editor/node_dock.h" #endif @@ -1353,6 +1355,7 @@ void CSharpLanguage::_editor_init_callback() { // Enable it as a plugin EditorNode::add_editor_plugin(godotsharp_editor); + ED_SHORTCUT("mono/build_solution", TTR("Build Solution"), KEY_MASK_ALT | KEY_B); godotsharp_editor->enable_plugin(); get_singleton()->godotsharp_editor = godotsharp_editor; diff --git a/modules/mono/editor/GodotTools/GodotTools/GodotSharpEditor.cs b/modules/mono/editor/GodotTools/GodotTools/GodotSharpEditor.cs index 73cabf8561..98c6881166 100644 --- a/modules/mono/editor/GodotTools/GodotTools/GodotSharpEditor.cs +++ b/modules/mono/editor/GodotTools/GodotTools/GodotSharpEditor.cs @@ -418,11 +418,15 @@ namespace GodotTools AddToolSubmenuItem("C#", _menuPopup); + var buildSolutionShortcut = (Shortcut)EditorShortcut("mono/build_solution"); + _toolBarBuildButton = new Button { Text = "Build", - HintTooltip = "Build solution", - FocusMode = Control.FocusModeEnum.None + HintTooltip = "Build Solution".TTR(), + FocusMode = Control.FocusModeEnum.None, + Shortcut = buildSolutionShortcut, + ShortcutInTooltip = true }; _toolBarBuildButton.PressedSignal += BuildSolutionPressed; AddControlToContainer(CustomControlContainer.Toolbar, _toolBarBuildButton); diff --git a/modules/mono/editor/GodotTools/GodotTools/Internals/Globals.cs b/modules/mono/editor/GodotTools/GodotTools/Internals/Globals.cs index 793f84fd77..5c5ced8c29 100644 --- a/modules/mono/editor/GodotTools/GodotTools/Internals/Globals.cs +++ b/modules/mono/editor/GodotTools/GodotTools/Internals/Globals.cs @@ -13,6 +13,9 @@ namespace GodotTools.Internals public static object EditorDef(string setting, object defaultValue, bool restartIfChanged = false) => internal_EditorDef(setting, defaultValue, restartIfChanged); + public static object EditorShortcut(string setting) => + internal_EditorShortcut(setting); + [SuppressMessage("ReSharper", "InconsistentNaming")] public static string TTR(this string text) => internal_TTR(text); @@ -28,6 +31,9 @@ namespace GodotTools.Internals private static extern object internal_EditorDef(string setting, object defaultValue, bool restartIfChanged); [MethodImpl(MethodImplOptions.InternalCall)] + private static extern object internal_EditorShortcut(string setting); + + [MethodImpl(MethodImplOptions.InternalCall)] private static extern string internal_TTR(string text); } } diff --git a/modules/mono/editor/editor_internal_calls.cpp b/modules/mono/editor/editor_internal_calls.cpp index 6692a6efec..9a61b63c12 100644 --- a/modules/mono/editor/editor_internal_calls.cpp +++ 
b/modules/mono/editor/editor_internal_calls.cpp @@ -306,6 +306,12 @@ MonoObject *godot_icall_Globals_EditorDef(MonoString *p_setting, MonoObject *p_d return GDMonoMarshal::variant_to_mono_object(result); } +MonoObject *godot_icall_Globals_EditorShortcut(MonoString *p_setting) { + String setting = GDMonoMarshal::mono_string_to_godot(p_setting); + Ref<Shortcut> result = ED_GET_SHORTCUT(setting); + return GDMonoMarshal::variant_to_mono_object(result); +} + MonoString *godot_icall_Globals_TTR(MonoString *p_text) { String text = GDMonoMarshal::mono_string_to_godot(p_text); return GDMonoMarshal::mono_string_from_godot(TTR(text)); @@ -380,6 +386,7 @@ void register_editor_internal_calls() { GDMonoUtils::add_internal_call("GodotTools.Internals.Globals::internal_EditorScale", godot_icall_Globals_EditorScale); GDMonoUtils::add_internal_call("GodotTools.Internals.Globals::internal_GlobalDef", godot_icall_Globals_GlobalDef); GDMonoUtils::add_internal_call("GodotTools.Internals.Globals::internal_EditorDef", godot_icall_Globals_EditorDef); + GDMonoUtils::add_internal_call("GodotTools.Internals.Globals::internal_EditorShortcut", godot_icall_Globals_EditorShortcut); GDMonoUtils::add_internal_call("GodotTools.Internals.Globals::internal_TTR", godot_icall_Globals_TTR); // Utils.OS diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub index 6e7b3e7b8d..1fdc8fe1b3 100644 --- a/modules/raycast/SCsub +++ b/modules/raycast/SCsub @@ -55,6 +55,9 @@ if env["builtin_embree"]: "kernels/bvh/bvh_builder_sah_mb.cpp", "kernels/bvh/bvh_builder_twolevel.cpp", "kernels/bvh/bvh_intersector1_bvh4.cpp", + "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_filters.cpp", ] thirdparty_sources = [thirdparty_dir + file for file in embree_src] diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py index 31a25a318f..e31d88b741 100644 --- a/modules/raycast/godot_update_embree.py +++ b/modules/raycast/godot_update_embree.py @@ -61,6 +61,11 @@ cpp_files = [ "kernels/bvh/bvh_builder_twolevel.cpp", "kernels/bvh/bvh_intersector1.cpp", "kernels/bvh/bvh_intersector1_bvh4.cpp", + "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_filters.cpp", + "kernels/bvh/bvh_intersector_hybrid.cpp", + "kernels/bvh/bvh_intersector_stream.cpp", ] os.chdir("../../thirdparty") @@ -117,7 +122,7 @@ with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file: /* #undef EMBREE_GEOMETRY_INSTANCE */ /* #undef EMBREE_GEOMETRY_GRID */ /* #undef EMBREE_GEOMETRY_POINT */ -/* #undef EMBREE_RAY_PACKETS */ +#define EMBREE_RAY_PACKETS /* #undef EMBREE_COMPACT_POLYS */ #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 @@ -249,3 +254,8 @@ with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as con os.chdir("..") shutil.rmtree("embree-tmp") + +subprocess.run(["git", "restore", "embree/patches"]) + +for patch in os.listdir("embree/patches"): + subprocess.run(["git", "apply", "embree/patches/" + patch]) diff --git a/modules/websocket/editor_debugger_server_websocket.cpp b/modules/websocket/editor_debugger_server_websocket.cpp index 2e61cbfc08..d248433d82 100644 --- a/modules/websocket/editor_debugger_server_websocket.cpp +++ b/modules/websocket/editor_debugger_server_websocket.cpp @@ -48,11 +48,19 @@ void EditorDebuggerServerWebSocket::poll() { server->poll(); } -Error EditorDebuggerServerWebSocket::start() { - int remote_port 
= (int)EditorSettings::get_singleton()->get("network/debug/remote_port"); +Error EditorDebuggerServerWebSocket::start(const String &p_uri) { + int bind_port = (int)EditorSettings::get_singleton()->get("network/debug/remote_port"); + String bind_host = EditorSettings::get_singleton()->get("network/debug/remote_host"); + if (!p_uri.is_empty() && p_uri != "ws://") { + String scheme, path; + Error err = p_uri.parse_url(scheme, bind_host, bind_port, path); + ERR_FAIL_COND_V(err != OK, ERR_INVALID_PARAMETER); + ERR_FAIL_COND_V(!bind_host.is_valid_ip_address() && bind_host != "*", ERR_INVALID_PARAMETER); + } + server->set_bind_ip(bind_host); Vector<String> compatible_protocols; compatible_protocols.push_back("binary"); // compatibility with EMSCRIPTEN TCP-to-WebSocket layer. - return server->listen(remote_port, compatible_protocols); + return server->listen(bind_port, compatible_protocols); } void EditorDebuggerServerWebSocket::stop() { diff --git a/modules/websocket/editor_debugger_server_websocket.h b/modules/websocket/editor_debugger_server_websocket.h index d9543bb647..14ab0109b2 100644 --- a/modules/websocket/editor_debugger_server_websocket.h +++ b/modules/websocket/editor_debugger_server_websocket.h @@ -48,7 +48,7 @@ public: void _peer_disconnected(int p_peer, bool p_was_clean); void poll() override; - Error start() override; + Error start(const String &p_uri) override; void stop() override; bool is_active() const override; bool is_connection_available() const override; diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree3/rtcore_config.h index 3a9819c9f1..62b7b6f4dc 100644 --- a/thirdparty/embree/include/embree3/rtcore_config.h +++ b/thirdparty/embree/include/embree3/rtcore_config.h @@ -6,9 +6,9 @@ #define RTC_VERSION_MAJOR 3 #define RTC_VERSION_MINOR 13 -#define RTC_VERSION_PATCH 0 -#define RTC_VERSION 31300 -#define RTC_VERSION_STRING "3.13.0" +#define RTC_VERSION_PATCH 1 +#define RTC_VERSION 31301 +#define RTC_VERSION_STRING "3.13.1" #define RTC_MAX_INSTANCE_LEVEL_COUNT 1 diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp new file mode 100644 index 0000000000..6e9a5a538e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp @@ -0,0 +1,917 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_hybrid.h" +#include "bvh_traverser1.h" +#include "node_intersector1.h" +#include "node_intersector_packet.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/curveNv_intersector.h" +#include "../geometry/curveNi_intersector.h" +#include "../geometry/curveNi_mb_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" +#include "../geometry/subgrid_intersector.h" +#include "../geometry/subgrid_mb_intersector.h" +#include "../geometry/curve_intersector_virtual.h" + +#define SWITCH_DURING_DOWN_TRAVERSAL 1 +#define FORCE_SINGLE_MODE 0 + +#define ENABLE_FAST_COHERENT_CODEPATHS 1 + +namespace embree +{ + namespace isa + { + template<int N, int K, int types, 
bool robust, typename PrimitiveIntersectorK, bool single> + void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersect1(Accel::Intersectors* This, + const BVH* bvh, + NodeRef root, + size_t k, + Precalculations& pre, + RayHitK<K>& ray, + const TravRayK<K, robust>& tray, + IntersectContext* context) + { + /* stack state */ + StackItemT<NodeRef> stack[stackSizeSingle]; // stack of nodes + StackItemT<NodeRef>* stackPtr = stack + 1; // current stack pointer + StackItemT<NodeRef>* stackEnd = stack + stackSizeSingle; + stack[0].ptr = root; + stack[0].dist = neg_inf; + + /* load the ray into SIMD registers */ + TravRay<N,robust> tray1; + tray1.template init<K>(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ + if (unlikely(*(float*)&stackPtr->dist > ray.tfar[k])) + continue; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<N> tNear; + STAT3(normal.trav_nodes, 1, 1, 1); + bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray1, ray.time()[k], tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + BVHNNodeTraverser1Hit<N, types>::traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves, 1, 1, 1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + + size_t lazy_node = 0; + PrimitiveIntersectorK::intersect(This, pre, ray, k, context, prim, num, tray1, lazy_node); + + tray1.tfar = ray.tfar[k]; + + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + } + + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> + void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersect(vint<K>* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayHitK<K>& __restrict__ ray, + IntersectContext* __restrict__ context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + +#if ENABLE_FAST_COHERENT_CODEPATHS == 1 + assert(context); + if (unlikely(types == BVH_AN1 && context->user && context->isCoherent())) + { + intersectCoherent(valid_i, This, ray, context); + return; + } +#endif + + /* filter out invalid rays */ + vbool<K> valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + size_t valid_bits = movemask(valid); + +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(movemask(valid))], 1, 1, 1); +#endif + + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid, ray); + + /* load ray */ + TravRayK<K, robust> tray(ray.org, ray.dir, single ? 
N : 0); + const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat<K> org_ray_tfar = max(ray.tfar , 0.0f); + + if (single) + { + tray.tnear = select(valid, org_ray_tnear, vfloat<K>(pos_inf)); + tray.tfar = select(valid, org_ray_tfar , vfloat<K>(neg_inf)); + + for (; valid_bits!=0; ) { + const size_t i = bscf(valid_bits); + intersect1(This, bvh, bvh->root, i, pre, ray, tray, context); + } + return; + } + + /* determine switch threshold based on flags */ + const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent; + + vint<K> octant = ray.octant(); + octant = select(valid, octant, vint<K>(0xffffffff)); + + /* test whether we have ray with opposing direction signs in the packet */ + bool split = false; + { + size_t bits = valid_bits; + vbool<K> vsplit( false ); + do + { + const size_t valid_index = bsf(bits); + vbool<K> octant_valid = octant[valid_index] == octant; + bits &= ~(size_t)movemask(octant_valid); + vsplit |= vint<K>(octant[valid_index]) == (octant^vint<K>(0x7)); + } while (bits); + if (any(vsplit)) split = true; + } + + do + { + const size_t valid_index = bsf(valid_bits); + const vint<K> diff_octant = vint<K>(octant[valid_index])^octant; + const vint<K> count_diff_octant = \ + ((diff_octant >> 2) & 1) + + ((diff_octant >> 1) & 1) + + ((diff_octant >> 0) & 1); + + vbool<K> octant_valid = (count_diff_octant <= 1) & (octant != vint<K>(0xffffffff)); + if (!single || !split) octant_valid = valid; // deactivate octant sorting in pure chunk mode, otherwise instance traversal performance goes down + + + octant = select(octant_valid,vint<K>(0xffffffff),octant); + valid_bits &= ~(size_t)movemask(octant_valid); + + tray.tnear = select(octant_valid, org_ray_tnear, vfloat<K>(pos_inf)); + tray.tfar = select(octant_valid, org_ray_tfar , vfloat<K>(neg_inf)); + + /* allocate stack and push root node */ + vfloat<K> stack_near[stackSizeChunk]; + NodeRef stack_node[stackSizeChunk]; + stack_node[0] = BVH::invalidNode; + stack_near[0] = inf; + stack_node[1] = bvh->root; + stack_near[1] = tray.tnear; + NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk; + NodeRef* __restrict__ sptr_node = stack_node + 2; + vfloat<K>* __restrict__ sptr_near = stack_near + 2; + + while (1) pop: + { + /* pop next node from stack */ + assert(sptr_node > stack_node); + sptr_node--; + sptr_near--; + NodeRef cur = *sptr_node; + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + /* cull node if behind closest hit point */ + vfloat<K> curDist = *sptr_near; + const vbool<K> active = curDist < tray.tfar; + if (unlikely(none(active))) + continue; + + /* switch to single ray traversal */ +#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if FORCE_SINGLE_MODE == 0 + if (single) +#endif + { + size_t bits = movemask(active); +#if FORCE_SINGLE_MODE == 0 + if (unlikely(popcnt(bits) <= switchThreshold)) +#endif + { + for (; bits!=0; ) { + const size_t i = bscf(bits); + intersect1(This, bvh, cur, i, pre, ray, tray, context); + } + tray.tfar = min(tray.tfar, ray.tfar); + continue; + } + } +#endif + while (likely(!cur.isLeaf())) + { + /* process nodes */ + const vbool<K> valid_node = tray.tfar > curDist; + STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const BaseNode* __restrict__ const node = nodeRef.baseNode(); + + /* set cur to invalid */ + cur = BVH::emptyNode; + curDist = pos_inf; + + size_t num_child_hits = 0; + + for (unsigned i = 0; i < N; i++) + { + const NodeRef 
child = node->children[i]; + if (unlikely(child == BVH::emptyNode)) break; + vfloat<K> lnearP; + vbool<K> lhit = valid_node; + BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + /* if we hit the child we choose to continue with that child if it + is closer than the current next child, or we push it onto the stack */ + if (likely(any(lhit))) + { + assert(sptr_node < stackEnd); + assert(child != BVH::emptyNode); + const vfloat<K> childDist = select(lhit, lnearP, inf); + /* push cur node onto stack and continue with hit child */ + if (any(childDist < curDist)) + { + if (likely(cur != BVH::emptyNode)) { + num_child_hits++; + *sptr_node = cur; sptr_node++; + *sptr_near = curDist; sptr_near++; + } + curDist = childDist; + cur = child; + } + + /* push hit child onto stack */ + else { + num_child_hits++; + *sptr_node = child; sptr_node++; + *sptr_near = childDist; sptr_near++; + } + } + } + +#if defined(__AVX__) + //STAT3(normal.trav_hit_boxes[num_child_hits], 1, 1, 1); +#endif + + if (unlikely(cur == BVH::emptyNode)) + goto pop; + + /* improved distance sorting for 3 or more hits */ + if (unlikely(num_child_hits >= 2)) + { + if (any(sptr_near[-2] < sptr_near[-1])) + { + std::swap(sptr_near[-2],sptr_near[-1]); + std::swap(sptr_node[-2],sptr_node[-1]); + } + if (unlikely(num_child_hits >= 3)) + { + if (any(sptr_near[-3] < sptr_near[-1])) + { + std::swap(sptr_near[-3],sptr_near[-1]); + std::swap(sptr_node[-3],sptr_node[-1]); + } + if (any(sptr_near[-3] < sptr_near[-2])) + { + std::swap(sptr_near[-3],sptr_near[-2]); + std::swap(sptr_node[-3],sptr_node[-2]); + } + } + } + +#if SWITCH_DURING_DOWN_TRAVERSAL == 1 + if (single) + { + // seems to be the best place for testing utilization + if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold)) + { + *sptr_node++ = cur; + *sptr_near++ = curDist; + goto pop; + } + } +#endif + } + + /* return if stack is empty */ + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + /* intersect leaf */ + assert(cur != BVH::emptyNode); + const vbool<K> valid_leaf = tray.tfar > curDist; + STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K); + if (unlikely(none(valid_leaf))) continue; + size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + + size_t lazy_node = 0; + PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node); + tray.tfar = select(valid_leaf, ray.tfar, tray.tfar); + + if (unlikely(lazy_node)) { + *sptr_node = lazy_node; sptr_node++; + *sptr_near = neg_inf; sptr_near++; + } + } + } while(valid_bits); + } + + + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> + void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersectCoherent(vint<K>* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayHitK<K>& __restrict__ ray, + IntersectContext* context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* filter out invalid rays */ + vbool<K> valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + size_t valid_bits = movemask(valid); + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid, ray); + + /* load ray */ + TravRayK<K, robust> 
tray(ray.org, ray.dir, single ? N : 0); + const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat<K> org_ray_tfar = max(ray.tfar , 0.0f); + + vint<K> octant = ray.octant(); + octant = select(valid, octant, vint<K>(0xffffffff)); + + do + { + const size_t valid_index = bsf(valid_bits); + const vbool<K> octant_valid = octant[valid_index] == octant; + valid_bits &= ~(size_t)movemask(octant_valid); + + tray.tnear = select(octant_valid, org_ray_tnear, vfloat<K>(pos_inf)); + tray.tfar = select(octant_valid, org_ray_tfar , vfloat<K>(neg_inf)); + + Frustum<robust> frustum; + frustum.template init<K>(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N); + + StackItemT<NodeRef> stack[stackSizeSingle]; // stack of nodes + StackItemT<NodeRef>* stackPtr = stack + 1; // current stack pointer + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + while (1) pop: + { + /* pop next node from stack */ + if (unlikely(stackPtr == stack)) break; + + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* cull node if behind closest hit point */ + vfloat<K> curDist = *(float*)&stackPtr->dist; + const vbool<K> active = curDist < tray.tfar; + if (unlikely(none(active))) continue; + + while (likely(!cur.isLeaf())) + { + /* process nodes */ + //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const AABBNode* __restrict__ const node = nodeRef.getAABBNode(); + + vfloat<N> fmin; + size_t m_frustum_node = intersectNodeFrustum<N>(node, frustum, fmin); + + if (unlikely(!m_frustum_node)) goto pop; + cur = BVH::emptyNode; + curDist = pos_inf; + +#if defined(__AVX__) + //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1); +#endif + size_t num_child_hits = 0; + do { + const size_t i = bscf(m_frustum_node); + vfloat<K> lnearP; + vbool<K> lhit = false; // motion blur is not supported, so the initial value will be ignored + STAT3(normal.trav_nodes, 1, 1, 1); + BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + if (likely(any(lhit))) + { + const vfloat<K> childDist = fmin[i]; + const NodeRef child = node->child(i); + BVHN<N>::prefetch(child); + if (any(childDist < curDist)) + { + if (likely(cur != BVH::emptyNode)) { + num_child_hits++; + stackPtr->ptr = cur; + *(float*)&stackPtr->dist = toScalar(curDist); + stackPtr++; + } + curDist = childDist; + cur = child; + } + /* push hit child onto stack */ + else { + num_child_hits++; + stackPtr->ptr = child; + *(float*)&stackPtr->dist = toScalar(childDist); + stackPtr++; + } + } + } while(m_frustum_node); + + if (unlikely(cur == BVH::emptyNode)) goto pop; + + /* improved distance sorting for 3 or more hits */ + if (unlikely(num_child_hits >= 2)) + { + if (stackPtr[-2].dist < stackPtr[-1].dist) + std::swap(stackPtr[-2],stackPtr[-1]); + if (unlikely(num_child_hits >= 3)) + { + if (stackPtr[-3].dist < stackPtr[-1].dist) + std::swap(stackPtr[-3],stackPtr[-1]); + if (stackPtr[-3].dist < stackPtr[-2].dist) + std::swap(stackPtr[-3],stackPtr[-2]); + } + } + } + + /* intersect leaf */ + assert(cur != BVH::invalidNode); + assert(cur != BVH::emptyNode); + const vbool<K> valid_leaf = tray.tfar > curDist; + STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K); + if (unlikely(none(valid_leaf))) continue; + size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + + size_t lazy_node = 0; + PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node); + + /* reduce max distance interval on successful intersection */ + if 
(likely(any((ray.tfar < tray.tfar) & valid_leaf))) + { + tray.tfar = select(valid_leaf, ray.tfar, tray.tfar); + frustum.template updateMaxDist<K>(tray.tfar); + } + + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + + } while(valid_bits); + } + + // =================================================================================================================================================================== + // =================================================================================================================================================================== + // =================================================================================================================================================================== + + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> + bool BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occluded1(Accel::Intersectors* This, + const BVH* bvh, + NodeRef root, + size_t k, + Precalculations& pre, + RayK<K>& ray, + const TravRayK<K, robust>& tray, + IntersectContext* context) + { + /* stack state */ + NodeRef stack[stackSizeSingle]; // stack of nodes that still need to get traversed + NodeRef* stackPtr = stack+1; // current stack pointer + NodeRef* stackEnd = stack+stackSizeSingle; + stack[0] = root; + + /* load the ray into SIMD registers */ + TravRay<N,robust> tray1; + tray1.template init<K>(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = (NodeRef)*stackPtr; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<N> tNear; + STAT3(shadow.trav_nodes, 1, 1, 1); + bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray1, ray.time()[k], tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + BVHNNodeTraverser1Hit<N, types>::traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves, 1, 1, 1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + + size_t lazy_node = 0; + if (PrimitiveIntersectorK::occluded(This, pre, ray, k, context, prim, num, tray1, lazy_node)) { + ray.tfar[k] = neg_inf; + return true; + } + + if (unlikely(lazy_node)) { + *stackPtr = lazy_node; + stackPtr++; + } + } + return false; + } + + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> + void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occluded(vint<K>* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayK<K>& __restrict__ ray, + IntersectContext* context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + +#if ENABLE_FAST_COHERENT_CODEPATHS == 1 + assert(context); + if (unlikely(types == BVH_AN1 && context->user && context->isCoherent())) + { + occludedCoherent(valid_i, This, ray, context); + return; + } +#endif + + /* filter out already occluded and invalid rays */ + vbool<K> valid = (*valid_i == -1) & (ray.tfar >= 0.0f); +#if 
defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + const size_t valid_bits = movemask(valid); + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid, ray); + + /* load ray */ + TravRayK<K, robust> tray(ray.org, ray.dir, single ? N : 0); + const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat<K> org_ray_tfar = max(ray.tfar , 0.0f); + + tray.tnear = select(valid, org_ray_tnear, vfloat<K>(pos_inf)); + tray.tfar = select(valid, org_ray_tfar , vfloat<K>(neg_inf)); + + vbool<K> terminated = !valid; + const vfloat<K> inf = vfloat<K>(pos_inf); + + /* determine switch threshold based on flags */ + const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent; + + /* allocate stack and push root node */ + vfloat<K> stack_near[stackSizeChunk]; + NodeRef stack_node[stackSizeChunk]; + stack_node[0] = BVH::invalidNode; + stack_near[0] = inf; + stack_node[1] = bvh->root; + stack_near[1] = tray.tnear; + NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk; + NodeRef* __restrict__ sptr_node = stack_node + 2; + vfloat<K>* __restrict__ sptr_near = stack_near + 2; + + while (1) pop: + { + /* pop next node from stack */ + assert(sptr_node > stack_node); + sptr_node--; + sptr_near--; + NodeRef cur = *sptr_node; + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + /* cull node if behind closest hit point */ + vfloat<K> curDist = *sptr_near; + const vbool<K> active = curDist < tray.tfar; + if (unlikely(none(active))) + continue; + + /* switch to single ray traversal */ +#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if FORCE_SINGLE_MODE == 0 + if (single) +#endif + { + size_t bits = movemask(active); +#if FORCE_SINGLE_MODE == 0 + if (unlikely(popcnt(bits) <= switchThreshold)) +#endif + { + for (; bits!=0; ) { + const size_t i = bscf(bits); + if (occluded1(This, bvh, cur, i, pre, ray, tray, context)) + set(terminated, i); + } + if (all(terminated)) break; + tray.tfar = select(terminated, vfloat<K>(neg_inf), tray.tfar); + continue; + } + } +#endif + + while (likely(!cur.isLeaf())) + { + /* process nodes */ + const vbool<K> valid_node = tray.tfar > curDist; + STAT3(shadow.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const BaseNode* __restrict__ const node = nodeRef.baseNode(); + + /* set cur to invalid */ + cur = BVH::emptyNode; + curDist = pos_inf; + + for (unsigned i = 0; i < N; i++) + { + const NodeRef child = node->children[i]; + if (unlikely(child == BVH::emptyNode)) break; + vfloat<K> lnearP; + vbool<K> lhit = valid_node; + BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + /* if we hit the child we push the previously hit node onto the stack, and continue with the currently hit child */ + if (likely(any(lhit))) + { + assert(sptr_node < stackEnd); + assert(child != BVH::emptyNode); + const vfloat<K> childDist = select(lhit, lnearP, inf); + + /* push 'cur' node onto stack and continue with hit child */ + if (likely(cur != BVH::emptyNode)) { + *sptr_node = cur; sptr_node++; + *sptr_near = curDist; sptr_near++; + } + curDist = childDist; + cur = child; + } + } + if (unlikely(cur == BVH::emptyNode)) + goto pop; + +#if 
SWITCH_DURING_DOWN_TRAVERSAL == 1 + if (single) + { + // seems to be the best place for testing utilization + if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold)) + { + *sptr_node++ = cur; + *sptr_near++ = curDist; + goto pop; + } + } +#endif + } + + /* return if stack is empty */ + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + + /* intersect leaf */ + assert(cur != BVH::emptyNode); + const vbool<K> valid_leaf = tray.tfar > curDist; + STAT3(shadow.trav_leaves, 1, popcnt(valid_leaf), K); + if (unlikely(none(valid_leaf))) continue; + size_t items; const Primitive* prim = (Primitive*) cur.leaf(items); + + size_t lazy_node = 0; + terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node); + if (all(terminated)) break; + tray.tfar = select(terminated, vfloat<K>(neg_inf), tray.tfar); // ignore node intersections for terminated rays + + if (unlikely(lazy_node)) { + *sptr_node = lazy_node; sptr_node++; + *sptr_near = neg_inf; sptr_near++; + } + } + + vfloat<K>::store(valid & terminated, &ray.tfar, neg_inf); + } + + + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> + void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occludedCoherent(vint<K>* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayK<K>& __restrict__ ray, + IntersectContext* context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* filter out invalid rays */ + vbool<K> valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + size_t valid_bits = movemask(valid); + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid,ray); + + /* load ray */ + TravRayK<K, robust> tray(ray.org, ray.dir, single ? 
N : 0); + const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat<K> org_ray_tfar = max(ray.tfar , 0.0f); + + vbool<K> terminated = !valid; + + vint<K> octant = ray.octant(); + octant = select(valid, octant, vint<K>(0xffffffff)); + + do + { + const size_t valid_index = bsf(valid_bits); + vbool<K> octant_valid = octant[valid_index] == octant; + valid_bits &= ~(size_t)movemask(octant_valid); + + tray.tnear = select(octant_valid, org_ray_tnear, vfloat<K>(pos_inf)); + tray.tfar = select(octant_valid, org_ray_tfar, vfloat<K>(neg_inf)); + + Frustum<robust> frustum; + frustum.template init<K>(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N); + + StackItemMaskT<NodeRef> stack[stackSizeSingle]; // stack of nodes + StackItemMaskT<NodeRef>* stackPtr = stack + 1; // current stack pointer + stack[0].ptr = bvh->root; + stack[0].mask = movemask(octant_valid); + + while (1) pop: + { + /* pop next node from stack */ + if (unlikely(stackPtr == stack)) break; + + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* cull node of active rays have already been terminated */ + size_t m_active = (size_t)stackPtr->mask & (~(size_t)movemask(terminated)); + + if (unlikely(m_active == 0)) continue; + + while (likely(!cur.isLeaf())) + { + /* process nodes */ + //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const AABBNode* __restrict__ const node = nodeRef.getAABBNode(); + + vfloat<N> fmin; + size_t m_frustum_node = intersectNodeFrustum<N>(node, frustum, fmin); + + if (unlikely(!m_frustum_node)) goto pop; + cur = BVH::emptyNode; + m_active = 0; + +#if defined(__AVX__) + //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1); +#endif + size_t num_child_hits = 0; + do { + const size_t i = bscf(m_frustum_node); + vfloat<K> lnearP; + vbool<K> lhit = false; // motion blur is not supported, so the initial value will be ignored + STAT3(normal.trav_nodes, 1, 1, 1); + BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + if (likely(any(lhit))) + { + const NodeRef child = node->child(i); + assert(child != BVH::emptyNode); + BVHN<N>::prefetch(child); + if (likely(cur != BVH::emptyNode)) { + num_child_hits++; + stackPtr->ptr = cur; + stackPtr->mask = m_active; + stackPtr++; + } + cur = child; + m_active = movemask(lhit); + } + } while(m_frustum_node); + + if (unlikely(cur == BVH::emptyNode)) goto pop; + } + + /* intersect leaf */ + assert(cur != BVH::invalidNode); + assert(cur != BVH::emptyNode); +#if defined(__AVX__) + STAT3(normal.trav_leaves, 1, popcnt(m_active), K); +#endif + if (unlikely(!m_active)) continue; + size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + + size_t lazy_node = 0; + terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node); + octant_valid &= !terminated; + if (unlikely(none(octant_valid))) break; + tray.tfar = select(terminated, vfloat<K>(neg_inf), tray.tfar); // ignore node intersections for terminated rays + + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->mask = movemask(octant_valid); + stackPtr++; + } + } + } while(valid_bits); + + vfloat<K>::store(valid & terminated, &ray.tfar, neg_inf); + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp new file mode 100644 index 0000000000..2137da6a25 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp @@ -0,0 
+1,59 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_hybrid.cpp" + +namespace embree +{ + namespace isa + { + //////////////////////////////////////////////////////////////////////////////// + /// BVH4Intersector4 Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoellerNoFilter, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller <4 COMMA 4 COMMA false> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMvIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoellerNoFilter,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA false> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA 
true COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridPluecker,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersectorK<4> >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersectorK<4> >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersectorK<4> >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersectorK<4> >)); + + //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>)); + //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>)); + + IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4> >)); + IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4MB> >)); + + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorK<4>> >)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorKMB<4>> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >)); + //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersectorKPluecker <4 COMMA 4 COMMA true> >)); + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA 
SubGridIntersectorKPluecker <4 COMMA 4 COMMA true> >)); + + } +} + diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp new file mode 100644 index 0000000000..4a74d8468d --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp @@ -0,0 +1,528 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" + +#include "../common/scene.h" +#include <bitset> + +namespace embree +{ + namespace isa + { + __aligned(64) static const int shiftTable[32] = { + (int)1 << 0, (int)1 << 1, (int)1 << 2, (int)1 << 3, (int)1 << 4, (int)1 << 5, (int)1 << 6, (int)1 << 7, + (int)1 << 8, (int)1 << 9, (int)1 << 10, (int)1 << 11, (int)1 << 12, (int)1 << 13, (int)1 << 14, (int)1 << 15, + (int)1 << 16, (int)1 << 17, (int)1 << 18, (int)1 << 19, (int)1 << 20, (int)1 << 21, (int)1 << 22, (int)1 << 23, + (int)1 << 24, (int)1 << 25, (int)1 << 26, (int)1 << 27, (int)1 << 28, (int)1 << 29, (int)1 << 30, (int)1 << 31 + }; + + template<int N, int types, bool robust, typename PrimitiveIntersector> + __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::intersect(Accel::Intersectors* __restrict__ This, + RayHitN** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + /* we may traverse an empty BVH in case all geometry was invalid */ + BVH* __restrict__ bvh = (BVH*) This->ptr; + if (bvh->root == BVH::emptyNode) + return; + + // Only the coherent code path is implemented + assert(context->isCoherent()); + intersectCoherent(This, (RayHitK<VSIZEL>**)inputPackets, numOctantRays, context); + } + + template<int N, int types, bool robust, typename PrimitiveIntersector> + template<int K> + __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::intersectCoherent(Accel::Intersectors* __restrict__ This, + RayHitK<K>** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + assert(context->isCoherent()); + + BVH* __restrict__ bvh = (BVH*) This->ptr; + __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; // stack of nodes + assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE); + + __aligned(64) TravRayKStream<K, robust> packets[MAX_INTERNAL_STREAM_SIZE/K]; + __aligned(64) Frustum<robust> frustum; + + bool commonOctant = true; + const size_t m_active = initPacketsAndFrustum((RayK<K>**)inputPackets, numOctantRays, packets, frustum, commonOctant); + if (unlikely(m_active == 0)) return; + + /* case of non-common origin */ + if (unlikely(!commonOctant)) + { + const size_t numPackets = (numOctantRays+K-1)/K; + for (size_t i = 0; i < numPackets; i++) + This->intersect(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context); + return; + } + + stack[0].mask = m_active; + stack[0].parent = 0; + stack[0].child = bvh->root; + + /////////////////////////////////////////////////////////////////////////////////// + 
/////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + + StackItemMaskCoherent* stackPtr = stack + 1; + + while (1) pop: + { + if (unlikely(stackPtr == stack)) break; + + STAT3(normal.trav_stack_pop,1,1,1); + stackPtr--; + /*! pop next node */ + NodeRef cur = NodeRef(stackPtr->child); + size_t m_trav_active = stackPtr->mask; + assert(m_trav_active); + NodeRef parent = stackPtr->parent; + + while (1) + { + if (unlikely(cur.isLeaf())) break; + const AABBNode* __restrict__ const node = cur.getAABBNode(); + parent = cur; + + __aligned(64) size_t maskK[N]; + for (size_t i = 0; i < N; i++) + maskK[i] = m_trav_active; + vfloat<N> dist; + const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist); + if (unlikely(m_node_hit == 0)) goto pop; + + BVHNNodeTraverserStreamHitCoherent<N, types>::traverseClosestHit(cur, m_trav_active, vbool<N>((int)m_node_hit), dist, (size_t*)maskK, stackPtr); + assert(m_trav_active); + } + + /* non-root and leaf => full culling test for all rays */ + if (unlikely(parent != 0 && cur.isLeaf())) + { + const AABBNode* __restrict__ const node = parent.getAABBNode(); + size_t boxID = 0xff; + for (size_t i = 0; i < N; i++) + if (node->child(i) == cur) { boxID = i; break; } + assert(boxID < N); + assert(cur == node->child(boxID)); + m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf); + } + + /*! this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves, 1, 1, 1); + size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num); + + size_t bits = m_trav_active; + + /*! intersect stream of rays with all primitives */ + size_t lazy_node = 0; +#if defined(__SSE4_2__) + STAT_USER(1,(popcnt(bits)+K-1)/K*4); +#endif + while(bits) + { + size_t i = bsf(bits) / K; + const size_t m_isec = ((((size_t)1 << K)-1) << (i*K)); + assert(m_isec & bits); + bits &= ~m_isec; + + TravRayKStream<K, robust>& p = packets[i]; + vbool<K> m_valid = p.tnear <= p.tfar; + PrimitiveIntersectorK<K>::intersectK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node); + p.tfar = min(p.tfar, inputPackets[i]->tfar); + }; + + } // traversal + intersection + } + + template<int N, int types, bool robust, typename PrimitiveIntersector> + __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occluded(Accel::Intersectors* __restrict__ This, + RayN** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + /* we may traverse an empty BVH in case all geometry was invalid */ + BVH* __restrict__ bvh = (BVH*) This->ptr; + if (bvh->root == BVH::emptyNode) + return; + + if (unlikely(context->isCoherent())) + occludedCoherent(This, (RayK<VSIZEL>**)inputPackets, numOctantRays, context); + else + occludedIncoherent(This, (RayK<VSIZEX>**)inputPackets, numOctantRays, context); + } + + template<int N, int types, bool robust, typename PrimitiveIntersector> + template<int K> + __noinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occludedCoherent(Accel::Intersectors* __restrict__ This, + RayK<K>** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + assert(context->isCoherent()); + + BVH* __restrict__ bvh = (BVH*)This->ptr; + __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; // stack of nodes + assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE); + + /* inactive rays should have been filtered out before */ + 
__aligned(64) TravRayKStream<K, robust> packets[MAX_INTERNAL_STREAM_SIZE/K]; + __aligned(64) Frustum<robust> frustum; + + bool commonOctant = true; + size_t m_active = initPacketsAndFrustum(inputPackets, numOctantRays, packets, frustum, commonOctant); + + /* valid rays */ + if (unlikely(m_active == 0)) return; + + /* case of non-common origin */ + if (unlikely(!commonOctant)) + { + const size_t numPackets = (numOctantRays+K-1)/K; + for (size_t i = 0; i < numPackets; i++) + This->occluded(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context); + return; + } + + stack[0].mask = m_active; + stack[0].parent = 0; + stack[0].child = bvh->root; + + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + + StackItemMaskCoherent* stackPtr = stack + 1; + + while (1) pop: + { + if (unlikely(stackPtr == stack)) break; + + STAT3(normal.trav_stack_pop,1,1,1); + stackPtr--; + /*! pop next node */ + NodeRef cur = NodeRef(stackPtr->child); + size_t m_trav_active = stackPtr->mask & m_active; + if (unlikely(!m_trav_active)) continue; + assert(m_trav_active); + NodeRef parent = stackPtr->parent; + + while (1) + { + if (unlikely(cur.isLeaf())) break; + const AABBNode* __restrict__ const node = cur.getAABBNode(); + parent = cur; + + __aligned(64) size_t maskK[N]; + for (size_t i = 0; i < N; i++) + maskK[i] = m_trav_active; + + vfloat<N> dist; + const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist); + if (unlikely(m_node_hit == 0)) goto pop; + + BVHNNodeTraverserStreamHitCoherent<N, types>::traverseAnyHit(cur, m_trav_active, vbool<N>((int)m_node_hit), (size_t*)maskK, stackPtr); + assert(m_trav_active); + } + + /* non-root and leaf => full culling test for all rays */ + if (unlikely(parent != 0 && cur.isLeaf())) + { + const AABBNode* __restrict__ const node = parent.getAABBNode(); + size_t boxID = 0xff; + for (size_t i = 0; i < N; i++) + if (node->child(i) == cur) { boxID = i; break; } + assert(boxID < N); + assert(cur == node->child(boxID)); + m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf); + } + + /*! this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves, 1, 1, 1); + size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num); + + size_t bits = m_trav_active & m_active; + /*! 
intersect stream of rays with all primitives */ + size_t lazy_node = 0; +#if defined(__SSE4_2__) + STAT_USER(1,(popcnt(bits)+K-1)/K*4); +#endif + while (bits) + { + size_t i = bsf(bits) / K; + const size_t m_isec = ((((size_t)1 << K)-1) << (i*K)); + assert(m_isec & bits); + bits &= ~m_isec; + TravRayKStream<K, robust>& p = packets[i]; + vbool<K> m_valid = p.tnear <= p.tfar; + vbool<K> m_hit = PrimitiveIntersectorK<K>::occludedK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node); + inputPackets[i]->tfar = select(m_hit & m_valid, vfloat<K>(neg_inf), inputPackets[i]->tfar); + m_active &= ~((size_t)movemask(m_hit) << (i*K)); + } + + } // traversal + intersection + } + + + template<int N, int types, bool robust, typename PrimitiveIntersector> + template<int K> + __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occludedIncoherent(Accel::Intersectors* __restrict__ This, + RayK<K>** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + assert(!context->isCoherent()); + assert(types & BVH_FLAG_ALIGNED_NODE); + + __aligned(64) TravRayKStream<K,robust> packet[MAX_INTERNAL_STREAM_SIZE/K]; + + assert(numOctantRays <= 32); + const size_t numPackets = (numOctantRays+K-1)/K; + size_t m_active = 0; + for (size_t i = 0; i < numPackets; i++) + { + const vfloat<K> tnear = inputPackets[i]->tnear(); + const vfloat<K> tfar = inputPackets[i]->tfar; + vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f); + m_active |= (size_t)movemask(m_valid) << (K*i); + const Vec3vf<K>& org = inputPackets[i]->org; + const Vec3vf<K>& dir = inputPackets[i]->dir; + vfloat<K> packet_min_dist = max(tnear, 0.0f); + vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf); + new (&packet[i]) TravRayKStream<K,robust>(org, dir, packet_min_dist, packet_max_dist); + } + + BVH* __restrict__ bvh = (BVH*)This->ptr; + + StackItemMaskT<NodeRef> stack[stackSizeSingle]; // stack of nodes + StackItemMaskT<NodeRef>* stackPtr = stack + 1; // current stack pointer + stack[0].ptr = bvh->root; + stack[0].mask = m_active; + + size_t terminated = ~m_active; + + /* near/far offsets based on first ray */ + const NearFarPrecalculations nf(Vec3fa(packet[0].rdir.x[0], packet[0].rdir.y[0], packet[0].rdir.z[0]), N); + + while (1) pop: + { + if (unlikely(stackPtr == stack)) break; + STAT3(shadow.trav_stack_pop,1,1,1); + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + size_t cur_mask = stackPtr->mask & (~terminated); + if (unlikely(cur_mask == 0)) continue; + + while (true) + { + /*! stop if we found a leaf node */ + if (unlikely(cur.isLeaf())) break; + const AABBNode* __restrict__ const node = cur.getAABBNode(); + + const vint<N> vmask = traverseIncoherentStream(cur_mask, packet, node, nf, shiftTable); + + size_t mask = movemask(vmask != vint<N>(zero)); + if (unlikely(mask == 0)) goto pop; + + __aligned(64) unsigned int child_mask[N]; + vint<N>::storeu(child_mask, vmask); // this explicit store here causes much better code generation + + /*! 
one child is hit, continue with that child */ + size_t r = bscf(mask); + assert(r < N); + cur = node->child(r); + BVHN<N>::prefetch(cur,types); + cur_mask = child_mask[r]; + + /* simple in order sequence */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) continue; + stackPtr->ptr = cur; + stackPtr->mask = cur_mask; + stackPtr++; + + for (; ;) + { + r = bscf(mask); + assert(r < N); + + cur = node->child(r); + BVHN<N>::prefetch(cur,types); + cur_mask = child_mask[r]; + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) break; + stackPtr->ptr = cur; + stackPtr->mask = cur_mask; + stackPtr++; + } + } + + /*! this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves,1,1,1); + size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num); + + size_t bits = cur_mask; + size_t lazy_node = 0; + + for (; bits != 0;) + { + const size_t rayID = bscf(bits); + + RayK<K> &ray = *inputPackets[rayID / K]; + const size_t k = rayID % K; + if (PrimitiveIntersectorK<K>::occluded(This, ray, k, context, prim, num, lazy_node)) + { + ray.tfar[k] = neg_inf; + terminated |= (size_t)1 << rayID; + } + + /* lazy node */ + if (unlikely(lazy_node)) + { + stackPtr->ptr = lazy_node; + stackPtr->mask = cur_mask; + stackPtr++; + } + } + + if (unlikely(terminated == (size_t)-1)) break; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// ArrayIntersectorKStream Definitions + //////////////////////////////////////////////////////////////////////////////// + + template<bool filter> + struct Triangle4IntersectorStreamMoeller { + template<int K> using Type = ArrayIntersectorKStream<K,TriangleMIntersectorKMoeller<4 COMMA K COMMA true>>; + }; + + template<bool filter> + struct Triangle4vIntersectorStreamPluecker { + template<int K> using Type = ArrayIntersectorKStream<K,TriangleMvIntersectorKPluecker<4 COMMA K COMMA true>>; + }; + + template<bool filter> + struct Triangle4iIntersectorStreamMoeller { + template<int K> using Type = ArrayIntersectorKStream<K,TriangleMiIntersectorKMoeller<4 COMMA K COMMA true>>; + }; + + template<bool filter> + struct Triangle4iIntersectorStreamPluecker { + template<int K> using Type = ArrayIntersectorKStream<K,TriangleMiIntersectorKPluecker<4 COMMA K COMMA true>>; + }; + + template<bool filter> + struct Quad4vIntersectorStreamMoeller { + template<int K> using Type = ArrayIntersectorKStream<K,QuadMvIntersectorKMoeller<4 COMMA K COMMA true>>; + }; + + template<bool filter> + struct Quad4iIntersectorStreamMoeller { + template<int K> using Type = ArrayIntersectorKStream<K,QuadMiIntersectorKMoeller<4 COMMA K COMMA true>>; + }; + + template<bool filter> + struct Quad4vIntersectorStreamPluecker { + template<int K> using Type = ArrayIntersectorKStream<K,QuadMvIntersectorKPluecker<4 COMMA K COMMA true>>; + }; + + template<bool filter> + struct Quad4iIntersectorStreamPluecker { + template<int K> using Type = ArrayIntersectorKStream<K,QuadMiIntersectorKPluecker<4 COMMA K COMMA true>>; + }; + + struct ObjectIntersectorStream { + template<int K> using Type = ArrayIntersectorKStream<K,ObjectIntersectorK<K COMMA false>>; + }; + + struct InstanceIntersectorStream { + template<int K> using Type = ArrayIntersectorKStream<K,InstanceIntersectorK<K>>; + }; + + // ===================================================================================================== + // ===================================================================================================== + // 
===================================================================================================== + + template<int N> + void BVHNIntersectorStreamPacketFallback<N>::intersect(Accel::Intersectors* __restrict__ This, + RayHitN** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + if (unlikely(context->isCoherent())) + intersectK(This, (RayHitK<VSIZEL>**)inputRays, numTotalRays, context); + else + intersectK(This, (RayHitK<VSIZEX>**)inputRays, numTotalRays, context); + } + + template<int N> + void BVHNIntersectorStreamPacketFallback<N>::occluded(Accel::Intersectors* __restrict__ This, + RayN** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + if (unlikely(context->isCoherent())) + occludedK(This, (RayK<VSIZEL>**)inputRays, numTotalRays, context); + else + occludedK(This, (RayK<VSIZEX>**)inputRays, numTotalRays, context); + } + + template<int N> + template<int K> + __noinline void BVHNIntersectorStreamPacketFallback<N>::intersectK(Accel::Intersectors* __restrict__ This, + RayHitK<K>** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + /* fallback to packets */ + for (size_t i = 0; i < numTotalRays; i += K) + { + const vint<K> vi = vint<K>(int(i)) + vint<K>(step); + vbool<K> valid = vi < vint<K>(int(numTotalRays)); + RayHitK<K>& ray = *(inputRays[i / K]); + valid &= ray.tnear() <= ray.tfar; + This->intersect(valid, ray, context); + } + } + + template<int N> + template<int K> + __noinline void BVHNIntersectorStreamPacketFallback<N>::occludedK(Accel::Intersectors* __restrict__ This, + RayK<K>** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + /* fallback to packets */ + for (size_t i = 0; i < numTotalRays; i += K) + { + const vint<K> vi = vint<K>(int(i)) + vint<K>(step); + vbool<K> valid = vi < vint<K>(int(numTotalRays)); + RayK<K>& ray = *(inputRays[i / K]); + valid &= ray.tnear() <= ray.tfar; + This->occluded(valid, ray, context); + } + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp new file mode 100644 index 0000000000..c3e5f137b8 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp @@ -0,0 +1,36 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream.cpp" + +namespace embree +{ + namespace isa + { + + //////////////////////////////////////////////////////////////////////////////// + /// General BVHIntersectorStreamPacketFallback Intersector + //////////////////////////////////////////////////////////////////////////////// + + DEFINE_INTERSECTORN(BVH4IntersectorStreamPacketFallback,BVHNIntersectorStreamPacketFallback<4>); + + //////////////////////////////////////////////////////////////////////////////// + /// BVH4IntersectorStream Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4iIntersectorStreamMoeller<true>>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4vIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Triangle4vIntersectorStreamPluecker<true>>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Triangle4iIntersectorStreamPluecker<true>>)); + 
IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller<true>>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoellerNoFilter, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller<false>>)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller<true>>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoellerNoFilter,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller<false>>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4iIntersectorStreamMoeller<true>>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Quad4vIntersectorStreamPluecker<true>>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Quad4iIntersectorStreamPluecker<true>>)); + + IF_ENABLED_USER(DEFINE_INTERSECTORN(BVH4VirtualIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA ObjectIntersectorStream>)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTORN(BVH4InstanceIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA InstanceIntersectorStream>)); + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp new file mode 100644 index 0000000000..b858eb163f --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp @@ -0,0 +1,657 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream_filters.h" +#include "bvh_intersector_stream.h" + +namespace embree +{ + namespace isa + { + template<int K, bool intersect> + __noinline void RayStreamFilter::filterAOS(Scene* scene, void* _rayN, size_t N, size_t stride, IntersectContext* context) + { + RayStreamAOS rayN(_rayN); + + /* use fast path for coherent ray mode */ + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) + { + const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + + /* convert from AOS to SOA */ + for (size_t j = 0; j < size; j += K) + { + const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); + const vbool<K> valid = vij < vint<K>(int(N)); + const vint<K> offset = vij * int(stride); + const size_t packetIndex = j / K; + + RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + + rays[packetIndex] = ray; + rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN + } + + /* trace stream */ + scene->intersectors.intersectN(rayPtrs, size, context); + + /* convert from SOA to AOS */ + for (size_t j = 0; j < size; j += K) + { + const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); + const vbool<K> valid = vij < vint<K>(int(N)); + const vint<K> offset = vij * int(stride); + const size_t packetIndex = j / K; + rayN.setHitByOffset(valid, offset, 
rays[packetIndex]); + } + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N;) + { + const Ray& ray = rayN.getRayByOffset(inputRayID * stride); + + /* skip invalid rays */ + if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + + const unsigned int octantID = movemask(vfloat4(Vec3fa(ray.dir)) < 0.0f) & 0x7; + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayIDs = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint<K> vi = vint<K>(int(j)) + vint<K>(step); + const vbool<K> valid = vi < vint<K>(int(numOctantRays)); + const vint<K> offset = *(vint<K>*)&rayIDs[j] * int(stride); + RayK<K>& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByOffset<K>(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint<K> vi = vint<K>(int(j)) + vint<K>(step); + const vbool<K> valid = vi < vint<K>(int(numOctantRays)); + const vint<K> offset = *(vint<K>*)&rayIDs[j] * int(stride); + rayN.setHitByOffset<K>(valid, offset, rays[j/K]); + } + + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < N; i += K) + { + const vint<K> vi = vint<K>(int(i)) + vint<K>(step); + vbool<K> valid = vi < vint<K>(int(N)); + const vint<K> offset = vi * int(stride); + + RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByOffset<K>(valid, offset, ray); + } + } + } + + template<int K, bool intersect> + __noinline void RayStreamFilter::filterAOP(Scene* scene, void** _rayN, size_t N, IntersectContext* context) + { + RayStreamAOP rayN(_rayN); + + /* use fast path for coherent ray mode */ + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) + { + const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + + /* convert from AOP to SOA */ + for (size_t j = 0; j < size; j += K) + { + const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); + const vbool<K> valid = vij < vint<K>(int(N)); + 
const size_t packetIndex = j / K; + + RayTypeK<K, intersect> ray = rayN.getRayByIndex<K>(valid, vij); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + + rays[packetIndex] = ray; + rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN + } + + /* trace stream */ + scene->intersectors.intersectN(rayPtrs, size, context); + + /* convert from SOA to AOP */ + for (size_t j = 0; j < size; j += K) + { + const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); + const vbool<K> valid = vij < vint<K>(int(N)); + const size_t packetIndex = j / K; + + rayN.setHitByIndex<K>(valid, vij, rays[packetIndex]); + } + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N;) + { + const Ray& ray = rayN.getRayByIndex(inputRayID); + + /* skip invalid rays */ + if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + + const unsigned int octantID = movemask(lt_mask(ray.dir,Vec3fa(0.0f))); + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? 
*/ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayIDs = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint<K> vi = vint<K>(int(j)) + vint<K>(step); + const vbool<K> valid = vi < vint<K>(int(numOctantRays)); + const vint<K> index = *(vint<K>*)&rayIDs[j]; + RayK<K>& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByIndex<K>(valid, index); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint<K> vi = vint<K>(int(j)) + vint<K>(step); + const vbool<K> valid = vi < vint<K>(int(numOctantRays)); + const vint<K> index = *(vint<K>*)&rayIDs[j]; + rayN.setHitByIndex<K>(valid, index, rays[j/K]); + } + + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < N; i += K) + { + const vint<K> vi = vint<K>(int(i)) + vint<K>(step); + vbool<K> valid = vi < vint<K>(int(N)); + + RayTypeK<K, intersect> ray = rayN.getRayByIndex<K>(valid, vi); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByIndex<K>(valid, vi, ray); + } + } + } + + template<int K, bool intersect> + __noinline void RayStreamFilter::filterSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) + { + const size_t rayDataAlignment = (size_t)rayData % (K*sizeof(float)); + const size_t offsetAlignment = (size_t)stride % (K*sizeof(float)); + + /* fast path for packets with the correct width and data alignment */ + if (likely(N == K && + !rayDataAlignment && + !offsetAlignment)) + { + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + size_t packetIndex = 0; + for (size_t i = 0; i < numPackets; i++) + { + const size_t offset = i * stride; + RayTypeK<K, intersect>& ray = *(RayTypeK<K, intersect>*)(rayData + offset); + rayPtrs[packetIndex++] = &ray; + + /* trace as stream */ + if (unlikely(packetIndex == MAX_INTERNAL_STREAM_SIZE / K)) + { + const size_t size = packetIndex*K; + scene->intersectors.intersectN(rayPtrs, size, context); + packetIndex = 0; + } + } + + /* flush remaining packets */ + if (unlikely(packetIndex > 0)) + { + const size_t size = packetIndex*K; + scene->intersectors.intersectN(rayPtrs, size, context); + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + RayStreamSOA rayN(rayData, K); + + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N*numPackets;) + { + const size_t offset = (inputRayID / K) * stride + (inputRayID % K) * sizeof(float); + + /* skip invalid rays */ + if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays + #if defined(EMBREE_IGNORE_INVALID_RAYS) + __aligned(64) Ray ray = rayN.getRayByOffset(offset); + if (unlikely(!ray.valid())) { inputRayID++; continue; } + #endif + + const unsigned int octantID = 
(unsigned int)rayN.getOctantByOffset(offset); + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayOffsets = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint<K> vi = vint<K>(int(j)) + vint<K>(step); + const vbool<K> valid = vi < vint<K>(int(numOctantRays)); + const vint<K> offset = *(vint<K>*)&rayOffsets[j]; + RayK<K>& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByOffset<K>(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint<K> vi = vint<K>(int(j)) + vint<K>(step); + const vbool<K> valid = vi < vint<K>(int(numOctantRays)); + const vint<K> offset = *(vint<K>*)&rayOffsets[j]; + rayN.setHitByOffset(valid, offset, rays[j/K]); + } + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < numPackets; i++) + { + const size_t offset = i * stride; + RayTypeK<K, intersect>& ray = *(RayTypeK<K, intersect>*)(rayData + offset); + const vbool<K> valid = ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + } + } + } + else + { + /* fallback to packets for arbitrary packet size and alignment */ + for (size_t i = 0; i < numPackets; i++) + { + const size_t offsetN = i * stride; + RayStreamSOA rayN(rayData + offsetN, N); + + for (size_t j = 0; j < N; j += K) + { + const size_t offset = j * sizeof(float); + vbool<K> valid = (vint<K>(int(j)) + vint<K>(step)) < vint<K>(int(N)); + RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByOffset(valid, offset, ray); + } + } + } + } + + template<int K, bool intersect> + __noinline void RayStreamFilter::filterSOP(Scene* scene, const void* _rayN, size_t N, IntersectContext* context) + { + RayStreamSOP& rayN = *(RayStreamSOP*)_rayN; + + /* use fast path for coherent ray mode */ + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) + { + const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + + /* convert from SOP to SOA */ + for (size_t j = 0; j < size; j += K) + { + const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); + const vbool<K> valid = vij < vint<K>(int(N)); + const size_t offset = (i+j) * sizeof(float); + const size_t packetIndex = j / K; + + RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + + rays[packetIndex] = ray; + rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN + } + + /* trace stream */ + 
scene->intersectors.intersectN(rayPtrs, size, context); + + /* convert from SOA to SOP */ + for (size_t j = 0; j < size; j += K) + { + const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); + const vbool<K> valid = vij < vint<K>(int(N)); + const size_t offset = (i+j) * sizeof(float); + const size_t packetIndex = j / K; + + rayN.setHitByOffset(valid, offset, rays[packetIndex]); + } + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N;) + { + const size_t offset = inputRayID * sizeof(float); + /* skip invalid rays */ + if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) + __aligned(64) Ray ray = rayN.getRayByOffset(offset); + if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + + const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset); + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayOffsets = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint<K> vi = vint<K>(int(j)) + vint<K>(step); + const vbool<K> valid = vi < vint<K>(int(numOctantRays)); + const vint<K> offset = *(vint<K>*)&rayOffsets[j]; + RayK<K>& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByOffset<K>(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint<K> vi = vint<K>(int(j)) + vint<K>(step); + const vbool<K> valid = vi < vint<K>(int(numOctantRays)); + const vint<K> offset = *(vint<K>*)&rayOffsets[j]; + rayN.setHitByOffset(valid, offset, rays[j/K]); + } + + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < N; i += K) + { + const vint<K> vi = vint<K>(int(i)) + vint<K>(step); + vbool<K> valid = vi < vint<K>(int(N)); + const size_t offset = i * sizeof(float); + + RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByOffset(valid, offset, ray); + } + } + } + + + void RayStreamFilter::intersectAOS(Scene* scene, RTCRayHit* _rayN, size_t N, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOS<VSIZEL, true>(scene, _rayN, N, stride, context); + else + filterAOS<VSIZEX, true>(scene, _rayN, N, stride, context); + } + + void RayStreamFilter::occludedAOS(Scene* scene, RTCRay* _rayN, 
size_t N, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOS<VSIZEL, false>(scene, _rayN, N, stride, context); + else + filterAOS<VSIZEX, false>(scene, _rayN, N, stride, context); + } + + void RayStreamFilter::intersectAOP(Scene* scene, RTCRayHit** _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOP<VSIZEL, true>(scene, (void**)_rayN, N, context); + else + filterAOP<VSIZEX, true>(scene, (void**)_rayN, N, context); + } + + void RayStreamFilter::occludedAOP(Scene* scene, RTCRay** _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOP<VSIZEL, false>(scene, (void**)_rayN, N, context); + else + filterAOP<VSIZEX, false>(scene, (void**)_rayN, N, context); + } + + void RayStreamFilter::intersectSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOA<VSIZEL, true>(scene, rayData, N, numPackets, stride, context); + else + filterSOA<VSIZEX, true>(scene, rayData, N, numPackets, stride, context); + } + + void RayStreamFilter::occludedSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOA<VSIZEL, false>(scene, rayData, N, numPackets, stride, context); + else + filterSOA<VSIZEX, false>(scene, rayData, N, numPackets, stride, context); + } + + void RayStreamFilter::intersectSOP(Scene* scene, const RTCRayHitNp* _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOP<VSIZEL, true>(scene, _rayN, N, context); + else + filterSOP<VSIZEX, true>(scene, _rayN, N, context); + } + + void RayStreamFilter::occludedSOP(Scene* scene, const RTCRayNp* _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOP<VSIZEL, false>(scene, _rayN, N, context); + else + filterSOP<VSIZEX, false>(scene, _rayN, N, context); + } + + + RayStreamFilterFuncs rayStreamFilterFuncs() { + return RayStreamFilterFuncs(RayStreamFilter::intersectAOS, RayStreamFilter::intersectAOP, RayStreamFilter::intersectSOA, RayStreamFilter::intersectSOP, + RayStreamFilter::occludedAOS, RayStreamFilter::occludedAOP, RayStreamFilter::occludedSOA, RayStreamFilter::occludedSOP); + } + }; +}; diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h index 80a8ab2a56..2bf7e93587 100644 --- a/thirdparty/embree/kernels/config.h +++ b/thirdparty/embree/kernels/config.h @@ -16,7 +16,7 @@ /* #undef EMBREE_GEOMETRY_INSTANCE */ /* #undef EMBREE_GEOMETRY_GRID */ /* #undef EMBREE_GEOMETRY_POINT */ -/* #undef EMBREE_RAY_PACKETS */ +#define EMBREE_RAY_PACKETS /* #undef EMBREE_COMPACT_POLYS */ #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h index 10f315cee7..470e15f03e 100644 --- a/thirdparty/embree/kernels/hash.h +++ b/thirdparty/embree/kernels/hash.h @@ -2,4 +2,4 @@ // Copyright 2009-2020 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#define RTC_HASH "7c53133eb21424f7f0ae1e25bf357e358feaf6ab" +#define RTC_HASH "12b99393438a4cc9e478e33459eed78bec6233fd" |
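The config.h hunk above defines EMBREE_RAY_PACKETS, which is what exposes Embree's stream entry points (rtcIntersect1M, rtcOccluded1M and related calls) that the newly added BVHNIntersectorStream and RayStreamFilter code services. As a rough illustration of that call path only — assuming an already-built and committed RTCScene and a caller-filled array of RTCRay structs, neither of which is part of this diff — a batched occlusion query against the enabled stream API could look like:

// Illustrative sketch, not code added by this diff. "scene" and "rays" are assumed
// to be set up elsewhere by the engine-side caller.
#include <embree3/rtcore.h>
#include <vector>

void occlude_batch(RTCScene scene, std::vector<RTCRay>& rays)
{
    RTCIntersectContext ctx;
    rtcInitIntersectContext(&ctx);
    // Marking the batch as coherent steers the filter toward the frustum-based
    // coherent stream traversal (occludedCoherent) added above.
    ctx.flags = RTC_INTERSECT_CONTEXT_FLAG_COHERENT;

    // One call per batch of AOS rays; internally this is expected to go through the
    // RayStreamFilter::occludedAOS path from this diff, which converts AOS to SOA
    // packets and traverses them as a stream.
    rtcOccluded1M(scene, &ctx, rays.data(), (unsigned int)rays.size(), sizeof(RTCRay));

    // After the call, rays whose tfar has been set to -inf are occluded.
}

Without the coherent flag, the same occludedAOS filter instead falls back to the octant-sorting path shown earlier in this diff, bucketing rays by direction sign before tracing each bucket as a stream.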