320 files changed, 94546 insertions, 0 deletions
diff --git a/thirdparty/embree/common/algorithms/parallel_any_of.h b/thirdparty/embree/common/algorithms/parallel_any_of.h
new file mode 100644
index 0000000000..a64e4a1889
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_any_of.h
@@ -0,0 +1,55 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <functional>
+#include "parallel_reduce.h"
+
+namespace embree
+{
+  
+  template<typename Index, class UnaryPredicate>
+    __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
+  {
+    bool ret = false;
+    
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
+        if (context.is_group_execution_cancelled()) return;
+        for (size_t i = r.begin(); i != r.end(); ++i) {
+          if (pred(i)) {
+            ret = true;
+            context.cancel_group_execution();
+          }
+        }
+      });
+#else
+    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
+        if (tbb::task::self().is_cancelled()) return;
+        for (size_t i = r.begin(); i != r.end(); ++i) {
+          if (pred(i)) {
+            ret = true;
+            tbb::task::self().cancel_group_execution();
+          }
+        }
+      });
+#endif
+#else
+    ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
+        bool localret = false;
+        for (auto i=r.begin(); i<r.end(); ++i) {
+          localret |= pred(i);
+        }
+        return localret;
+      },
+      std::bit_or<bool>()
+      );
+#endif
+    
+    return ret;
+  }
+  
+} // end namespace
diff --git a/thirdparty/embree/common/algorithms/parallel_filter.h b/thirdparty/embree/common/algorithms/parallel_filter.h
new file mode 100644
index 0000000000..090ef164c2
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_filter.h
@@ -0,0 +1,93 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
+  {
+    Index j = first;
+    for (Index i=first; i<last; i++)
+      if (predicate(data[i]))
+        data[j++] = data[i];
+
+    return j;
+  }
+
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
+  {
+    /* sequential fallback */
+    if (end-begin <= minStepSize)
+      return sequential_filter(data,begin,end,predicate);
+
+    /* calculate number of tasks to use */
+    enum { MAX_TASKS = 64 };
+    const Index numThreads = TaskScheduler::threadCount();
+    const Index numBlocks  = (end-begin+minStepSize-1)/minStepSize;
+    const Index taskCount  = min(numThreads,numBlocks,(Index)MAX_TASKS);
+
+    /* filter blocks */
+    Index nused[MAX_TASKS];
+    Index nfree[MAX_TASKS];
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
+      const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
+      const Index i2 = sequential_filter(data,i0,i1,predicate);
+      nused[taskIndex] = i2-i0;
+      nfree[taskIndex] = i1-i2;
+    });
+
+    /* calculate offsets */
+    Index sused=0;
+    Index sfree=0;
+    Index pfree[MAX_TASKS];
+    for (Index i=0; i<taskCount; i++) 
+    {
+      sused+=nused[i];
+      Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
+    }
+
+    /* return if we did not filter out any element */
+    assert(sfree <= end-begin);
+    assert(sused <= end-begin);
+    if (sused == end-begin)
+      return end;
+
+    /* otherwise we have to copy misplaced elements around */
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      /* destination to write elements to */
+      Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
+      Index dst_end = min(dst+nfree[taskIndex],begin+sused);
+      if (dst_end <= dst) return;
+
+      /* range of misplaced elements to copy to destination */
+      Index r0 = pfree[taskIndex];
+      Index r1 = r0+dst_end-dst;
+
+      /* find range in misplaced elements in back to front order */
+      Index k0=0;
+      for (Index i=taskCount-1; i>0; i--)
+      {
+        if (k0 > r1) break;
+        Index k1 = k0+nused[i];
+        Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
+        for (Index i=max(r0,k0); i<min(r1,k1); i++) {
+          Index isrc = src-i+k0-1;
+          assert(dst >= begin && dst < end);
+          assert(isrc >= begin && isrc < end);
+          data[dst++] = data[isrc];
+        }
+        k0 = k1;
+      }
+    });
+
+    return begin+sused;
+  }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
new file mode 100644
index 0000000000..645681ac63
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for.h
@@ -0,0 +1,186 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../tasking/taskscheduler.h"
+#include "../sys/array.h"
+#include "../math/math.h"
+#include "../math/range.h"
+
+namespace embree
+{
+  /* parallel_for without range */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index N, const Func& func)
+  {
+#if defined(TASKING_INTERNAL)
+    if (N) {
+      TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
+          assert(r.size() == 1);
+          func(r.begin());
+        });
+      if (!TaskScheduler::wait())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    }
+    
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      },context);
+    if (context.is_group_execution_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+  #else
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      });
+    if (tbb::task::self().is_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { 
+        func(i);
+      });
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+  
+  /* parallel for with range and granulatity */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
+  {
+    assert(first <= last);
+#if defined(TASKING_INTERNAL)
+    TaskScheduler::spawn(first,last,minStepSize,func);
+    if (!TaskScheduler::wait())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      },context);
+    if (context.is_group_execution_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+  #else
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      });
+    if (tbb::task::self().is_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { 
+        func(range<Index>(i,i+1)); 
+      });
+
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+  
+  /* parallel for with range */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Func& func)
+  {
+    assert(first <= last);
+    parallel_for(first,last,(Index)1,func);
+  }
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner(),context);
+      if (context.is_group_execution_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner());
+      if (tbb::task::self().is_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    #endif
+  }
+
+  typedef tbb::affinity_partitioner affinity_partitioner;
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap,context);
+      if (context.is_group_execution_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap);
+      if (tbb::task::self().is_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
+    #endif
+  }
+
+#else
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func) 
+  {
+    parallel_for(N,func);
+  }
+
+  struct affinity_partitioner {
+  };
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) 
+  {
+    parallel_for(N,func);
+  }
+
+#endif
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_for_for.h b/thirdparty/embree/common/algorithms/parallel_for_for.h
new file mode 100644
index 0000000000..92c37a4a38
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for_for.h
@@ -0,0 +1,149 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename ArrayArray, typename Func>
+    __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) 
+  {
+    size_t k=0;
+    for (size_t i=0; i!=array2.size(); ++i) {
+      const size_t N = array2[i]->size();
+      if (N) func(array2[i],range<size_t>(0,N),k);
+      k+=N;
+    }
+  }
+
+  class ParallelForForState
+  {
+  public:
+
+    enum { MAX_TASKS = 64 };
+
+    __forceinline ParallelForForState () 
+      : taskCount(0) {}
+
+    template<typename ArrayArray>
+      __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
+      init(array2,minStepSize);
+    } 
+
+    template<typename ArrayArray>
+      __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
+    {
+      /* first calculate total number of elements */
+      size_t N = 0;
+      for (size_t i=0; i<array2.size(); i++) {
+	N += array2[i] ? array2[i]->size() : 0;
+      }
+      this->N = N;
+
+      /* calculate number of tasks to use */
+      const size_t numThreads = TaskScheduler::threadCount();
+      const size_t numBlocks  = (N+minStepSize-1)/minStepSize;
+      taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
+      
+      /* calculate start (i,j) for each task */
+      size_t taskIndex = 0;
+      i0[taskIndex] = 0;
+      j0[taskIndex] = 0;
+      size_t k0 = (++taskIndex)*N/taskCount;
+      for (size_t i=0, k=0; taskIndex < taskCount; i++) 
+      {
+	assert(i<array2.size());
+	size_t j=0, M = array2[i] ? array2[i]->size() : 0;
+	while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
+	  assert(taskIndex<taskCount);
+	  i0[taskIndex] = i;
+	  j0[taskIndex] = j += k0-k;
+	  k=k0;
+	  k0 = (++taskIndex)*N/taskCount;
+	}
+	k+=M-j;
+      }
+    }
+
+    __forceinline size_t size() const {
+      return N;
+    }
+    
+  public:
+    size_t i0[MAX_TASKS];
+    size_t j0[MAX_TASKS];
+    size_t taskCount;
+    size_t N;
+  };
+
+  template<typename ArrayArray, typename Func>
+    __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
+  {
+    ParallelForForState state(array2,minStepSize);
+    
+    parallel_for(state.taskCount, [&](const size_t taskIndex) 
+    {
+      /* calculate range */
+      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      for (size_t i=i0; k<k1; i++) {
+        const size_t N =  array2[i] ? array2[i]->size() : 0;
+        const size_t r0 = j0, r1 = min(N,r0+k1-k);
+        if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
+        k+=r1-r0; j0 = 0;
+      }
+    });
+  }
+
+  template<typename ArrayArray, typename Func>
+    __forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
+  {
+    parallel_for_for(array2,1,func);
+  }
+
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    ParallelForForState state(array2,minStepSize);
+    Value temp[ParallelForForState::MAX_TASKS];
+
+    for (size_t i=0; i<state.taskCount; i++)
+      temp[i] = identity;
+    
+    parallel_for(state.taskCount, [&](const size_t taskIndex) 
+    {
+      /* calculate range */
+      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      for (size_t i=i0; k<k1; i++) {
+        const size_t N =  array2[i] ? array2[i]->size() : 0;
+        const size_t r0 = j0, r1 = min(N,r0+k1-k);
+        if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
+        k+=r1-r0; j0 = 0;
+      }
+    });
+
+    Value ret = identity;
+    for (size_t i=0; i<state.taskCount; i++)
+      ret = reduction(ret,temp[i]);
+    return ret;
+  }
+
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_reduce(array2,1,identity,func,reduction);
+  }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
new file mode 100644
index 0000000000..b15b44a991
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for_for.h"
+#include "parallel_prefix_sum.h"
+
+namespace embree
+{
+  template<typename Value>
+    struct ParallelForForPrefixSumState : public ParallelForForState
+  {
+    __forceinline ParallelForForPrefixSumState () {}
+
+    template<typename ArrayArray>
+      __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
+      : ParallelForForState(array2,minStepSize) {}
+
+    ParallelPrefixSumState<Value> prefix_state;
+  };
+  
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, 
+                                                      const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* calculate number of tasks to use */
+    const size_t taskCount = state.taskCount;
+    /* perform parallel prefix sum */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      const size_t k0 = (taskIndex+0)*state.size()/taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      Value N=identity;
+      for (size_t i=i0; k<k1; i++) {
+	const size_t size = array2[i] ? array2[i]->size() : 0;
+        const size_t r0 = j0, r1 = min(size,r0+k1-k);
+        if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i));
+        k+=r1-r0; j0 = 0;
+      }
+      state.prefix_state.counts[taskIndex] = N;
+    });
+
+    /* calculate prefix sum */
+    Value sum=identity;
+    for (size_t i=0; i<taskCount; i++)
+    {
+      const Value c = state.prefix_state.counts[i];
+      state.prefix_state.sums[i] = sum;
+      sum=reduction(sum,c);
+    }
+
+    return sum;
+  }
+
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, 
+                                                      const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* calculate number of tasks to use */
+    const size_t taskCount = state.taskCount;
+    /* perform parallel prefix sum */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      const size_t k0 = (taskIndex+0)*state.size()/taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      Value N=identity;
+      for (size_t i=i0; k<k1; i++) {
+	const size_t size = array2[i] ? array2[i]->size() : 0;
+        const size_t r0 = j0, r1 = min(size,r0+k1-k);
+        if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
+        k+=r1-r0; j0 = 0;
+      }
+      state.prefix_state.counts[taskIndex] = N;
+    });
+
+    /* calculate prefix sum */
+    Value sum=identity;
+    for (size_t i=0; i<taskCount; i++)
+    {
+      const Value c = state.prefix_state.counts[i];
+      state.prefix_state.sums[i] = sum;
+      sum=reduction(sum,c);
+    }
+
+    return sum;
+  }
+
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, 
+						     const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
+  }
+
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, 
+						     const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
+  }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_map.h b/thirdparty/embree/common/algorithms/parallel_map.h
new file mode 100644
index 0000000000..15c098fe20
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_map.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+  /*! implementation of a key/value map with parallel construction */
+  template<typename Key, typename Val>
+  class parallel_map
+  {
+    /* key/value pair to build the map */
+    struct KeyValue
+    {
+      __forceinline KeyValue () {}
+
+      __forceinline KeyValue (const Key key, const Val val)
+	: key(key), val(val) {}
+
+      __forceinline operator Key() const {
+	return key;
+      }
+
+    public:
+      Key key;
+      Val val;
+    };
+
+  public:
+    
+    /*! parallel map constructors */
+    parallel_map () {}
+
+    /*! construction from pair of vectors */
+    template<typename KeyVector, typename ValVector>
+      parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
+
+    /*! initialized the parallel map from a vector with keys and values */
+    template<typename KeyVector, typename ValVector>
+      void init(const KeyVector& keys, const ValVector& values) 
+    {
+      /* reserve sufficient space for all data */
+      assert(keys.size() == values.size());
+      vec.resize(keys.size());
+      
+      /* generate key/value pairs */
+      parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
+	for (size_t i=r.begin(); i<r.end(); i++)
+	  vec[i] = KeyValue((Key)keys[i],values[i]);
+      });
+
+      /* perform parallel radix sort of the key/value pairs */
+      std::vector<KeyValue> temp(keys.size());
+      radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
+    }
+
+    /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */
+    __forceinline const Val* lookup(const Key& key) const 
+    {
+      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+      if (i == vec.end()) return nullptr;
+      if (i->key != key) return nullptr;
+      return &i->val;
+    }
+
+    /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
+    __forceinline Val lookup(const Key& key, const Val& def) const 
+    {
+      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+      if (i == vec.end()) return def;
+      if (i->key != key) return def;
+      return i->val;
+    }
+
+    /*! clears all state */
+    void clear() {
+      vec.clear();
+    }
+
+  private:
+    std::vector<KeyValue> vec;    //!< vector containing sorted elements
+  };
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_partition.h b/thirdparty/embree/common/algorithms/parallel_partition.h
new file mode 100644
index 0000000000..a1cbdc8e04
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_partition.h
@@ -0,0 +1,283 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+#include "../math/range.h"
+
+namespace embree
+{
+  /* serial partitioning */
+  template<typename T, typename V, typename IsLeft, typename Reduction_T>
+    __forceinline size_t serial_partitioning(T* array, 
+                                             const size_t begin,
+                                             const size_t end, 
+                                             V& leftReduction,
+                                             V& rightReduction,
+                                             const IsLeft& is_left, 
+                                             const Reduction_T& reduction_t)
+  {
+    T* l = array + begin;
+    T* r = array + end - 1;
+    
+    while(1)
+    {
+      /* *l < pivot */
+      while (likely(l <= r && is_left(*l) )) 
+      {
+        //prefetchw(l+4); // FIXME: enable?
+        reduction_t(leftReduction,*l);
+        ++l;
+      }
+      /* *r >= pivot) */
+      while (likely(l <= r && !is_left(*r)))
+      {
+        //prefetchw(r-4); FIXME: enable?
+        reduction_t(rightReduction,*r);
+        --r;
+      }
+      if (r<l) break;
+      
+      reduction_t(leftReduction ,*r);
+      reduction_t(rightReduction,*l);
+      xchg(*l,*r);
+      l++; r--;
+    }
+    
+    return l - array;        
+  }
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    class __aligned(64) parallel_partition_task
+  {
+    ALIGNED_CLASS_(64);
+  private:
+
+    static const size_t MAX_TASKS = 64;
+
+    T* array;
+    size_t N;
+    const IsLeft& is_left;
+    const Reduction_T& reduction_t;
+    const Reduction_V& reduction_v;
+    const Vi& identity;
+
+    size_t numTasks; 
+    __aligned(64) size_t counter_start[MAX_TASKS+1]; 
+    __aligned(64) size_t counter_left[MAX_TASKS+1];  
+    __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];  
+    __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; 
+    __aligned(64) V leftReductions[MAX_TASKS];           
+    __aligned(64) V rightReductions[MAX_TASKS];    
+
+  public:
+     
+    __forceinline parallel_partition_task(T* array, 
+                                          const size_t N, 
+                                          const Vi& identity, 
+                                          const IsLeft& is_left, 
+                                          const Reduction_T& reduction_t, 
+                                          const Reduction_V& reduction_v,
+                                          const size_t BLOCK_SIZE) 
+
+      : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
+      numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
+
+    __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
+    {
+      size_t i = 0;
+      while(index >= (size_t)r[i].size())
+      {
+        assert(i < numRanges);
+        index -= (size_t)r[i].size();
+        i++;
+      }	    
+      return &r[i];
+    }
+
+    __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
+                                                  const size_t numRightMisplacedRanges,
+                                                  const size_t startID,
+                                                  const size_t endID)
+    {
+      size_t leftLocalIndex  = startID;
+      size_t rightLocalIndex = startID;
+      const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
+      const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
+      
+      size_t l_left = l_range->size() - leftLocalIndex;
+      size_t r_left = r_range->size() - rightLocalIndex;
+      T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
+      T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
+      size_t size = endID - startID;
+      size_t items = min(size,min(l_left,r_left)); 
+     
+      while (size)
+      {
+        if (unlikely(l_left == 0))
+        {
+          l_range++;
+          l_left = l_range->size();
+          l = &array[l_range->begin()];
+          items = min(size,min(l_left,r_left));
+        }
+
+        if (unlikely(r_left == 0))
+        {		
+          r_range++;
+          r_left = r_range->size();
+          r = &array[r_range->begin()];          
+          items = min(size,min(l_left,r_left));
+        }
+
+        size   -= items;
+        l_left -= items;
+        r_left -= items;
+
+        while(items) {
+          items--;
+          xchg(*l++,*r++);
+        }
+      }
+    }
+
+    __forceinline size_t partition(V& leftReduction, V& rightReduction)
+    {
+      /* partition the individual ranges for each task */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*N/numTasks;
+          const size_t endID   = (taskID+1)*N/numTasks;
+          V local_left(identity);
+          V local_right(identity);
+          const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
+          counter_start[taskID] = startID;
+          counter_left [taskID] = mid-startID;
+          leftReductions[taskID]  = local_left;
+          rightReductions[taskID] = local_right;
+        });
+      counter_start[numTasks] = N;
+      counter_left[numTasks]  = 0;
+      
+      /* finalize the reductions */
+      for (size_t i=0; i<numTasks; i++) {
+        reduction_v(leftReduction,leftReductions[i]);
+        reduction_v(rightReduction,rightReductions[i]);
+      }
+
+      /* calculate mid point for partitioning */
+      size_t mid = counter_left[0];
+      for (size_t i=1; i<numTasks; i++)
+        mid += counter_left[i];
+      const range<ssize_t> globalLeft (0,mid);
+      const range<ssize_t> globalRight(mid,N);
+
+      /* calculate all left and right ranges that are on the wrong global side */
+      size_t numMisplacedRangesLeft  = 0;
+      size_t numMisplacedRangesRight = 0;
+      size_t numMisplacedItemsLeft   = 0;
+      size_t numMisplacedItemsRight  = 0;
+
+      for (size_t i=0; i<numTasks; i++)
+      {	    
+        const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
+        const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
+        const range<ssize_t> left_misplaced  = globalLeft. intersect(right_range);
+        const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
+
+        if (!left_misplaced.empty())  
+        {
+          numMisplacedItemsLeft += left_misplaced.size();
+          leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
+        }
+
+        if (!right_misplaced.empty()) 
+        {
+          numMisplacedItemsRight += right_misplaced.size();
+          rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
+        }
+      }
+      assert( numMisplacedItemsLeft == numMisplacedItemsRight );
+
+      /* if no items are misplaced we are done */
+      if (numMisplacedItemsLeft == 0)
+        return mid;
+
+      /* otherwise we copy the items to the right place in parallel */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
+          const size_t endID   = (taskID+1)*numMisplacedItemsLeft/numTasks;
+          swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);	                             
+        });
+
+      return mid;
+    }
+  };
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array, 
+                                            const size_t begin,
+                                            const size_t end, 
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left, 
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE = 128)
+  {
+    /* fall back to single threaded partitioning for small N */
+    if (unlikely(end-begin < BLOCK_SIZE))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* otherwise use parallel code */
+    else {
+      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+      return begin+p->partition(leftReduction,rightReduction);    
+    }
+  }
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array, 
+                                            const size_t begin,
+                                            const size_t end, 
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left, 
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE,
+                                            size_t PARALLEL_THRESHOLD)
+  {
+    /* fall back to single threaded partitioning for small N */
+    if (unlikely(end-begin < PARALLEL_THRESHOLD))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* otherwise use parallel code */
+    else {
+      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+      return begin+p->partition(leftReduction,rightReduction);    
+    }
+  }
+
+
+  template<typename T, typename IsLeft>
+    inline size_t parallel_partitioning(T* array, 
+                                        const size_t begin,
+                                        const size_t end, 
+                                        const IsLeft& is_left, 
+                                        size_t BLOCK_SIZE = 128)
+  {
+    size_t leftReduction = 0;
+    size_t rightReduction = 0;
+    return parallel_partitioning(
+      array,begin,end,0,leftReduction,rightReduction,is_left,
+      [] (size_t& t,const T& ref) {  },
+      [] (size_t& t0,size_t& t1) { },
+      BLOCK_SIZE);
+  }
+
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
new file mode 100644
index 0000000000..208bb4e480
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Value>
+    struct ParallelPrefixSumState 
+  {
+    enum { MAX_TASKS = 64 };
+    Value counts[MAX_TASKS];
+    Value sums  [MAX_TASKS];
+  };
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* calculate number of tasks to use */
+    const size_t numThreads = TaskScheduler::threadCount();
+    const size_t numBlocks  = (last-first+minStepSize-1)/minStepSize;
+    const size_t taskCount  = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
+
+    /* perform parallel prefix sum */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
+      const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
+      state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
+    });
+
+    /* calculate prefix sum */
+    Value sum=identity;
+    for (size_t i=0; i<taskCount; i++) 
+    {
+      const Value c = state.counts[i];
+      state.sums[i] = sum;
+      sum=reduction(sum,c);
+    }
+
+    return sum;
+  }
+
+  /*! parallel calculation of prefix sums */
+  template<typename SrcArray, typename DstArray, typename Value, typename Add>
+    __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) 
+  {
+    /* perform single threaded prefix operation for small N */
+    if (N < SINGLE_THREAD_THRESHOLD) 
+    {
+      Value sum=identity;
+      for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
+      return sum;
+    }
+    
+    /* perform parallel prefix operation for large N */
+    else 
+    {
+      ParallelPrefixSumState<Value> state;
+      
+      /* initial run just sets up start values for subtasks */
+      parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+          
+          Value s = identity;
+          for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
+          return s;
+          
+        }, add);
+      
+      /* final run calculates prefix sum */
+      return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+          
+          Value s = identity;
+          for (size_t i=r.begin(); i<r.end(); i++) {
+            dst[i] = add(sum,s);
+            s = add(s,src[i]);
+          }
+          return s;
+          
+        }, add);
+    }
+  }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
new file mode 100644
index 0000000000..8271372ea4
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
@@ -0,0 +1,150 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) 
+  {
+    return func(range<Index>(first,last));
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    return func(range<Index>(first,last));
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    const Index maxTasks = 512;
+    const Index threadCount = (Index) TaskScheduler::threadCount();
+    taskCount = min(taskCount,threadCount,maxTasks);
+
+    /* parallel invokation of all tasks */
+    dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
+    parallel_for(taskCount, [&](const Index taskIndex) {
+        const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
+        const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
+        values[taskIndex] = func(range<Index>(k0,k1));
+      });
+
+    /* perform reduction over all tasks */
+    Value v = identity;
+    for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
+    return v;
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+#if defined(TASKING_INTERNAL)
+
+    /* fast path for small number of iterations */
+    Index taskCount = (last-first+minStepSize-1)/minStepSize;
+    if (likely(taskCount == 1)) {
+      return func(range<Index>(first,last));
+    }
+    return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
+
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+      reduction,context);
+    // -- GODOT start --
+    // if (context.is_group_execution_cancelled())
+    //   throw std::runtime_error("task cancelled");
+    // -- GODOT end --
+    return v;
+  #else
+    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+      reduction);
+    // -- GODOT start --
+    // if (tbb::task::self().is_cancelled())
+    //   throw std::runtime_error("task cancelled");
+    // -- GODOT end --
+    return v;
+  #endif
+#else // TASKING_PPL
+    struct AlignedValue
+    {
+      char storage[__alignof(Value)+sizeof(Value)];
+      static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
+      Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+      const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+      AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
+      AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
+      AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
+      AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+      AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+      operator Value() const { return *getValuePtr(); }
+    };
+    
+    struct Iterator_Index
+    {
+      Index v;
+      typedef std::forward_iterator_tag iterator_category;
+      typedef AlignedValue value_type;
+      typedef Index difference_type;
+      typedef Index distance_type;
+      typedef AlignedValue* pointer;
+      typedef AlignedValue& reference;
+      __forceinline Iterator_Index() {}
+      __forceinline Iterator_Index(Index v) : v(v) {}
+      __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
+      __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
+      __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
+      __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
+    };
+    
+    auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
+      assert(begin.v < end.v);
+      return reduction(start, func(range<Index>(begin.v, end.v)));
+    };
+    const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
+    return v;
+#endif
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    if (likely(last-first < parallel_threshold)) {
+      return func(range<Index>(first,last)); 
+    } else {
+      return parallel_reduce(first,last,minStepSize,identity,func,reduction);
+    }
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) 
+  {
+    return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    auto funcr = [&] ( const range<Index> r ) {
+      Value v = identity;
+      for (Index i=r.begin(); i<r.end(); i++)
+        v = reduction(v,func(i));
+      return v;
+    };
+    return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
+  }
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
+  }
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_set.h b/thirdparty/embree/common/algorithms/parallel_set.h
new file mode 100644
index 0000000000..7eae577457
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_set.h
@@ -0,0 +1,52 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+  /* implementation of a set of values with parallel construction */
+  template<typename T>
+  class parallel_set
+  {
+  public:
+
+    /*! default constructor for the parallel set */
+    parallel_set () {}
+
+    /*! construction from vector */
+    template<typename Vector>
+      parallel_set (const Vector& in) { init(in); }
+
+    /*! initialized the parallel set from a vector */
+    template<typename Vector>
+      void init(const Vector& in) 
+    {
+      /* copy data to internal vector */
+      vec.resize(in.size());
+      parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) {
+	for (size_t i=r.begin(); i<r.end(); i++) 
+	  vec[i] = in[i];
+      });
+
+      /* sort the data */
+      std::vector<T> temp(in.size());
+      radix_sort<T>(vec.data(),temp.data(),vec.size());
+    }
+
+    /*! tests if some element is in the set */
+    __forceinline bool lookup(const T& elt) const {
+      return std::binary_search(vec.begin(), vec.end(), elt);
+    }
+
+    /*! clears all state */
+    void clear() {
+      vec.clear();
+    }
+
+  private:
+    std::vector<T> vec;   //!< vector containing sorted elements
+  };
+}
diff --git a/thirdparty/embree/common/algorithms/parallel_sort.h b/thirdparty/embree/common/algorithms/parallel_sort.h
new file mode 100644
index 0000000000..30e56c2bfc
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_sort.h
@@ -0,0 +1,454 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../simd/simd.h"
+#include "parallel_for.h"
+#include <algorithm>
+
+namespace embree
+{
+  template<class T>
+    __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
+  {
+    for(size_t i = 1;i<length;++i)
+    {
+      T v = array[i];
+      size_t j = i;
+      while(j > 0 && v < array[j-1])
+      {
+        array[j] = array[j-1];
+        --j;
+      }
+      array[j] = v;
+    }
+  }
+  
+  template<class T>
+    __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
+  {
+    for(size_t i = 1;i<length;++i)
+    {
+      T v = array[i];
+      size_t j = i;
+      while(j > 0 && v > array[j-1])
+      {
+        array[j] = array[j-1];
+        --j;
+      }
+      array[j] = v;
+    }
+  }
+  
+  template<class T> 
+    void quicksort_ascending(T *__restrict__ t, 
+			     const ssize_t begin, 
+			     const ssize_t end)
+  {
+    if (likely(begin < end)) 
+    {      
+      const T pivotvalue = t[begin];
+      ssize_t left  = begin - 1;
+      ssize_t right = end   + 1;
+      
+      while(1) 
+      {
+        while (t[--right] > pivotvalue);
+        while (t[++left] < pivotvalue);
+        
+        if (left >= right) break;
+        
+        const T temp = t[right];
+        t[right] = t[left];
+        t[left] = temp;
+      }
+      
+      const int pivot = right;
+      quicksort_ascending(t, begin, pivot);
+      quicksort_ascending(t, pivot + 1, end);
+    }
+  }
+  
+  template<class T> 
+    void quicksort_decending(T *__restrict__ t, 
+			     const ssize_t begin, 
+			     const ssize_t end)
+  {
+    if (likely(begin < end)) 
+    {
+      const T pivotvalue = t[begin];
+      ssize_t left  = begin - 1;
+      ssize_t right = end   + 1;
+      
+      while(1) 
+      {
+        while (t[--right] < pivotvalue);
+        while (t[++left] > pivotvalue);
+        
+        if (left >= right) break;
+        
+        const T temp = t[right];
+        t[right] = t[left];
+        t[left] = temp;
+      }
+      
+      const int pivot = right;
+      quicksort_decending(t, begin, pivot);
+      quicksort_decending(t, pivot + 1, end);
+    }
+  }
+  
+  
+  template<class T, ssize_t THRESHOLD> 
+    void quicksort_insertionsort_ascending(T *__restrict__ t, 
+					   const ssize_t begin, 
+					   const ssize_t end)
+  {
+    if (likely(begin < end)) 
+    {      
+      const ssize_t size = end-begin+1;
+      if (likely(size <= THRESHOLD))
+      {
+        insertionsort_ascending<T>(&t[begin],size);
+      }
+      else
+      {
+        const T pivotvalue = t[begin];
+        ssize_t left  = begin - 1;
+        ssize_t right = end   + 1;
+        
+        while(1) 
+        {
+          while (t[--right] > pivotvalue);
+          while (t[++left] < pivotvalue);
+          
+          if (left >= right) break;
+          
+          const T temp = t[right];
+          t[right] = t[left];
+          t[left] = temp;
+        }
+        
+        const ssize_t pivot = right;
+        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
+        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
+      }
+    }
+  }
+  
+  
+  template<class T, ssize_t THRESHOLD> 
+    void quicksort_insertionsort_decending(T *__restrict__ t, 
+					   const ssize_t begin, 
+					   const ssize_t end)
+  {
+    if (likely(begin < end)) 
+    {
+      const ssize_t size = end-begin+1;
+      if (likely(size <= THRESHOLD))
+      {
+        insertionsort_decending<T>(&t[begin],size);
+      }
+      else
+      {
+        
+        const T pivotvalue = t[begin];
+        ssize_t left  = begin - 1;
+        ssize_t right = end   + 1;
+        
+        while(1) 
+        {
+          while (t[--right] < pivotvalue);
+          while (t[++left] > pivotvalue);
+          
+          if (left >= right) break;
+          
+          const T temp = t[right];
+          t[right] = t[left];
+          t[left] = temp;
+        }
+        
+        const ssize_t pivot = right;
+        quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
+        quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
+      }
+    }
+  }
+  
+  template<typename T>
+    static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
+  {
+    static const unsigned int BITS = 8;
+    static const unsigned int BUCKETS = (1 << BITS);
+    static const unsigned int CMP_SORT_THRESHOLD = 16;
+    
+    __aligned(64) unsigned int count[BUCKETS];
+    
+    /* clear buckets */
+    for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
+    
+    /* count buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+    for (size_t i=0;i<num;i++)
+      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
+    
+    /* prefix sums */
+    __aligned(64) unsigned int head[BUCKETS];
+    __aligned(64) unsigned int tail[BUCKETS];
+    
+    head[0] = 0;
+    for (size_t i=1; i<BUCKETS; i++)    
+      head[i] = head[i-1] + count[i-1];
+    
+    for (size_t i=0; i<BUCKETS-1; i++)    
+      tail[i] = head[i+1];
+    
+    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
+    
+    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);      
+    assert(tail[BUCKETS-1] == num);      
+    
+    /* in-place swap */      
+    for (size_t i=0;i<BUCKETS;i++)
+    {
+      /* process bucket */
+      while(head[i] < tail[i])
+      {
+        T v = morton[head[i]];
+        while(1)
+        {
+          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
+          if (b == i) break;
+          std::swap(v,morton[head[b]++]);
+        }
+        assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
+        morton[head[i]++] = v;
+      }
+    }
+    if (shift == 0) return;
+    
+    size_t offset = 0;
+    for (size_t i=0;i<BUCKETS;i++)
+      if (count[i])
+      {
+        
+        for (size_t j=offset;j<offset+count[i]-1;j++)
+          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
+        
+        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
+          insertionsort_ascending(morton + offset, count[i]);
+        else
+          radixsort32(morton + offset, count[i], shift-BITS);
+        
+        for (size_t j=offset;j<offset+count[i]-1;j++)
+          assert(morton[j] <= morton[j+1]);
+        
+        offset += count[i];
+      }      
+  }    
+
+  template<typename Ty, typename Key>
+    class ParallelRadixSort
+  {
+    static const size_t MAX_TASKS = 64;
+    static const size_t BITS = 8;
+    static const size_t BUCKETS = (1 << BITS);
+    typedef unsigned int TyRadixCount[BUCKETS];
+    
+    template<typename T>
+      static bool compare(const T& v0, const T& v1) {
+      return (Key)v0 < (Key)v1;
+    }
+
+  private:
+    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
+    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
+
+    
+  public:
+    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
+      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}
+
+    void sort(const size_t blockSize)
+    {
+      assert(blockSize > 0);
+      
+      /* perform single threaded sort for small N */
+      if (N<=blockSize) // handles also special case of 0!
+      {	  
+        /* do inplace sort inside destination array */
+        std::sort(src,src+N,compare<Ty>);
+      }
+      
+      /* perform parallel sort for large N */
+      else 
+      {
+        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
+        tbbRadixSort(numThreads);
+      }
+    }
+
+    ~ParallelRadixSort()
+    {
+      alignedFree(radixCount); 
+      radixCount = nullptr;
+    }
+    
+  private:
+    
+    void tbbRadixIteration0(const Key shift, 
+                            const Ty* __restrict const src, 
+                            Ty* __restrict const dst, 
+                            const size_t threadIndex, const size_t threadCount)
+    {
+      const size_t startID = (threadIndex+0)*N/threadCount;
+      const size_t endID   = (threadIndex+1)*N/threadCount;
+      
+      /* mask to extract some number of bits */
+      const Key mask = BUCKETS-1;
+      
+      /* count how many items go into the buckets */
+      for (size_t i=0; i<BUCKETS; i++)
+        radixCount[threadIndex][i] = 0;
+
+      /* iterate over src array and count buckets */
+      unsigned int * __restrict const count = radixCount[threadIndex];
+#if defined(__INTEL_COMPILER)
+#pragma nounroll      
+#endif
+      for (size_t i=startID; i<endID; i++) {
+#if defined(__64BIT__)
+        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+        const Key index = ((Key)src[i] >> shift) & mask;
+#endif
+        count[index]++;
+      }
+    }
+    
+    void tbbRadixIteration1(const Key shift, 
+                            const Ty* __restrict const src, 
+                            Ty* __restrict const dst, 
+                            const size_t threadIndex, const size_t threadCount)
+    {
+      const size_t startID = (threadIndex+0)*N/threadCount;
+      const size_t endID   = (threadIndex+1)*N/threadCount;
+      
+      /* mask to extract some number of bits */
+      const Key mask = BUCKETS-1;
+      
+      /* calculate total number of items for each bucket */
+      __aligned(64) unsigned int total[BUCKETS];
+      /*
+      for (size_t i=0; i<BUCKETS; i++)
+        total[i] = 0;
+      */
+      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
+        vintx::store(&total[i], zero);
+      
+      for (size_t i=0; i<threadCount; i++)
+      {
+        /*
+        for (size_t j=0; j<BUCKETS; j++)
+          total[j] += radixCount[i][j];
+        */
+        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
+      }
+      
+      /* calculate start offset of each bucket */
+      __aligned(64) unsigned int offset[BUCKETS];
+      offset[0] = 0;
+      for (size_t i=1; i<BUCKETS; i++)    
+        offset[i] = offset[i-1] + total[i-1];
+      
+      /* calculate start offset of each bucket for this thread */
+      for (size_t i=0; i<threadIndex; i++)
+      {
+        /*
+        for (size_t j=0; j<BUCKETS; j++)
+          offset[j] += radixCount[i][j];
+        */
+        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
+      }
+      
+      /* copy items into their buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+      for (size_t i=startID; i<endID; i++) {
+        const Ty elt = src[i];
+#if defined(__64BIT__)
+        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+        const size_t index = ((Key)src[i] >> shift) & mask;
+#endif
+        dst[offset[index]++] = elt;
+      }
+    }
+    
+    void tbbRadixIteration(const Key shift, const bool last,
+                           const Ty* __restrict src, Ty* __restrict dst,
+                           const size_t numTasks)
+    {
+      affinity_partitioner ap;
+      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
+      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
+    }
+    
+    void tbbRadixSort(const size_t numTasks)
+    {
+      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
+      
+      if (sizeof(Key) == sizeof(uint32_t)) {
+        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
+      }
+      else if (sizeof(Key) == sizeof(uint64_t))
+      {
+        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
+      }
+    }
+    
+  private:
+    TyRadixCount* radixCount;
+    Ty* const src;
+    Ty* const tmp;
+    const size_t N;
+  };
+
+  template<typename Ty>
+    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+  {
+    ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
+  }
+  
+  template<typename Ty, typename Key>
+    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+  {
+    ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
+  }
+  
+  template<typename Ty>
+    void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
+  }
+  
+  template<typename Ty>
+    void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
+  }
+}
diff --git a/thirdparty/embree/common/lexers/parsestream.h b/thirdparty/embree/common/lexers/parsestream.h
new file mode 100644
index 0000000000..f65a52cb47
--- /dev/null
+++ b/thirdparty/embree/common/lexers/parsestream.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stringstream.h"
+#include "../sys/filename.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/col3.h"
+#include "../math/color.h"
+
+namespace embree
+{
+  /*! helper class for simple command line parsing */
+  class ParseStream : public Stream<std::string>
+  {
+  public:
+    ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {}
+
+    ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+                 const std::string& endl = "", bool multiLine = false)
+      : cin(new StringStream(cin,seps,endl,multiLine)) {}
+
+  public:
+    ParseLocation location() { return cin->loc(); }
+    std::string next() { return cin->get(); }
+
+    void force(const std::string& next) {
+      std::string token = getString();
+      if (token != next)
+        THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found");
+    }
+
+    std::string getString() {
+      return get();
+    }
+
+    FileName getFileName()  {
+      return FileName(get());
+    }
+
+    int   getInt  () {
+      return atoi(get().c_str());
+    }
+
+    Vec2i getVec2i() {
+      int x = atoi(get().c_str());
+      int y = atoi(get().c_str());
+      return Vec2i(x,y);
+    }
+
+    Vec3ia getVec3ia() {
+      int x = atoi(get().c_str());
+      int y = atoi(get().c_str());
+      int z = atoi(get().c_str());
+      return Vec3ia(x,y,z);
+    }
+
+    float getFloat() {
+      return (float)atof(get().c_str());
+    }
+
+    Vec2f getVec2f() {
+      float x = (float)atof(get().c_str());
+      float y = (float)atof(get().c_str());
+      return Vec2f(x,y);
+    }
+
+    Vec3f getVec3f() {
+      float x = (float)atof(get().c_str());
+      float y = (float)atof(get().c_str());
+      float z = (float)atof(get().c_str());
+      return Vec3f(x,y,z);
+    }
+
+    Vec3fa getVec3fa() {
+      float x = (float)atof(get().c_str());
+      float y = (float)atof(get().c_str());
+      float z = (float)atof(get().c_str());
+      return Vec3fa(x,y,z);
+    }
+
+    Col3f getCol3f() {
+      float x = (float)atof(get().c_str());
+      float y = (float)atof(get().c_str());
+      float z = (float)atof(get().c_str());
+      return Col3f(x,y,z);
+    }
+
+    Color getColor() {
+      float r = (float)atof(get().c_str());
+      float g = (float)atof(get().c_str());
+      float b = (float)atof(get().c_str());
+      return Color(r,g,b);
+    }
+
+  private:
+    Ref<Stream<std::string> > cin;
+  };
+}
diff --git a/thirdparty/embree/common/lexers/stream.h b/thirdparty/embree/common/lexers/stream.h
new file mode 100644
index 0000000000..a40c15f8eb
--- /dev/null
+++ b/thirdparty/embree/common/lexers/stream.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/ref.h"
+#include "../sys/filename.h"
+#include "../sys/string.h"
+
+#include <vector>
+#include <iostream>
+#include <cstdio>
+#include <string.h>
+
+namespace embree
+{
+  /*! stores the location of a stream element in the source */
+  class ParseLocation
+  {
+  public:
+    ParseLocation () : lineNumber(-1), colNumber(-1) {}
+    ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/)
+      : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {}
+
+    std::string str() const
+    {
+      std::string str = "unknown";
+      if (fileName) str = *fileName;
+      if (lineNumber >= 0) str += " line " + toString(lineNumber);
+      if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber);
+      return str;
+    }
+
+  private:
+    std::shared_ptr<std::string> fileName;         /// name of the file (or stream) the token is from
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+  };
+
+  /*! a stream class templated over the stream elements */
+  template<typename T> class Stream : public RefCount
+  {
+    enum { BUF_SIZE = 1024 };
+    
+  private:
+    virtual T next() = 0;
+    virtual ParseLocation location() = 0;
+    __forceinline std::pair<T,ParseLocation> nextHelper() {
+      ParseLocation l = location();
+      T v = next();
+      return std::pair<T,ParseLocation>(v,l);
+    }
+    __forceinline void push_back(const std::pair<T,ParseLocation>& v) {
+      if (past+future == BUF_SIZE) pop_front();
+      size_t end = (start+past+future++)%BUF_SIZE;
+      buffer[end] = v;
+    }
+    __forceinline void pop_front() {
+      if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty");
+      start = (start+1)%BUF_SIZE; past--;
+    }
+  public:
+    Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {}
+    virtual ~Stream() {}
+    
+  public:
+    
+    const ParseLocation& loc() {
+      if (future == 0) push_back(nextHelper());
+      return buffer[(start+past)%BUF_SIZE].second;
+    }
+    T get() {
+      if (future == 0) push_back(nextHelper());
+      T t = buffer[(start+past)%BUF_SIZE].first;
+      past++; future--;
+      return t;
+    }
+    const T& peek() {
+      if (future == 0) push_back(nextHelper());
+      return buffer[(start+past)%BUF_SIZE].first;
+    }
+    const T& unget(size_t n = 1) {
+      if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items");
+      past -= n; future += n;
+      return peek();
+    }
+    void drop() {
+      if (future == 0) push_back(nextHelper());
+      past++; future--;
+    }
+  private:
+    size_t start,past,future;
+    std::vector<std::pair<T,ParseLocation> > buffer;
+  };
+  
+  /*! warps an iostream stream */
+  class StdStream : public Stream<int>
+  {
+  public:
+    StdStream (std::istream& cin, const std::string& name = "std::stream")
+      : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
+    ~StdStream() {}
+    ParseLocation location() {
+      return ParseLocation(name,lineNumber,colNumber,charNumber);
+    }
+    int next() {
+      int c = cin.get();
+      if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+      charNumber++;
+      return c;
+    }
+  private:
+    std::istream& cin;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+
+  /*! creates a stream from a file */
+  class FileStream : public Stream<int>
+  {
+  public:
+
+    FileStream (FILE* file, const std::string& name = "file")
+      : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
+
+    FileStream (const FileName& fileName)
+      : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str())))
+    {
+      file = fopen(fileName.c_str(),"r");
+      if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str());
+    }
+    ~FileStream() { if (file) fclose(file); }
+
+  public:
+    ParseLocation location() {
+      return ParseLocation(name,lineNumber,colNumber,charNumber);
+    }
+
+    int next() {
+      int c = fgetc(file);
+      if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+      charNumber++;
+      return c;
+    }
+
+  private:
+    FILE* file;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+
+  /*! creates a stream from a string */
+  class StrStream : public Stream<int>
+  {
+  public:
+
+    StrStream (const char* str)
+      : str(str), lineNumber(1), colNumber(0), charNumber(0) {}
+
+  public:
+    ParseLocation location() {
+      return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber);
+    }
+
+    int next() {
+      int c = str[charNumber];
+      if (c == 0) return EOF;
+      if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+      charNumber++;
+      return c;
+    }
+
+  private:
+    const char* str;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+  };
+
+  /*! creates a character stream from a command line */
+  class CommandLineStream : public Stream<int>
+  {
+  public:
+    CommandLineStream (int argc, char** argv, const std::string& name = "command line")
+      : i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name)))
+    {
+      if (argc > 0) {
+	for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++;
+	charNumber++;
+      }
+      for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]);
+    }
+    ~CommandLineStream() {}
+  public:
+    ParseLocation location() {
+      return ParseLocation(name,0,charNumber,charNumber);
+    }
+    int next() {
+      if (i == args.size()) return EOF;
+      if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; }
+      charNumber++;
+      return args[i][j++];
+    }
+  private:
+    size_t i,j;
+    std::vector<std::string> args;
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+}
diff --git a/thirdparty/embree/common/lexers/streamfilters.h b/thirdparty/embree/common/lexers/streamfilters.h
new file mode 100644
index 0000000000..3592b77b03
--- /dev/null
+++ b/thirdparty/embree/common/lexers/streamfilters.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+  /* removes all line comments from a stream */
+  class LineCommentFilter : public Stream<int>
+  {
+  public:
+    LineCommentFilter (const FileName& fileName, const std::string& lineComment)
+      : cin(new FileStream(fileName)), lineComment(lineComment) {}
+    LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment)
+      : cin(cin), lineComment(lineComment) {}
+
+    ParseLocation location() { return cin->loc(); }
+
+    int next()
+    {
+      /* look if the line comment starts here */
+      for (size_t j=0; j<lineComment.size(); j++) {
+        if (cin->peek() != lineComment[j]) { cin->unget(j); goto not_found; }
+        cin->get();
+      }
+      /* eat all characters until the end of the line (or file) */
+      while (cin->peek() != '\n' && cin->peek() != EOF) cin->get();
+
+    not_found:
+      return cin->get();
+    }
+
+  private:
+    Ref<Stream<int> > cin;
+    std::string lineComment;
+  };
+}
diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp
new file mode 100644
index 0000000000..a037869506
--- /dev/null
+++ b/thirdparty/embree/common/lexers/stringstream.cpp
@@ -0,0 +1,51 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stringstream.h"
+
+namespace embree
+{
+  static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
+  
+  /* creates map for fast categorization of characters */
+  static void createCharMap(bool map[256], const std::string& chrs) {
+    for (size_t i=0; i<256; i++) map[i] = false;
+    for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true;
+  }
+
+  /* simple tokenizer */
+  StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine)
+    : cin(cin), endl(endl), multiLine(multiLine)
+  {
+    createCharMap(isSepMap,seps);
+    createCharMap(isValidCharMap,stringChars);
+  }
+
+  std::string StringStream::next()
+  {
+    /* skip separators */
+    while (cin->peek() != EOF) {
+      if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; }
+      if (multiLine && cin->peek() == '\\') {
+        cin->drop();
+        if (cin->peek() == '\n') { cin->drop(); continue; }
+        cin->unget();
+      }
+      if (!isSeparator(cin->peek())) break;
+      cin->drop();
+    }
+
+    /* parse everything until the next separator */
+    std::vector<char> str; str.reserve(64);
+    while (cin->peek() != EOF && !isSeparator(cin->peek())) {
+      int c = cin->get();
+      // -- GODOT start --
+      // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+      if (!isValidChar(c)) abort();
+      // -- GODOT end --
+      str.push_back((char)c);
+    }
+    str.push_back(0);
+    return std::string(str.data());
+  }
+}
diff --git a/thirdparty/embree/common/lexers/stringstream.h b/thirdparty/embree/common/lexers/stringstream.h
new file mode 100644
index 0000000000..6d9c27e3cd
--- /dev/null
+++ b/thirdparty/embree/common/lexers/stringstream.h
@@ -0,0 +1,29 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+  /*! simple tokenizer that produces a string stream */
+  class StringStream : public Stream<std::string>
+  {
+  public:
+    StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+                 const std::string& endl = "", bool multiLine = false);
+  public:
+    ParseLocation location() { return cin->loc(); }
+    std::string next();
+  private:
+    __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
+    __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; }
+  private:
+    Ref<Stream<int> > cin; /*! source character stream */
+    bool isSepMap[256];    /*! map for fast classification of separators */
+    bool isValidCharMap[256];  /*! map for valid characters */
+    std::string endl;      /*! the token of the end of line */
+    bool multiLine;        /*! whether to parse lines wrapped with \ */
+  };
+}
diff --git a/thirdparty/embree/common/lexers/tokenstream.cpp b/thirdparty/embree/common/lexers/tokenstream.cpp
new file mode 100644
index 0000000000..6ed6f2045a
--- /dev/null
+++ b/thirdparty/embree/common/lexers/tokenstream.cpp
@@ -0,0 +1,181 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tokenstream.h"
+#include "../math/math.h"
+
+namespace embree
+{
+  /* shorthands for common sets of characters */
+  const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz";
+  const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+  const std::string TokenStream::numbers = "0123456789";
+  const std::string TokenStream::separators = "\n\t\r ";
+  const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
+
+  /* creates map for fast categorization of characters */
+  static void createCharMap(bool map[256], const std::string& chrs) {
+    for (size_t i=0; i<256; i++) map[i] = false;
+    for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true;
+  }
+
+  /* build full tokenizer that takes list of valid characters and keywords */
+  TokenStream::TokenStream(const Ref<Stream<int> >& cin,            //< stream to read from
+                                   const std::string& alpha,                //< valid characters for identifiers
+                                   const std::string& seps,                 //< characters that act as separators
+                                   const std::vector<std::string>& symbols) //< symbols
+    : cin(cin), symbols(symbols)
+  {
+    createCharMap(isAlphaMap,alpha);
+    createCharMap(isSepMap,seps);
+    createCharMap(isStringCharMap,stringChars);
+  }
+
+  bool TokenStream::decDigits(std::string& str_o)
+  {
+    bool ok = false;
+    std::string str;
+    if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get();
+    while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); }
+    if (ok) str_o += str;
+    else cin->unget(str.size());
+    return ok;
+  }
+
+  bool TokenStream::decDigits1(std::string& str_o)
+  {
+    bool ok = false;
+    std::string str;
+    while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); }
+    if (ok) str_o += str; else cin->unget(str.size());
+    return ok;
+  }
+
+  bool TokenStream::trySymbol(const std::string& symbol)
+  {
+    size_t pos = 0;
+    while (pos < symbol.size()) {
+      if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; }
+      cin->drop(); pos++;
+    }
+    return true;
+  }
+
+  bool TokenStream::trySymbols(Token& token, const ParseLocation& loc)
+  {
+    for (size_t i=0; i<symbols.size(); i++) {
+      if (!trySymbol(symbols[i])) continue;
+      token = Token(symbols[i],Token::TY_SYMBOL,loc);
+      return true;
+    }
+    return false;
+  }
+
+  bool TokenStream::tryFloat(Token& token, const ParseLocation& loc)
+  {
+    bool ok = false;
+    std::string str;
+    if (trySymbol("nan")) {
+      token = Token(float(nan));
+      return true;
+    }
+    if (trySymbol("+inf")) {
+      token = Token(float(pos_inf));
+      return true;
+    }
+    if (trySymbol("-inf")) {
+      token = Token(float(neg_inf));
+      return true;
+    }
+
+    if (decDigits(str))
+    {
+      if (cin->peek() == '.') {
+        str += (char)cin->get();
+        decDigits(str);
+        if (cin->peek() == 'e' || cin->peek() == 'E') {
+          str += (char)cin->get();
+          if (decDigits(str)) ok = true; // 1.[2]E2
+        }
+        else ok = true; // 1.[2]
+      }
+      else if (cin->peek() == 'e' || cin->peek() == 'E') {
+        str += (char)cin->get();
+        if (decDigits(str)) ok = true; // 1E2
+      }
+    }
+    else
+    {
+      if (cin->peek() == '.') {
+        str += (char)cin->get();
+        if (decDigits(str)) {
+          if (cin->peek() == 'e' || cin->peek() == 'E') {
+            str += (char)cin->get();
+            if (decDigits(str)) ok = true; // .3E2
+          }
+          else ok = true; // .3
+        }
+      }
+    }
+    if (ok) {
+      token = Token((float)atof(str.c_str()),loc);
+    }
+    else cin->unget(str.size());
+    return ok;
+  }
+
+  bool TokenStream::tryInt(Token& token, const ParseLocation& loc) {
+    std::string str;
+    if (decDigits(str)) {
+      token = Token(atoi(str.c_str()),loc);
+      return true;
+    }
+    return false;
+  }
+
+  bool TokenStream::tryString(Token& token, const ParseLocation& loc)
+  {
+    std::string str;
+    if (cin->peek() != '\"') return false;
+    cin->drop();
+    while (cin->peek() != '\"') {
+      const int c = cin->get();
+      if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str());
+      str += (char)c;
+    }
+    cin->drop();
+    token = Token(str,Token::TY_STRING,loc);
+    return true;
+  }
+
+  bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc)
+  {
+    std::string str;
+    if (!isAlpha(cin->peek())) return false;
+    str += (char)cin->get();
+    while (isAlphaNum(cin->peek())) str += (char)cin->get();
+    token = Token(str,Token::TY_IDENTIFIER,loc);
+    return true;
+  }
+
+  void TokenStream::skipSeparators()
+  {
+    /* skip separators */
+    while (cin->peek() != EOF && isSeparator(cin->peek()))
+      cin->drop();
+  }
+
+  Token TokenStream::next()
+  {
+    Token token;
+    skipSeparators();
+    ParseLocation loc = cin->loc();
+    if (trySymbols   (token,loc)) return token;      /**< try to parse a symbol */
+    if (tryFloat     (token,loc)) return token;      /**< try to parse float */
+    if (tryInt       (token,loc)) return token;      /**< try to parse integer */
+    if (tryString    (token,loc)) return token;      /**< try to parse string */
+    if (tryIdentifier(token,loc)) return token;      /**< try to parse identifier */
+    if (cin->peek() == EOF  )     return Token(loc); /**< return EOF token */
+    return Token((char)cin->get(),loc);              /**< return invalid character token */
+  }
+}
diff --git a/thirdparty/embree/common/lexers/tokenstream.h b/thirdparty/embree/common/lexers/tokenstream.h
new file mode 100644
index 0000000000..6e49dd0b39
--- /dev/null
+++ b/thirdparty/embree/common/lexers/tokenstream.h
@@ -0,0 +1,164 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+#include <string>
+#include <vector>
+
+namespace embree
+{
+  /*! token class */
+  class Token
+  {
+  public:
+
+    enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL };
+
+    Token (        const ParseLocation& loc = ParseLocation()) : ty(TY_EOF  ),       loc(loc) {}
+    Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {}
+    Token (int i,  const ParseLocation& loc = ParseLocation()) : ty(TY_INT  ), i(i), loc(loc) {}
+    Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {}
+    Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty),   str(str), loc(loc) {}
+
+    static Token Eof()                { return Token(); }
+    static Token Sym(std::string str) { return Token(str,TY_SYMBOL); }
+    static Token Str(std::string str) { return Token(str,TY_STRING); }
+    static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); }
+
+    char Char() const {
+      if (ty == TY_CHAR) return c;
+      THROW_RUNTIME_ERROR(loc.str()+": character expected");
+    }
+
+    int Int() const {
+      if (ty == TY_INT) return i;
+      THROW_RUNTIME_ERROR(loc.str()+": integer expected");
+    }
+
+    float Float(bool cast = true)  const {
+      if (ty == TY_FLOAT) return f;
+      if (ty == TY_INT && cast) return (float)i;
+      THROW_RUNTIME_ERROR(loc.str()+": float expected");
+    }
+
+    std::string Identifier() const {
+      if (ty == TY_IDENTIFIER) return str;
+      THROW_RUNTIME_ERROR(loc.str()+": identifier expected");
+    }
+
+    std::string String() const {
+      if (ty == TY_STRING) return str;
+      THROW_RUNTIME_ERROR(loc.str()+": string expected");
+    }
+
+    std::string Symbol() const {
+      if (ty == TY_SYMBOL) return str;
+      THROW_RUNTIME_ERROR(loc.str()+": symbol expected");
+    }
+
+    const ParseLocation& Location() const { return loc; }
+
+    friend bool operator==(const Token& a, const Token& b)
+    {
+      if (a.ty != b.ty) return false;
+      if (a.ty == TY_CHAR) return a.c == b.c;
+      if (a.ty == TY_INT) return a.i == b.i;
+      if (a.ty == TY_FLOAT) return a.f == b.f;
+      if (a.ty == TY_IDENTIFIER) return a.str == b.str;
+      if (a.ty == TY_STRING) return a.str == b.str;
+      if (a.ty == TY_SYMBOL) return a.str == b.str;
+      return true;
+    }
+
+    friend bool operator!=(const Token& a, const Token& b) {
+      return !(a == b);
+    }
+
+    friend bool operator <( const Token& a, const Token& b ) {
+      if (a.ty != b.ty) return (int)a.ty < (int)b.ty;
+      if (a.ty == TY_CHAR) return a.c < b.c;
+      if (a.ty == TY_INT) return a.i < b.i;
+      if (a.ty == TY_FLOAT) return a.f < b.f;
+      if (a.ty == TY_IDENTIFIER) return a.str < b.str;
+      if (a.ty == TY_STRING) return a.str < b.str;
+      if (a.ty == TY_SYMBOL) return a.str < b.str;
+      return false;
+    }
+
+    friend std::ostream& operator<<(std::ostream& cout, const Token& t)
+    {
+      if (t.ty == TY_EOF) return cout << "eof";
+      if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")";
+      if (t.ty == TY_INT) return cout << "Int(" << t.i << ")";
+      if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")";
+      if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")";
+      if (t.ty == TY_STRING) return cout << "String(" << t.str << ")";
+      if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")";
+      return cout << "unknown";
+    }
+
+  private:
+    Type ty;            //< the type of the token
+    union {
+      char c;           //< data for char tokens
+      int i;            //< data for int tokens
+      float f;          //< data for float tokens
+    };
+    std::string str;    //< data for string and identifier tokens
+    ParseLocation loc;  //< the location the token is from
+  };
+
+  /*! build full tokenizer that takes list of valid characters and keywords */
+  class TokenStream : public Stream<Token>
+  {
+  public:
+
+    /*! shorthands for common sets of characters */
+    static const std::string alpha;
+    static const std::string ALPHA;
+    static const std::string numbers;
+    static const std::string separators;
+    static const std::string stringChars;
+
+  public:
+    TokenStream(const Ref<Stream<int> >& cin,
+                const std::string& alpha, //< valid characters for identifiers
+                const std::string& seps,  //< characters that act as separators
+                const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols
+  public:
+    ParseLocation location() { return cin->loc(); }
+    Token next();
+    bool trySymbol(const std::string& symbol);
+
+  private:
+    void skipSeparators();
+    bool decDigits(std::string& str);
+    bool decDigits1(std::string& str);
+    bool trySymbols(Token& token, const ParseLocation& loc);
+    bool tryFloat(Token& token, const ParseLocation& loc);
+    bool tryInt(Token& token, const ParseLocation& loc);
+    bool tryString(Token& token, const ParseLocation& loc);
+    bool tryIdentifier(Token& token, const ParseLocation& loc);
+
+    Ref<Stream<int> > cin;
+    bool isSepMap[256];
+    bool isAlphaMap[256];
+    bool isStringCharMap[256];
+    std::vector<std::string> symbols;
+
+    /*! checks if a character is a separator */
+    __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
+
+    /*! checks if a character is a number */
+    __forceinline bool isDigit(unsigned int c) const {  return c >= '0' && c <= '9'; }
+
+    /*! checks if a character is valid inside a string */
+    __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; }
+
+    /*! checks if a character is legal for an identifier */
+    __forceinline bool isAlpha(unsigned int c) const {  return c<256 && isAlphaMap[c];  }
+    __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); }
+  };
+}
diff --git a/thirdparty/embree/common/math/affinespace.h b/thirdparty/embree/common/math/affinespace.h
new file mode 100644
index 0000000000..9d4a0f0846
--- /dev/null
+++ b/thirdparty/embree/common/math/affinespace.h
@@ -0,0 +1,361 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "linearspace2.h"
+#include "linearspace3.h"
+#include "quaternion.h"
+#include "bbox.h"
+#include "vec4.h"
+
+namespace embree
+{
+  #define VectorT typename L::Vector
+  #define ScalarT typename L::Vector::Scalar
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Affine Space
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L>
+    struct AffineSpaceT
+    {
+      L l;           /*< linear part of affine space */
+      VectorT p;     /*< affine part of affine space */
+
+      ////////////////////////////////////////////////////////////////////////////////
+      // Constructors, Assignment, Cast, Copy Operations
+      ////////////////////////////////////////////////////////////////////////////////
+
+      __forceinline AffineSpaceT           ( )                           { }
+      __forceinline AffineSpaceT           ( const AffineSpaceT& other ) { l = other.l; p = other.p; }
+      __forceinline AffineSpaceT           ( const L           & other ) { l = other  ; p = VectorT(zero); }
+      __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; }
+
+      __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {}
+      __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {}
+
+      template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {}
+
+      ////////////////////////////////////////////////////////////////////////////////
+      // Constants
+      ////////////////////////////////////////////////////////////////////////////////
+
+      __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {}
+      __forceinline AffineSpaceT( OneTy )  : l(one),  p(zero) {}
+
+      /*! return matrix for scaling */
+      static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); }
+
+      /*! return matrix for translation */
+      static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); }
+
+      /*! return matrix for rotation, only in 2D */
+      static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); }
+
+      /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */
+      static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); }
+
+      /*! return matrix for rotation around arbitrary axis and point, only in 3D */
+      static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p);  }
+
+      /*! return matrix for looking at given point, only in 3D */
+      static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) {
+        VectorT Z = normalize(point-eye);
+        VectorT U = normalize(cross(up,Z));
+        VectorT V = normalize(cross(Z,U));
+        return AffineSpaceT(L(U,V,Z),eye);
+      }
+
+    };
+  
+  // template specialization to get correct identity matrix for type AffineSpace3fa
+  template<>
+    __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy )  : l(one),  p(0.f, 0.f, 0.f, 1.f) {}
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); }
+  template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); }
+  template<typename L> __forceinline AffineSpaceT<L>        rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); }
+
+  template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT        & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT        & b ) { return a * rcp(b); }
+
+  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; }
+  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = a * b; }
+  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; }
+  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = a / b; }
+
+  template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); }
+  template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); }
+  template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); }
+
+  __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b) 
+  { 
+    BBox3fa dst = empty;
+    const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0));
+    const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1));
+    const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2));
+    const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3));
+    const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4));
+    const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5));
+    const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6));
+    const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7));
+    return dst;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; }
+  template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) {
+    return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) {
+    return cout << "{ l = " << m.l << ", p = " << m.p << " }";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Template Instantiations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef AffineSpaceT<LinearSpace2f> AffineSpace2f;
+  typedef AffineSpaceT<LinearSpace3f> AffineSpace3f;
+  typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa;
+  typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx;
+  typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff;
+  typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f;
+
+  template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>>  AffineSpace3vf4;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>>  AffineSpace3vf8;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16;
+
+  template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>>  AffineSpace3vfa4;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>>  AffineSpace3vfa8;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16;
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Interpolation
+  //////////////////////////////////////////////////////////////////////////////
+  template<typename T, typename R>
+  __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0,
+                                     const AffineSpaceT<T>& M1,
+                                     const R& t)
+  {
+    return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t));
+  }
+
+  // slerp interprets the 16 floats of the matrix M = D * R * S as components of
+  // three matrizes (D, R, S) that are interpolated individually.
+  template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>>
+  slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0,
+        const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1,
+        const T& t)
+  {
+    QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w);
+    QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+    QuaternionT<T> q = slerp(q0, q1, t);
+
+    AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t);
+    AffineSpaceT<LinearSpace3<Vec3<T>>> D(one);
+    D.p.x = S.l.vx.y;
+    D.p.y = S.l.vx.z;
+    D.p.z = S.l.vy.z;
+    S.l.vx.y = 0;
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+
+    AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q);
+    return D * R * S;
+  }
+
+  // this is a specialized version for Vec3fa because that does
+  // not play along nicely with the other templated Vec3/Vec4 types
+  __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0,
+                                     const AffineSpace3ff& M1,
+                                     const float& t)
+  {
+    Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w);
+    Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+    Quaternion3f q = slerp(q0, q1, t);
+
+    AffineSpace3fa S = lerp(M0, M1, t);
+    AffineSpace3fa D(one);
+    D.p.x = S.l.vx.y;
+    D.p.y = S.l.vx.z;
+    D.p.z = S.l.vy.z;
+    S.l.vx.y = 0;
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+
+    AffineSpace3fa R = LinearSpace3fa(q);
+    return D * R * S;
+  }
+  
+  __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd)
+  {
+    // compute affine transform from quaternion decomposition
+    Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w);
+    AffineSpace3fa M = qd;
+    AffineSpace3fa D(one);
+    D.p.x = M.l.vx.y;
+    D.p.y = M.l.vx.z;
+    D.p.z = M.l.vy.z;
+    M.l.vx.y = 0;
+    M.l.vx.z = 0;
+    M.l.vy.z = 0;
+    AffineSpace3fa R = LinearSpace3fa(q);
+    return D * R * M;
+  }
+  
+  __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S)
+  {
+    q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w);
+    S = qd;
+    T.x = qd.l.vx.y;
+    T.y = qd.l.vx.z;
+    T.z = qd.l.vy.z;
+    S.l.vx.y = 0;
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+  }
+
+  __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S)
+  {
+    AffineSpace3ff M = S;
+    M.l.vx.w = q.i;
+    M.l.vy.w = q.j;
+    M.l.vz.w = q.k;
+    M.p.w    = q.r;
+    M.l.vx.y = T.x;
+    M.l.vx.z = T.y;
+    M.l.vy.z = T.z;
+    return M;
+  }
+
+  struct __aligned(16) QuaternionDecomposition
+  {
+    float scale_x = 1.f;
+    float scale_y = 1.f;
+    float scale_z = 1.f;
+    float skew_xy = 0.f;
+    float skew_xz = 0.f;
+    float skew_yz = 0.f;
+    float shift_x = 0.f;
+    float shift_y = 0.f;
+    float shift_z = 0.f;
+    float quaternion_r = 1.f;
+    float quaternion_i = 0.f;
+    float quaternion_j = 0.f;
+    float quaternion_k = 0.f;
+    float translation_x = 0.f;
+    float translation_y = 0.f;
+    float translation_z = 0.f;
+  };
+
+  __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M)
+  {
+    QuaternionDecomposition qd;
+    qd.scale_x       = M.l.vx.x;
+    qd.scale_y       = M.l.vy.y;
+    qd.scale_z       = M.l.vz.z;
+    qd.shift_x       = M.p.x;
+    qd.shift_y       = M.p.y;
+    qd.shift_z       = M.p.z;
+    qd.translation_x = M.l.vx.y;
+    qd.translation_y = M.l.vx.z;
+    qd.translation_z = M.l.vy.z;
+    qd.skew_xy       = M.l.vy.x;
+    qd.skew_xz       = M.l.vz.x;
+    qd.skew_yz       = M.l.vz.y;
+    qd.quaternion_r  = M.p.w;
+    qd.quaternion_i  = M.l.vx.w;
+    qd.quaternion_j  = M.l.vy.w;
+    qd.quaternion_k  = M.l.vz.w;
+    return qd;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /*
+   * ! Template Specialization for 2D: return matrix for rotation around point
+   * (rotation around arbitrarty vector is not meaningful in 2D)
+   */
+  template<> __forceinline
+  AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) {
+    return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Similarity Transform
+  //
+  // checks, if M is a similarity transformation, i.e if there exists a factor D
+  // such that for all x,y: distance(Mx, My) = D * distance(x, y)
+  ////////////////////////////////////////////////////////////////////////////////
+  __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D)
+  {
+    if (D) *D = 0.f;
+    if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false;
+    if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false;
+    if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false;
+
+    const float D_x = dot(M.l.vx, M.l.vx);
+    const float D_y = dot(M.l.vy, M.l.vy);
+    const float D_z = dot(M.l.vz, M.l.vz);
+
+    if (abs(D_x - D_y) > 1e-5f ||
+        abs(D_x - D_z) > 1e-5f ||
+        abs(D_y - D_z) > 1e-5f)
+      return false;
+
+    if (D) *D = sqrtf(D_x);
+    return true;
+  }
+
+  __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr)
+  {
+    Vec3fa::storeu(&ptr->l.vx, source.l.vx);
+    Vec3fa::storeu(&ptr->l.vy, source.l.vy);
+    Vec3fa::storeu(&ptr->l.vz, source.l.vz);
+    Vec3fa::storeu(&ptr->p, source.p);
+  }
+
+  __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr)
+  {
+    AffineSpace3fa space;
+    space.l.vx = Vec3fa::loadu(&ptr->l.vx);
+    space.l.vy = Vec3fa::loadu(&ptr->l.vy);
+    space.l.vz = Vec3fa::loadu(&ptr->l.vz);
+    space.p    = Vec3fa::loadu(&ptr->p);
+    return space;
+  }
+
+  #undef VectorT
+  #undef ScalarT
+}
diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h
new file mode 100644
index 0000000000..bc43155358
--- /dev/null
+++ b/thirdparty/embree/common/math/bbox.h
@@ -0,0 +1,331 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+
+namespace embree
+{
+  namespace internal {
+
+    template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); }
+    template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; }
+    template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; }
+
+  } // namespace internal
+  template<typename T>
+  struct BBox
+  {
+    T lower, upper;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline BBox           ( )                   { }
+    template<typename T1>
+    __forceinline BBox           ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {}
+    __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+    __forceinline BBox ( const T& v                     ) : lower(v), upper(v) {}
+    __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Extending Bounds
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
+    __forceinline const BBox& extend(const T   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; }
+
+    /*! tests if box is empty */
+    __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; }
+
+    /*! computes the size of the box */
+    __forceinline T size() const { return upper - lower; }
+
+    /*! computes the center of the box */
+    __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); }
+
+    /*! computes twice the center of the box */
+    __forceinline T center2() const { return lower+upper; }
+
+    /*! merges two boxes */
+    __forceinline static const BBox merge (const BBox& a, const BBox& b) {
+      return BBox(min(a.lower, b.lower), max(a.upper, b.upper));
+    }
+
+     /*! enlarge box by some scaling factor */
+    __forceinline BBox enlarge_by(const float a) const {
+      return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper));
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
+    __forceinline BBox( FullTy  ) : lower(neg_inf), upper(pos_inf) {}
+    __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {}
+    __forceinline BBox( TrueTy  ) : lower(neg_inf), upper(pos_inf) {}
+    __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {}
+    __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {}
+  };
+
+  template<> __forceinline bool BBox<float>::empty() const {
+    return lower > upper;
+  }
+
+#if defined(__SSE__)
+  template<> __forceinline bool BBox<Vec3fa>::empty() const {
+    return !all(le_mask(lower,upper));
+  }
+  template<> __forceinline bool BBox<Vec3fx>::empty() const {
+    return !all(le_mask(lower,upper));
+  }
+#endif
+
+  /*! tests if box is finite */
+  __forceinline bool isvalid( const BBox<Vec3fa>& v ) {
+    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)));
+  }
+
+  /*! tests if box is finite and non-empty*/
+  __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) {
+    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper));
+  }
+  
+  /*! tests if box has finite entries */
+  __forceinline bool is_finite( const BBox<Vec3fa>& b) {
+    return is_finite(b.lower) && is_finite(b.upper);
+  }
+
+  /*! test if point contained in box */
+  __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); }
+
+  /*! computes the center of the box */
+  template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; }
+  template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); }
+
+  /*! computes the volume of a bounding box */
+  __forceinline float volume    ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); }
+  __forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); }
+
+  /*! computes the volume of a bounding box */
+  __forceinline float volume( const BBox<Vec3f>& b )  { return reduce_mul(b.size()); }
+
+  /*! computes the surface area of a bounding box */
+  template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; }
+
+  template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); }
+  template<typename T> __forceinline const T     area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); }
+
+  __forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); }
+  __forceinline float     area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); }
+
+  __forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); }
+  __forceinline float     area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); }
+
+  template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); }
+
+  template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) {
+    return halfArea(box);
+  }
+
+  /*! merges bounding boxes and points */
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const       T& b ) { return BBox<T>(min(a.lower, b    ), max(a.upper, b    )); }
+  template<typename T> __forceinline const BBox<T> merge( const       T& a, const BBox<T>& b ) { return BBox<T>(min(a    , b.lower), max(a    , b.upper)); }
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); }
+
+  /*! Merges three boxes. */
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); }
+
+  /*! Merges four boxes. */
+  template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) {
+    return merge(merge(a,b),merge(c,d));
+  }
+
+  /*! Comparison Operators */
+  template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; }
+  template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; }
+
+  /*! scaling */
+  template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+  template<typename T> __forceinline BBox<T> operator *( const     T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+
+  /*! translations */
+  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); }
+  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); }
+  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower+b      ,a.upper+b      ); }
+  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower-b      ,a.upper-b      ); }
+
+  /*! extension */
+  template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); }
+
+  /*! intersect bounding boxes */
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); }
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); }
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); }
+
+  /*! subtract bounds from each other */
+  template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d)
+  {
+    c.lower = a.lower;
+    c.upper = min(a.upper,b.lower);
+    d.lower = max(a.lower,b.upper);
+    d.upper = a.upper;
+  }
+
+  /*! tests if bounding boxes (and points) are disjoint (empty intersection) */
+  template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); }
+  template<typename T> __inline bool disjoint( const BBox<T>& a, const       T& b ) { return disjoint(a,BBox<T>(b)); }
+  template<typename T> __inline bool disjoint( const       T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); }
+
+  /*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */
+  template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); }
+  template<typename T> __inline bool conjoint( const BBox<T>& a, const       T& b ) { return conjoint(a,BBox<T>(b)); }
+  template<typename T> __inline bool conjoint( const       T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); }
+
+  /*! subset relation */
+  template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b )
+  { 
+    for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false;
+    for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false;
+    return true; 
+  }
+
+  template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) {
+    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+  }
+
+  template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) {
+    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+  }
+  
+  /*! blending */
+  template<typename T>
+    __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) {
+    return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t));
+  }
+
+  /*! output operator */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) {
+    return cout << "[" << box.lower << "; " << box.upper << "]";
+  }
+
+  /*! default template instantiations */
+  typedef BBox<float> BBox1f;
+  typedef BBox<Vec2f> BBox2f;
+  typedef BBox<Vec2fa> BBox2fa;
+  typedef BBox<Vec3f> BBox3f;
+  typedef BBox<Vec3fa> BBox3fa;
+  typedef BBox<Vec3fx> BBox3fx;
+  typedef BBox<Vec3ff> BBox3ff;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined __SSE__
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<int N>
+    __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds);
+  
+  template<>
+    __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds)
+  {
+    BBox<Vec3<vfloat4>> dest;
+    
+    transpose((vfloat4&)bounds[0].lower,
+              (vfloat4&)bounds[1].lower,
+              (vfloat4&)bounds[2].lower,
+              (vfloat4&)bounds[3].lower,
+              dest.lower.x,
+              dest.lower.y,
+              dest.lower.z);
+    
+    transpose((vfloat4&)bounds[0].upper,
+              (vfloat4&)bounds[1].upper,
+              (vfloat4&)bounds[2].upper,
+              (vfloat4&)bounds[3].upper,
+              dest.upper.x,
+              dest.upper.y,
+              dest.upper.z);
+    
+    return dest;
+  }
+  
+#if defined(__AVX__)
+  template<>
+    __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds)
+  {
+    BBox<Vec3<vfloat8>> dest;
+    
+    transpose((vfloat4&)bounds[0].lower,
+              (vfloat4&)bounds[1].lower,
+              (vfloat4&)bounds[2].lower,
+              (vfloat4&)bounds[3].lower,
+              (vfloat4&)bounds[4].lower,
+              (vfloat4&)bounds[5].lower,
+              (vfloat4&)bounds[6].lower,
+              (vfloat4&)bounds[7].lower,
+              dest.lower.x,
+              dest.lower.y,
+              dest.lower.z);
+    
+    transpose((vfloat4&)bounds[0].upper,
+              (vfloat4&)bounds[1].upper,
+              (vfloat4&)bounds[2].upper,
+              (vfloat4&)bounds[3].upper,
+              (vfloat4&)bounds[4].upper,
+              (vfloat4&)bounds[5].upper,
+              (vfloat4&)bounds[6].upper,
+              (vfloat4&)bounds[7].upper,
+              dest.upper.x,
+              dest.upper.y,
+              dest.upper.z);
+    
+    return dest;
+  }
+#endif
+  
+  template<int N>
+    __forceinline BBox3fa merge(const BBox3fa* bounds);
+  
+  template<>
+    __forceinline BBox3fa merge<4>(const BBox3fa* bounds)
+  {
+    const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower),
+                             min(bounds[2].lower,bounds[3].lower));
+    const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper),
+                             max(bounds[2].upper,bounds[3].upper));
+    return BBox3fa(lower,upper);
+  }
+  
+#if defined(__AVX__)
+  template<>
+    __forceinline BBox3fa merge<8>(const BBox3fa* bounds)
+  {
+    const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)),
+                             min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower)));
+    const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)),
+                             max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper)));
+    return BBox3fa(lower,upper);
+  }
+#endif
+}
+
diff --git a/thirdparty/embree/common/math/col3.h b/thirdparty/embree/common/math/col3.h
new file mode 100644
index 0000000000..3f50c04393
--- /dev/null
+++ b/thirdparty/embree/common/math/col3.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGB Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Col3
+  {
+    T r, g, b;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col3           ( )                   { }
+    __forceinline Col3           ( const Col3& other ) { r = other.r; g = other.g; b = other.b; }
+    __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; }
+
+    __forceinline explicit Col3 (const T& v)                         : r(v), g(v), b(v) {}
+    __forceinline          Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col3 (ZeroTy)   : r(zero)   , g(zero)   , b(zero)    {}
+    __forceinline Col3 (OneTy)    : r(one)    , g(one)    , b(one)     {}
+    __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {}
+    __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {}
+  };
+
+  /*! output operator */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+  }
+
+  /*! default template instantiations */
+  typedef Col3<unsigned char> Col3uc;
+  typedef Col3<float        > Col3f;
+}
diff --git a/thirdparty/embree/common/math/col4.h b/thirdparty/embree/common/math/col4.h
new file mode 100644
index 0000000000..788508516b
--- /dev/null
+++ b/thirdparty/embree/common/math/col4.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGBA Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Col4
+  {
+    T r, g, b, a;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col4           ( )                   { }
+    __forceinline Col4           ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; }
+    __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; }
+
+    __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {}
+    __forceinline          Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col4 (ZeroTy)   : r(zero)   , g(zero)   , b(zero)   , a(zero) {}
+    __forceinline Col4 (OneTy)    : r(one)    , g(one)    , b(one)    , a(one) {}
+    __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {}
+    __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {}
+  };
+
+  /*! output operator */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")";
+  }
+
+  /*! default template instantiations */
+  typedef Col4<unsigned char> Col4uc;
+  typedef Col4<float        > Col4f;
+}
diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h
new file mode 100644
index 0000000000..529584ea16
--- /dev/null
+++ b/thirdparty/embree/common/math/color.h
@@ -0,0 +1,241 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "constants.h"
+#include "col3.h"
+#include "col4.h"
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGBA Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color4
+  {
+    union {
+      __m128 m128;
+      struct { float r,g,b,a; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4 () {}
+    __forceinline Color4 ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {}
+    __forceinline          Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {}
+
+    __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }
+    __forceinline explicit Color4 ( const Col3f&  other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); }
+    __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }
+    __forceinline explicit Color4 ( const Col4f&  other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); }
+
+    __forceinline Color4           ( const Color4& other ) : m128(other.m128) {}
+    __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; }
+    __forceinline void set(Col3uc& d) const 
+    {
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;
+      d.r = (unsigned char)(s[0]); 
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+    }
+    __forceinline void set(Col4uc& d) const 
+    {
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;
+      d.r = (unsigned char)(s[0]); 
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+      d.a = (unsigned char)(s[3]); 
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
+    __forceinline Color4( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGB Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color
+  {
+    union {
+      __m128 m128;
+      struct { float r,g,b; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color () {}
+    __forceinline Color ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color  (const float v)                               : m128(_mm_set1_ps(v)) {}
+    __forceinline          Color  (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {}
+
+    __forceinline Color           ( const Color& other ) : m128(other.m128) {}
+    __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; }
+
+    __forceinline Color           ( const Color4& other ) : m128(other.m128) {}
+    __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; }
+    __forceinline void set(Col3uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;
+      d.r = (unsigned char)(s[0]); 
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+    }
+    __forceinline void set(Col4uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;
+      d.r = (unsigned char)(s[0]); 
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+      d.a = 255; 
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
+    __forceinline Color( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a ) { return a; }
+  __forceinline const Color operator -( const Color& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return _mm_xor_ps(a.m128, mask);
+  }
+  __forceinline const Color abs  ( const Color& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+    return _mm_and_ps(a.m128, mask);
+  }
+  __forceinline const Color rcp  ( const Color& a )
+  {
+#if defined(__AVX512VL__)
+    const Color r = _mm_rcp14_ps(a.m128);
+#else
+    const Color r = _mm_rcp_ps(a.m128);
+#endif
+    return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+  }
+  __forceinline const Color rsqrt( const Color& a )
+  {
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+  }
+  __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); }
+  __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline const Color operator *( const Color& a, const float  b ) { return a * Color(b); }
+  __forceinline const Color operator *( const float  a, const Color& b ) { return Color(a) * b; }
+  __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); }
+  __forceinline const Color operator /( const Color& a, const float  b ) { return a * rcp(b); }
+
+  __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); }
+  __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; }
+  __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; }
+  __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; }
+  __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; }
+  __forceinline const Color operator*=(Color& a, const float b      ) { return a = a * b; }
+  __forceinline const Color operator/=(Color& a, const float b      ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; }
+  __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; }
+  __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); }
+  __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
+  __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
+  __forceinline bool operator < ( const Color& a, const Color& b ) {
+    if (a.r != b.r) return a.r < b.r;
+    if (a.g != b.g) return a.g < b.g;
+    if (a.b != b.b) return a.b < b.b;
+    return false;
+  }
+
+   ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color select( bool s, const Color& t, const Color& f ) {
+    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
+    return blendv_ps(f, t, mask);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Special Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! computes luminance of a color */
+  __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); }
+
+  /*! output operator */
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+  }
+}
diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp
new file mode 100644
index 0000000000..03919ae20c
--- /dev/null
+++ b/thirdparty/embree/common/math/constants.cpp
@@ -0,0 +1,27 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "constants.h"
+
+namespace embree
+{
+  TrueTy True;
+  FalseTy False;
+  ZeroTy zero;
+  OneTy one;
+  NegInfTy neg_inf;
+  PosInfTy inf;
+  PosInfTy pos_inf;
+  NaNTy nan;
+  UlpTy ulp;
+  PiTy pi;
+  OneOverPiTy one_over_pi;
+  TwoPiTy two_pi;
+  OneOverTwoPiTy one_over_two_pi;
+  FourPiTy four_pi;
+  OneOverFourPiTy one_over_four_pi;
+  StepTy step;
+  ReverseStepTy reverse_step;
+  EmptyTy empty;
+  UndefinedTy undefined;
+}
diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h
new file mode 100644
index 0000000000..578473a8ab
--- /dev/null
+++ b/thirdparty/embree/common/math/constants.h
@@ -0,0 +1,197 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+#include <limits>
+
+#define _USE_MATH_DEFINES
+#include <math.h> // using cmath causes issues under Windows
+#include <cfloat>
+#include <climits>
+
+namespace embree
+{
+  static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f;
+  static MAYBE_UNUSED const float min_rcp_input = 1E-18f;  // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail
+
+  /* we consider floating point numbers in that range as valid input numbers */
+  static MAYBE_UNUSED float FLT_LARGE = 1.844E18f;
+
+  struct TrueTy {
+    __forceinline operator bool( ) const { return true; }
+  };
+
+  extern MAYBE_UNUSED TrueTy True;
+
+  struct FalseTy {
+    __forceinline operator bool( ) const { return false; }
+  };
+
+  extern MAYBE_UNUSED FalseTy False;
+  
+  struct ZeroTy
+  {
+    __forceinline operator          double   ( ) const { return 0; }
+    __forceinline operator          float    ( ) const { return 0; }
+    __forceinline operator          long long( ) const { return 0; }
+    __forceinline operator unsigned long long( ) const { return 0; }
+    __forceinline operator          long     ( ) const { return 0; }
+    __forceinline operator unsigned long     ( ) const { return 0; }
+    __forceinline operator          int      ( ) const { return 0; }
+    __forceinline operator unsigned int      ( ) const { return 0; }
+    __forceinline operator          short    ( ) const { return 0; }
+    __forceinline operator unsigned short    ( ) const { return 0; }
+    __forceinline operator          char     ( ) const { return 0; }
+    __forceinline operator unsigned char     ( ) const { return 0; }
+  }; 
+
+  extern MAYBE_UNUSED ZeroTy zero;
+
+  struct OneTy
+  {
+    __forceinline operator          double   ( ) const { return 1; }
+    __forceinline operator          float    ( ) const { return 1; }
+    __forceinline operator          long long( ) const { return 1; }
+    __forceinline operator unsigned long long( ) const { return 1; }
+    __forceinline operator          long     ( ) const { return 1; }
+    __forceinline operator unsigned long     ( ) const { return 1; }
+    __forceinline operator          int      ( ) const { return 1; }
+    __forceinline operator unsigned int      ( ) const { return 1; }
+    __forceinline operator          short    ( ) const { return 1; }
+    __forceinline operator unsigned short    ( ) const { return 1; }
+    __forceinline operator          char     ( ) const { return 1; }
+    __forceinline operator unsigned char     ( ) const { return 1; }
+  };
+
+  extern MAYBE_UNUSED OneTy one;
+
+  struct NegInfTy
+  {
+    __forceinline operator          double   ( ) const { return -std::numeric_limits<double>::infinity(); }
+    __forceinline operator          float    ( ) const { return -std::numeric_limits<float>::infinity(); }
+    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::min(); }
+    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); }
+    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::min(); }
+    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::min(); }
+    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::min(); }
+    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::min(); }
+    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::min(); }
+    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::min(); }
+    __forceinline operator          char     ( ) const { return std::numeric_limits<char>::min(); }
+    __forceinline operator unsigned char     ( ) const { return std::numeric_limits<unsigned char>::min(); }
+
+  };
+
+  extern MAYBE_UNUSED NegInfTy neg_inf;
+
+  struct PosInfTy
+  {
+    __forceinline operator          double   ( ) const { return std::numeric_limits<double>::infinity(); }
+    __forceinline operator          float    ( ) const { return std::numeric_limits<float>::infinity(); }
+    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::max(); }
+    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); }
+    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::max(); }
+    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::max(); }
+    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::max(); }
+    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::max(); }
+    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::max(); }
+    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::max(); }
+    __forceinline operator          char     ( ) const { return std::numeric_limits<char>::max(); }
+    __forceinline operator unsigned char     ( ) const { return std::numeric_limits<unsigned char>::max(); }
+  };
+
+  extern MAYBE_UNUSED PosInfTy inf;
+  extern MAYBE_UNUSED PosInfTy pos_inf;
+
+  struct NaNTy
+  {
+    __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); }
+    __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
+  };
+
+  extern MAYBE_UNUSED NaNTy nan;
+
+  struct UlpTy
+  {
+    __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
+    __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
+  };
+
+  extern MAYBE_UNUSED UlpTy ulp;
+
+  struct PiTy
+  {
+    __forceinline operator double( ) const { return double(M_PI); }
+    __forceinline operator float ( ) const { return float(M_PI); }
+  };
+
+  extern MAYBE_UNUSED PiTy pi;
+
+  struct OneOverPiTy
+  {
+    __forceinline operator double( ) const { return double(M_1_PI); }
+    __forceinline operator float ( ) const { return float(M_1_PI); }
+  };
+
+  extern MAYBE_UNUSED OneOverPiTy one_over_pi;
+
+  struct TwoPiTy
+  {
+    __forceinline operator double( ) const { return double(2.0*M_PI); }
+    __forceinline operator float ( ) const { return float(2.0*M_PI); }
+  };
+
+  extern MAYBE_UNUSED TwoPiTy two_pi;
+
+  struct OneOverTwoPiTy
+  {
+    __forceinline operator double( ) const { return double(0.5*M_1_PI); }
+    __forceinline operator float ( ) const { return float(0.5*M_1_PI); }
+  };
+
+  extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi;
+
+  struct FourPiTy
+  {
+    __forceinline operator double( ) const { return double(4.0*M_PI); } 
+    __forceinline operator float ( ) const { return float(4.0*M_PI); }
+  };
+
+  extern MAYBE_UNUSED FourPiTy four_pi;
+
+  struct OneOverFourPiTy
+  {
+    __forceinline operator double( ) const { return double(0.25*M_1_PI); }
+    __forceinline operator float ( ) const { return float(0.25*M_1_PI); }
+  };
+
+  extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi;
+
+  struct StepTy {
+  };
+
+  extern MAYBE_UNUSED StepTy step;
+
+  struct ReverseStepTy {
+  };
+
+  extern MAYBE_UNUSED ReverseStepTy reverse_step;
+
+  struct EmptyTy {
+  };
+
+  extern MAYBE_UNUSED EmptyTy empty;
+
+  struct FullTy {
+  };
+
+  extern MAYBE_UNUSED FullTy full;
+
+  struct UndefinedTy {
+  };
+
+  extern MAYBE_UNUSED UndefinedTy undefined;
+}
diff --git a/thirdparty/embree/common/math/interval.h b/thirdparty/embree/common/math/interval.h
new file mode 100644
index 0000000000..310add2129
--- /dev/null
+++ b/thirdparty/embree/common/math/interval.h
@@ -0,0 +1,161 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+#include "bbox.h"
+
+namespace embree
+{
+  template<typename V>
+    struct Interval
+    {
+      V lower, upper;
+      
+      __forceinline Interval() {}
+      __forceinline Interval           ( const Interval& other ) { lower = other.lower; upper = other.upper; }
+      __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+      __forceinline Interval(const V& a) : lower(a), upper(a) {}
+      __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {}
+      __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {}
+          
+      /*! tests if box is empty */
+      //__forceinline bool empty() const { return lower > upper; }
+      
+      /*! computes the size of the interval */
+      __forceinline V size() const { return upper - lower; }
+      
+      __forceinline V center() const { return 0.5f*(lower+upper); }
+      
+      __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
+      __forceinline const Interval& extend(const V   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; }
+      
+      __forceinline friend Interval operator +( const Interval& a, const Interval& b ) {
+        return Interval(a.lower+b.lower,a.upper+b.upper);
+      }
+      
+      __forceinline friend Interval operator -( const Interval& a, const Interval& b ) {
+        return Interval(a.lower-b.upper,a.upper-b.lower);
+      }
+      
+      __forceinline friend Interval operator -( const Interval& a, const V& b ) {
+        return Interval(a.lower-b,a.upper-b);
+      }
+      
+      __forceinline friend Interval operator *( const Interval& a, const Interval& b )
+      {
+        const V ll = a.lower*b.lower;
+        const V lu = a.lower*b.upper;
+        const V ul = a.upper*b.lower;
+        const V uu = a.upper*b.upper;
+        return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu));
+      }
+      
+      __forceinline friend Interval merge( const Interval& a, const Interval& b) {
+        return Interval(min(a.lower,b.lower),max(a.upper,b.upper));
+      }
+      
+      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) {
+        return merge(merge(a,b),c);
+      }
+      
+      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) {
+        return merge(merge(a,b),merge(c,d));
+      }
+      
+      /*! intersect bounding boxes */
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); }
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); }
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); }       
+      
+      friend embree_ostream operator<<(embree_ostream cout, const Interval& a) {
+        return cout << "[" << a.lower << ", " << a.upper << "]";
+      }
+      
+      ////////////////////////////////////////////////////////////////////////////////
+      /// Constants
+      ////////////////////////////////////////////////////////////////////////////////
+      
+      __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
+      __forceinline Interval( FullTy  ) : lower(neg_inf), upper(pos_inf) {}
+    };
+
+  __forceinline bool isEmpty(const Interval<float>& v) { 
+    return v.lower > v.upper;
+  }
+
+  __forceinline vboolx isEmpty(const Interval<vfloatx>& v) {
+    return v.lower > v.upper;
+  }
+  
+  /*! subset relation */
+  template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) { 
+    return (a.lower > b.lower) && (a.upper < b.upper);
+  }
+
+  template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { 
+    return subset(a.x,b.x) && subset(a.y,b.y);
+  }
+
+  template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) {
+    return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f ) {
+    return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper));
+  }
+
+  template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f ) {
+    return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper));
+  }
+
+  __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1)
+  {
+    float eps = 1E-4f;
+    bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps;
+    bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps;
+    return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1);
+  }
+  
+  typedef Interval<float> Interval1f;
+  typedef Vec2<Interval<float>> Interval2f;
+  typedef Vec3<Interval<float>> Interval3f;
+
+inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; }
+
+inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); }
+
+#define TWO_PI (2.0*M_PI)
+inline Interval1f sin(Interval1f interval)
+{
+  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+  float sinLower = sin(interval.lower);
+  float sinUpper = sin(interval.upper);
+  if (sinLower > sinUpper) swap(sinLower, sinUpper);
+  if (interval.lower <       M_PI / 2.0 && interval.upper >       M_PI / 2.0) sinUpper =  1.0;
+  if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0;
+  return Interval1f(sinLower, sinUpper);
+}
+
+inline Interval1f cos(Interval1f interval)
+{
+  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+  float cosLower = cos(interval.lower);
+  float cosUpper = cos(interval.upper);
+  if (cosLower > cosUpper) swap(cosLower, cosUpper);
+  if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0;
+  return Interval1f(cosLower, cosUpper);
+}
+#undef TWO_PI
+}
diff --git a/thirdparty/embree/common/math/lbbox.h b/thirdparty/embree/common/math/lbbox.h
new file mode 100644
index 0000000000..2b397a05c8
--- /dev/null
+++ b/thirdparty/embree/common/math/lbbox.h
@@ -0,0 +1,289 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "range.h"
+
+namespace embree
+{
+  template<typename T>
+    __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt)
+  {
+    const float rcp_dt_size = float(1.0f)/dt.size();
+    const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size);
+    const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size);
+    return std::make_pair(g0,g1);
+  }
+
+  template<typename T>
+  struct LBBox
+  {
+  public:
+    __forceinline LBBox () {}
+
+    template<typename T1>
+    __forceinline LBBox ( const LBBox<T1>& other )
+    : bounds0(other.bounds0), bounds1(other.bounds1) {} 
+
+    __forceinline LBBox& operator= ( const LBBox& other ) { 
+      bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; 
+    }
+
+    __forceinline LBBox (EmptyTy) 
+      : bounds0(EmptyTy()), bounds1(EmptyTy()) {}
+    
+    __forceinline explicit LBBox ( const BBox<T>& bounds) 
+      : bounds0(bounds), bounds1(bounds) { }
+    
+    __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1) 
+      : bounds0(bounds0), bounds1(bounds1) { }
+
+    LBBox ( const avector<BBox<T>>& bounds ) 
+    {
+      assert(bounds.size());
+      BBox<T> b0 = bounds.front();
+      BBox<T> b1 = bounds.back();
+      for (size_t i=1; i<bounds.size()-1; i++) {
+        const float f = float(i)/float(bounds.size()-1);
+        const BBox<T> bt = lerp(b0,b1,f);
+        const T dlower = min(bounds[i].lower-bt.lower,T(zero));
+        const T dupper = max(bounds[i].upper-bt.upper,T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments)
+    {
+      const float lower = time_range.lower*numTimeSegments;
+      const float upper = time_range.upper*numTimeSegments;
+      const float ilowerf = floor(lower);
+      const float iupperf = ceil(upper);
+      const int ilower = (int)ilowerf;
+      const int iupper = (int)iupperf;
+
+      const BBox<T> blower0 = bounds(ilower);
+      const BBox<T> bupper1 = bounds(iupper);
+
+      if (iupper-ilower == 1) {
+        bounds0 = lerp(blower0, bupper1, lower-ilowerf);
+        bounds1 = lerp(bupper1, blower0, iupperf-upper);
+        return;
+      }
+
+      const BBox<T> blower1 = bounds(ilower+1);
+      const BBox<T> bupper0 = bounds(iupper-1);
+      BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf);
+      BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper);
+
+      for (int i = ilower+1; i < iupper; i++)
+      {
+        const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size();
+        const BBox<T> bt = lerp(b0, b1, f);
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero));
+        const T dupper = max(bi.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments)
+    {
+      /* normalize global time_range_in to local geom_time_range */
+      const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(),
+                              (time_range_in.upper-geom_time_range.lower)/geom_time_range.size());
+        
+      const float lower = time_range.lower*geom_time_segments;
+      const float upper = time_range.upper*geom_time_segments;
+      const float ilowerf = floor(lower);
+      const float iupperf = ceil(upper);
+      const float ilowerfc = max(0.0f,ilowerf);
+      const float iupperfc = min(iupperf,geom_time_segments);
+      const int   ilowerc = (int)ilowerfc;
+      const int   iupperc = (int)iupperfc;
+      assert(iupperc-ilowerc > 0);
+
+      /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */
+      const int ilower_iter = max(-1,(int)ilowerf);
+      const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1);
+        
+      const BBox<T> blower0 = bounds(ilowerc);
+      const BBox<T> bupper1 = bounds(iupperc);
+      if (iupper_iter-ilower_iter == 1) {
+        bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc));
+        bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper));
+        return;
+      }
+
+      const BBox<T> blower1 = bounds(ilowerc+1);
+      const BBox<T> bupper0 = bounds(iupperc-1);
+      BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc));
+      BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper));
+
+      for (int i = ilower_iter+1; i < iupper_iter; i++)
+      {
+        const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size();
+        const BBox<T> bt = lerp(b0, b1, f);
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero));
+        const T dupper = max(bi.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments)
+    {
+      const int ilower = time_range.begin();
+      const int iupper = time_range.end();
+
+      BBox<T> b0 = bounds(ilower);
+      BBox<T> b1 = bounds(iupper);
+
+      if (iupper-ilower == 1)
+      {
+        bounds0 = b0;
+        bounds1 = b1;
+        return;
+      }
+  
+      for (int i = ilower+1; i<iupper; i++)
+      {
+        const float f = float(i - time_range.begin()) / float(time_range.size());
+        const BBox<T> bt = lerp(b0, b1, f);
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero));
+        const T dupper = max(bi.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+  public:
+
+    __forceinline bool empty() const {
+      return bounds().empty();
+    }
+
+    __forceinline BBox<T> bounds () const {
+      return merge(bounds0,bounds1);
+    }
+
+    __forceinline BBox<T> interpolate( const float t ) const {
+      return lerp(bounds0,bounds1,t);
+    }
+
+    __forceinline LBBox<T> interpolate( const BBox1f& dt ) const {
+      return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper));
+    }
+
+    __forceinline void extend( const LBBox& other ) {
+      bounds0.extend(other.bounds0);
+      bounds1.extend(other.bounds1);
+    }
+
+    __forceinline float expectedHalfArea() const;
+
+    __forceinline float expectedHalfArea(const BBox1f& dt) const {
+      return interpolate(dt).expectedHalfArea();
+    }
+
+    __forceinline float expectedApproxHalfArea() const {
+      return 0.5f*(halfArea(bounds0) + halfArea(bounds1));
+    }
+
+    /* calculates bounds for [0,1] time range from bounds in dt time range */
+    __forceinline LBBox global(const BBox1f& dt) const 
+    {
+      const float rcp_dt_size = 1.0f/dt.size();
+      const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size);
+      const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size);
+      return LBBox(b0,b1);
+    }
+
+    /*! Comparison Operators */
+    //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+    //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+    friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+    friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+    
+    /*! output operator */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) {
+      return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }";
+    }
+
+  public:
+    BBox<T> bounds0, bounds1;
+  };
+
+  /*! tests if box is finite */
+  template<typename T>
+    __forceinline bool isvalid( const LBBox<T>& v ) {
+    return isvalid(v.bounds0) && isvalid(v.bounds1);
+  }
+
+  template<typename T>
+    __forceinline bool isvalid_non_empty( const LBBox<T>& v ) {
+    return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1);
+  }
+  
+  template<typename T>
+    __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1)
+  {
+    const T da = a1-a0;
+    const T db = b1-b0;
+    return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f);
+  }
+  
+  template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const 
+  {
+    const Vec3fa d0 = bounds0.size();
+    const Vec3fa d1 = bounds1.size();
+    return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z),
+                                   Vec3fa(d1.x,d1.y,d1.z),
+                                   Vec3fa(d0.y,d0.z,d0.x),
+                                   Vec3fa(d1.y,d1.z,d1.x)));
+  }
+
+  template<typename T>
+  __forceinline float expectedApproxHalfArea(const LBBox<T>& box) {
+    return box.expectedApproxHalfArea(); 
+  }
+
+  template<typename T>
+  __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) {
+    return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1));
+  }
+
+   /*! subset relation */
+  template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) {
+    return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1);
+  }
+
+  /*! default template instantiations */
+  typedef LBBox<float> LBBox1f;
+  typedef LBBox<Vec2f> LBBox2f;
+  typedef LBBox<Vec3f> LBBox3f;
+  typedef LBBox<Vec3fa> LBBox3fa;
+  typedef LBBox<Vec3fx> LBBox3fx;
+}
diff --git a/thirdparty/embree/common/math/linearspace2.h b/thirdparty/embree/common/math/linearspace2.h
new file mode 100644
index 0000000000..184ee695fb
--- /dev/null
+++ b/thirdparty/embree/common/math/linearspace2.h
@@ -0,0 +1,148 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 2D Linear Transform (2x2 Matrix)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct LinearSpace2
+  {
+    typedef T Vector;
+    typedef typename T::Scalar Scalar;
+
+    /*! default matrix constructor */
+    __forceinline LinearSpace2           ( ) {}
+    __forceinline LinearSpace2           ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; }
+    __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; }
+
+    template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {}
+
+    /*! matrix construction from column vectors */
+    __forceinline LinearSpace2(const Vector& vx, const Vector& vy)
+      : vx(vx), vy(vy) {}
+
+    /*! matrix construction from row mayor data */
+    __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, 
+                               const Scalar& m10, const Scalar& m11)
+      : vx(m00,m10), vy(m01,m11) {}
+
+    /*! compute the determinant of the matrix */
+    __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; }
+
+    /*! compute adjoint matrix */
+    __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); }
+
+    /*! compute inverse matrix */
+    __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); }
+
+    /*! compute transposed matrix */
+    __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); }
+
+    /*! returns first row of matrix */
+    __forceinline Vector row0() const { return Vector(vx.x,vy.x); }
+
+    /*! returns second row of matrix */
+    __forceinline Vector row1() const { return Vector(vx.y,vy.y); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {}
+    __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {}
+
+    /*! return matrix for scaling */
+    static __forceinline LinearSpace2 scale(const Vector& s) {
+      return LinearSpace2(s.x,   0,
+                          0  , s.y);
+    }
+
+    /*! return matrix for rotation */
+    static __forceinline LinearSpace2 rotate(const Scalar& r) {
+      Scalar s = sin(r), c = cos(r);
+      return LinearSpace2(c, -s,
+                          s,  c);
+    }
+
+    /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */
+    LinearSpace2 orthogonal() const 
+    {
+      LinearSpace2 m = *this;
+
+      // mirrored?
+      Scalar mirror(one);
+      if (m.det() < Scalar(zero)) {
+        m.vx = -m.vx;
+        mirror = -mirror;
+      }
+
+      // rotation
+      for (int i = 0; i < 99; i++) {
+        const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse());
+        const LinearSpace2 d = m_next - m;
+        m = m_next;
+        // norm^2 of difference small enough?
+        if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8)
+          break;
+      }
+
+      // rotation * mirror_x
+      return LinearSpace2(mirror*m.vx, m.vy);
+    }
+
+  public:
+
+    /*! the column vectors of the matrix */
+    Vector vx,vy;
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); }
+  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); }
+  template<typename T> __forceinline LinearSpace2<T> rcp       ( const LinearSpace2<T>& a ) { return a.inverse(); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); }
+  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); }
+
+  template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
+  template<typename T> __forceinline T               operator*(const LinearSpace2<T>& a, const T              & b) { return b.x*a.vx + b.y*a.vy; }
+  template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
+
+  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); }
+  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); }
+
+  template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a * b; }
+  template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; }
+  template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) {
+    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}";
+  }
+
+  /*! Shortcuts for common linear spaces. */
+  typedef LinearSpace2<Vec2f> LinearSpace2f;
+  typedef LinearSpace2<Vec2fa> LinearSpace2fa;
+}
diff --git a/thirdparty/embree/common/math/linearspace3.h b/thirdparty/embree/common/math/linearspace3.h
new file mode 100644
index 0000000000..9eaa2cc2bb
--- /dev/null
+++ b/thirdparty/embree/common/math/linearspace3.h
@@ -0,0 +1,213 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "quaternion.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 3D Linear Transform (3x3 Matrix)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct LinearSpace3
+  {
+    typedef T Vector;
+    typedef typename T::Scalar Scalar;
+
+    /*! default matrix constructor */
+    __forceinline LinearSpace3           ( ) {}
+    __forceinline LinearSpace3           ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; }
+    __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; }
+
+    template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {}
+
+    /*! matrix construction from column vectors */
+    __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz)
+      : vx(vx), vy(vy), vz(vz) {}
+
+    /*! construction from quaternion */
+    __forceinline LinearSpace3( const QuaternionT<Scalar>& q )
+      : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j))
+      , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i))
+      , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {}
+
+    /*! matrix construction from row mayor data */
+    __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02,
+                               const Scalar& m10, const Scalar& m11, const Scalar& m12,
+                               const Scalar& m20, const Scalar& m21, const Scalar& m22)
+      : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {}
+
+    /*! compute the determinant of the matrix */
+    __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); }
+
+    /*! compute adjoint matrix */
+    __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); }
+
+    /*! compute inverse matrix */
+    __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); }
+
+    /*! compute transposed matrix */
+    __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); }
+
+    /*! returns first row of matrix */
+    __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); }
+
+    /*! returns second row of matrix */
+    __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); }
+
+    /*! returns third row of matrix */
+    __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {}
+    __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {}
+
+    /*! return matrix for scaling */
+    static __forceinline LinearSpace3 scale(const Vector& s) {
+      return LinearSpace3(s.x,   0,   0,
+                          0  , s.y,   0,
+                          0  ,   0, s.z);
+    }
+
+    /*! return matrix for rotation around arbitrary axis */
+    static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) {
+      Vector u = normalize(_u);
+      Scalar s = sin(r), c = cos(r);
+      return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c,  u.x*u.y*(1-c)-u.z*s,    u.x*u.z*(1-c)+u.y*s,
+                          u.x*u.y*(1-c)+u.z*s,    u.y*u.y+(1-u.y*u.y)*c,  u.y*u.z*(1-c)-u.x*s,
+                          u.x*u.z*(1-c)-u.y*s,    u.y*u.z*(1-c)+u.x*s,    u.z*u.z+(1-u.z*u.z)*c);
+    }
+
+  public:
+
+    /*! the column vectors of the matrix */
+    Vector vx,vy,vz;
+  };
+
+  /*! compute transposed matrix */
+  template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { 
+    vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
+    return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); 
+  }
+
+  template<typename T>
+    __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { 
+    return xfm.transposed();
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); }
+  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); }
+  template<typename T> __forceinline LinearSpace3<T> rcp       ( const LinearSpace3<T>& a ) { return a.inverse(); }
+
+  /* constructs a coordinate frame form a normalized normal */
+  template<typename T> __forceinline LinearSpace3<T> frame(const T& N) 
+  {
+    const T dx0(0,N.z,-N.y);
+    const T dx1(-N.z,0,N.x);
+    const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1));
+    const T dy = normalize(cross(N,dx));
+    return LinearSpace3<T>(dx,dy,N);
+  }
+
+  /* constructs a coordinate frame from a normal and approximate x-direction */
+  template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi)
+  {
+    if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel
+    const T dx = normalize(cross(dxi,N));
+    const T dy = normalize(cross(N,dx));
+    return LinearSpace3<T>(dx,dy,N);
+  }
+  
+  /* clamps linear space to range -1 to +1 */
+  template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) {
+    return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)),
+                           clamp(space.vy,T(-1.0f),T(1.0f)),
+                           clamp(space.vz,T(-1.0f),T(1.0f)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); }
+  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); }
+
+  template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); }
+  template<typename T> __forceinline T               operator*(const LinearSpace3<T>& a, const T              & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); }
+  template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); }
+
+  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); }
+  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); }
+
+  template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; }
+  template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; }
+
+  template<typename T> __forceinline T       xfmPoint (const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); }
+  template<typename T> __forceinline T       xfmVector(const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); }
+  template<typename T> __forceinline T       xfmNormal(const LinearSpace3<T>& s, const T      & a) { return xfmVector(s.inverse().transposed(),a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; }
+  template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) {
+    return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz));
+  }
+
+  /*! blending */
+  template<typename T>
+    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t) 
+  {
+    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+                           lerp(l0.vy,l1.vy,t),
+                           lerp(l0.vz,l1.vz,t));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) {
+    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}";
+  }
+
+  /*! Shortcuts for common linear spaces. */
+  typedef LinearSpace3<Vec3f> LinearSpace3f;
+  typedef LinearSpace3<Vec3fa> LinearSpace3fa;
+  typedef LinearSpace3<Vec3fx> LinearSpace3fx;
+  typedef LinearSpace3<Vec3ff> LinearSpace3ff;
+
+  template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>;
+  typedef LinearSpace3<Vec3<vfloat<4>>>  LinearSpace3vf4;
+  typedef LinearSpace3<Vec3<vfloat<8>>>  LinearSpace3vf8;
+  typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16;
+
+  /*! blending */
+  template<typename T, typename S>
+    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0,
+                                       const LinearSpace3<T>& l1,
+                                       const S& t)
+  {
+    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+                           lerp(l0.vy,l1.vy,t),
+                           lerp(l0.vz,l1.vz,t));
+  }
+
+}
diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h
new file mode 100644
index 0000000000..4bc54c1a6a
--- /dev/null
+++ b/thirdparty/embree/common/math/math.h
@@ -0,0 +1,369 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "constants.h"
+#include <cmath>
+
+#if defined(__ARM_NEON)
+#include "../simd/arm/emulation.h"
+#else
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#include <immintrin.h>
+#endif
+
+#if defined(__WIN32__)
+#if defined(_MSC_VER) && (_MSC_VER <= 1700)
+namespace std
+{
+  __forceinline bool isinf ( const float x ) { return _finite(x) == 0; }
+  __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; }
+  __forceinline bool isfinite (const float x) { return _finite(x) != 0; }
+}
+#endif
+#endif
+
+namespace embree
+{
+  __forceinline bool isvalid ( const float& v ) {
+    return (v > -FLT_LARGE) & (v < +FLT_LARGE);
+  }
+
+  __forceinline int cast_f2i(float f) {
+    union { float f; int i; } v; v.f = f; return v.i;
+  }
+
+  __forceinline float cast_i2f(int i) {
+    union { float f; int i; } v; v.i = i; return v.f;
+  }
+
+  __forceinline int   toInt  (const float& a) { return int(a); }
+  __forceinline float toFloat(const int&   a) { return float(a); }
+
+#if defined(__WIN32__)
+  __forceinline bool finite ( const float x ) { return _finite(x) != 0; }
+#endif
+
+  __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; }
+  __forceinline float sqr  ( const float x ) { return x*x; }
+
+  __forceinline float rcp  ( const float x )
+  {
+    const __m128 a = _mm_set_ss(x);
+
+#if defined(__AVX512VL__)
+    const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a);
+#else
+    const __m128 r = _mm_rcp_ss(a);
+#endif
+
+#if defined(__AVX2__)
+    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
+#else
+    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+#endif
+  }
+
+  __forceinline float signmsk ( const float x ) {
+    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+  }
+  __forceinline float xorf( const float x, const float y ) {
+    return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+  }
+  __forceinline float andf( const float x, const unsigned y ) {
+    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+  }
+  __forceinline float rsqrt( const float x )
+  {
+    const __m128 a = _mm_set_ss(x);
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
+#else
+    __m128 r = _mm_rsqrt_ss(a);
+#endif
+    r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+#if defined(__ARM_NEON)
+    r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+#endif
+    return _mm_cvtss_f32(r);
+  }
+
+#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
+  __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
+  __forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
+  __forceinline int roundf(float f) { return (int)(f + 0.5f); }
+#else
+  __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); }
+  __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); }
+#endif
+
+  __forceinline float abs  ( const float x ) { return ::fabsf(x); }
+  __forceinline float acos ( const float x ) { return ::acosf (x); }
+  __forceinline float asin ( const float x ) { return ::asinf (x); }
+  __forceinline float atan ( const float x ) { return ::atanf (x); }
+  __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); }
+  __forceinline float cos  ( const float x ) { return ::cosf  (x); }
+  __forceinline float cosh ( const float x ) { return ::coshf (x); }
+  __forceinline float exp  ( const float x ) { return ::expf  (x); }
+  __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); }
+  __forceinline float log  ( const float x ) { return ::logf  (x); }
+  __forceinline float log10( const float x ) { return ::log10f(x); }
+  __forceinline float pow  ( const float x, const float y ) { return ::powf  (x, y); }
+  __forceinline float sin  ( const float x ) { return ::sinf  (x); }
+  __forceinline float sinh ( const float x ) { return ::sinhf (x); }
+  __forceinline float sqrt ( const float x ) { return ::sqrtf (x); }
+  __forceinline float tan  ( const float x ) { return ::tanf  (x); }
+  __forceinline float tanh ( const float x ) { return ::tanhf (x); }
+  __forceinline float floor( const float x ) { return ::floorf (x); }
+  __forceinline float ceil ( const float x ) { return ::ceilf (x); }
+  __forceinline float frac ( const float x ) { return x-floor(x); }
+
+  __forceinline double abs  ( const double x ) { return ::fabs(x); }
+  __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; }
+  __forceinline double acos ( const double x ) { return ::acos (x); }
+  __forceinline double asin ( const double x ) { return ::asin (x); }
+  __forceinline double atan ( const double x ) { return ::atan (x); }
+  __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }
+  __forceinline double cos  ( const double x ) { return ::cos  (x); }
+  __forceinline double cosh ( const double x ) { return ::cosh (x); }
+  __forceinline double exp  ( const double x ) { return ::exp  (x); }
+  __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }
+  __forceinline double log  ( const double x ) { return ::log  (x); }
+  __forceinline double log10( const double x ) { return ::log10(x); }
+  __forceinline double pow  ( const double x, const double y ) { return ::pow  (x, y); }
+  __forceinline double rcp  ( const double x ) { return 1.0/x; }
+  __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); }
+  __forceinline double sin  ( const double x ) { return ::sin  (x); }
+  __forceinline double sinh ( const double x ) { return ::sinh (x); }
+  __forceinline double sqr  ( const double x ) { return x*x; }
+  __forceinline double sqrt ( const double x ) { return ::sqrt (x); }
+  __forceinline double tan  ( const double x ) { return ::tan  (x); }
+  __forceinline double tanh ( const double x ) { return ::tanh (x); }
+  __forceinline double floor( const double x ) { return ::floor (x); }
+  __forceinline double ceil ( const double x ) { return ::ceil (x); }
+
+#if defined(__SSE4_1__)
+  __forceinline float mini(float a, float b) {
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_min_epi32(ai,bi);
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+#if defined(__SSE4_1__)
+  __forceinline float maxi(float a, float b) {
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_max_epi32(ai,bi);
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+  template<typename T>
+    __forceinline T twice(const T& a) { return a+a; }
+
+  __forceinline      int min(int      a, int      b) { return a<b ? a:b; }
+  __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; }
+  __forceinline  int64_t min(int64_t  a, int64_t  b) { return a<b ? a:b; }
+  __forceinline    float min(float    a, float    b) { return a<b ? a:b; }
+  __forceinline   double min(double   a, double   b) { return a<b ? a:b; }
+#if defined(__64BIT__)
+  __forceinline   size_t min(size_t   a, size_t   b) { return a<b ? a:b; }
+#endif
+
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }
+
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); }
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }
+
+  __forceinline      int max(int      a, int      b) { return a<b ? b:a; }
+  __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; }
+  __forceinline  int64_t max(int64_t  a, int64_t  b) { return a<b ? b:a; }
+  __forceinline    float max(float    a, float    b) { return a<b ? b:a; }
+  __forceinline   double max(double   a, double   b) { return a<b ? b:a; }
+#if defined(__64BIT__)
+  __forceinline   size_t max(size_t   a, size_t   b) { return a<b ? b:a; }
+#endif
+
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }
+
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); }
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }
+
+#if defined(__MACOSX__)
+  __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; }
+  __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; }
+#endif
+
+#if defined(__MACOSX__) && !defined(__INTEL_COMPILER)
+  __forceinline void sincosf(float x, float *sin, float *cos) {
+    __sincosf(x,sin,cos);
+  }
+#endif
+
+#if defined(__WIN32__) || defined(__FreeBSD__)
+  __forceinline void sincosf(float x, float *s, float *c) {
+    *s = sinf(x); *c = cosf(x);
+  }
+#endif
+
+  template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); }
+  template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); }
+
+  template<typename T> __forceinline T  deg2rad ( const T& x )  { return x * T(1.74532925199432957692e-2f); }
+  template<typename T> __forceinline T  rad2deg ( const T& x )  { return x * T(5.72957795130823208768e1f); }
+  template<typename T> __forceinline T  sin2cos ( const T& x )  { return sqrt(max(T(zero),T(one)-x*x)); }
+  template<typename T> __forceinline T  cos2sin ( const T& x )  { return sin2cos(x); }
+
+#if defined(__AVX2__)
+  __forceinline float madd  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  __forceinline float msub  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+  __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+#else
+  __forceinline float madd  ( const float a, const float b, const float c) { return a*b+c; }
+  __forceinline float msub  ( const float a, const float b, const float c) { return a*b-c; }
+  __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;}
+  __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; }
+#endif
+
+  /*! random functions */
+  template<typename T> T random() { return T(0); }
+#if defined(_WIN32)
+  template<> __forceinline int      random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); }
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); }
+#else
+  template<> __forceinline int      random() { return int(rand()); }
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); }
+#endif
+  template<> __forceinline float  random() { return rand()/float(RAND_MAX); }
+  template<> __forceinline double random() { return rand()/double(RAND_MAX); }
+
+#if _WIN32
+  __forceinline double drand48() {
+    return double(rand())/double(RAND_MAX);
+  }
+
+  __forceinline void srand48(long seed) {
+    return srand(seed);
+  }
+#endif
+
+  /*! selects */
+  __forceinline bool  select(bool s, bool  t , bool f) { return s ? t : f; }
+  __forceinline int   select(bool s, int   t,   int f) { return s ? t : f; }
+  __forceinline float select(bool s, float t, float f) { return s ? t : f; }
+
+  __forceinline bool all(bool s) { return s; }
+
+  __forceinline float lerp(const float v0, const float v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  template<typename T>
+    __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) {
+    return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3)));
+  }
+
+  /*! exchange */
+  template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }
+
+  /*  load/store */
+  template<typename Ty> struct mem;
+ 
+  template<> struct mem<float> {
+    static __forceinline float load (bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; }
+    static __forceinline float loadu(bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; }
+  
+    static __forceinline void store (bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; }
+    static __forceinline void storeu(bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; }
+  };
+  
+  /*! bit reverse operation */
+  template<class T>
+    __forceinline T bitReverse(const T& vin)
+  {
+    T v = vin;
+    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+    v = ( v >> 16             ) | ( v               << 16);
+    return v;
+  }
+
+  /*! bit interleave operation */
+  template<class T>
+    __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
+  {
+	T x = xin, y = yin, z = zin;
+    x = (x | (x << 16)) & 0x030000FF;
+    x = (x | (x <<  8)) & 0x0300F00F;
+    x = (x | (x <<  4)) & 0x030C30C3;
+    x = (x | (x <<  2)) & 0x09249249;
+
+    y = (y | (y << 16)) & 0x030000FF;
+    y = (y | (y <<  8)) & 0x0300F00F;
+    y = (y | (y <<  4)) & 0x030C30C3;
+    y = (y | (y <<  2)) & 0x09249249;
+
+    z = (z | (z << 16)) & 0x030000FF;
+    z = (z | (z <<  8)) & 0x0300F00F;
+    z = (z | (z <<  4)) & 0x030C30C3;
+    z = (z | (z <<  2)) & 0x09249249;
+
+    return x | (y << 1) | (z << 2);
+  }
+
+#if defined(__AVX2__)
+
+  template<>
+    __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
+  {
+    const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ );
+    const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */);
+    const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */);
+    return xx | yy | zz;
+  }
+
+#endif
+
+  /*! bit interleave operation for 64bit data types*/
+  template<class T>
+    __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){
+    T x = xin & 0x1fffff;
+    T y = yin & 0x1fffff;
+    T z = zin & 0x1fffff;
+
+    x = (x | x << 32) & 0x1f00000000ffff;
+    x = (x | x << 16) & 0x1f0000ff0000ff;
+    x = (x | x << 8) & 0x100f00f00f00f00f;
+    x = (x | x << 4) & 0x10c30c30c30c30c3;
+    x = (x | x << 2) & 0x1249249249249249;
+
+    y = (y | y << 32) & 0x1f00000000ffff;
+    y = (y | y << 16) & 0x1f0000ff0000ff;
+    y = (y | y << 8) & 0x100f00f00f00f00f;
+    y = (y | y << 4) & 0x10c30c30c30c30c3;
+    y = (y | y << 2) & 0x1249249249249249;
+
+    z = (z | z << 32) & 0x1f00000000ffff;
+    z = (z | z << 16) & 0x1f0000ff0000ff;
+    z = (z | z << 8) & 0x100f00f00f00f00f;
+    z = (z | z << 4) & 0x10c30c30c30c30c3;
+    z = (z | z << 2) & 0x1249249249249249;
+
+    return x | (y << 1) | (z << 2);
+  }
+}
diff --git a/thirdparty/embree/common/math/obbox.h b/thirdparty/embree/common/math/obbox.h
new file mode 100644
index 0000000000..2fe8bbf071
--- /dev/null
+++ b/thirdparty/embree/common/math/obbox.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "linearspace3.h"
+
+namespace embree
+{
+  /*! Oriented bounding box */
+  template<typename T>
+    struct OBBox 
+  {
+  public:
+    
+    __forceinline OBBox () {}
+    
+    __forceinline OBBox (EmptyTy) 
+      : space(one), bounds(empty) {}
+    
+    __forceinline OBBox (const BBox<T>& bounds) 
+      : space(one), bounds(bounds) {}
+      
+    __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds) 
+      : space(space), bounds(bounds) {}
+    
+    friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) {
+      return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}";
+    }
+    
+  public:
+    LinearSpace3<T> space; //!< orthonormal transformation
+    BBox<T> bounds;        //!< bounds in transformed space
+  };
+
+  typedef OBBox<Vec3f> OBBox3f;
+  typedef OBBox<Vec3fa> OBBox3fa;
+}
diff --git a/thirdparty/embree/common/math/quaternion.h b/thirdparty/embree/common/math/quaternion.h
new file mode 100644
index 0000000000..080800efcd
--- /dev/null
+++ b/thirdparty/embree/common/math/quaternion.h
@@ -0,0 +1,254 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "vec4.h"
+
+#include "transcendental.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////
+  // Quaternion Struct
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T>
+  struct QuaternionT
+  {
+    typedef Vec3<T> Vector;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline QuaternionT           ()                     { }
+    __forceinline QuaternionT           ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; }
+    __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; }
+
+    __forceinline          QuaternionT( const T& r       ) : r(r), i(zero), j(zero), k(zero) {}
+    __forceinline explicit QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {}
+    __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {}
+    __forceinline          QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {}
+    __forceinline          QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {}
+
+    __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz );
+    __inline QuaternionT( const T& yaw, const T& pitch, const T& roll );
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {}
+    __forceinline QuaternionT( OneTy  ) : r( one), i(zero), j(zero), k(zero) {}
+
+    /*! return quaternion for rotation around arbitrary axis */
+    static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) {
+      return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u));
+    }
+
+    /*! returns the rotation axis of the quaternion as a vector */
+    __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); }
+
+  public:
+    T r, i, j, k;
+  };
+
+  template<typename T> __forceinline QuaternionT<T> operator *( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); }
+
+  ////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); }
+  template<typename T> __forceinline QuaternionT<T> conj      ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); }
+  template<typename T> __forceinline T              abs       ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
+  template<typename T> __forceinline QuaternionT<T> rcp       ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
+  template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
+
+  // evaluates a*q-r
+  template<typename T> __forceinline QuaternionT<T>
+  msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(msub(a, q.r, p.r),
+                          msub(a, q.i, p.i),
+                          msub(a, q.j, p.j),
+                          msub(a, q.k, p.k));
+  }
+  // evaluates a*q-r
+  template<typename T> __forceinline QuaternionT<T>
+  madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(madd(a, q.r, p.r),
+                          madd(a, q.i, p.i),
+                          madd(a, q.j, p.j),
+                          madd(a, q.k, p.k));
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline QuaternionT<T> operator +( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r,  b.i,  b.j,  b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); }
+
+  template<typename T> __forceinline Vec3<T>       operator *( const QuaternionT<T>& a, const Vec3<T>      & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) {
+    return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k,
+                          a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j,
+                          a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i,
+                          a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r);
+  }
+  template<typename T> __forceinline QuaternionT<T> operator /( const T             & a, const QuaternionT<T>& b ) { return a*rcp(b); }
+  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T             & b ) { return a*rcp(b); }
+  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a*rcp(b); }
+
+  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T             & b ) { return a = a+b; }
+  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; }
+  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T             & b ) { return a = a-b; }
+  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; }
+  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T             & b ) { return a = a*b; }
+  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; }
+  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T             & b ) { return a = a*rcp(b); }
+  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); }
+
+  template<typename T, typename M> __forceinline QuaternionT<T>
+  select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(select(m, q.r, p.r),
+                          select(m, q.i, p.i),
+                          select(m, q.j, p.j),
+                          select(m, q.k, p.k));
+  }
+
+
+  template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+  template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+  template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+
+  template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; }
+  template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Orientation Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz )
+  {
+    if ( vx.x + vy.y + vz.z >= T(zero) )
+    {
+      const T t = T(one) + (vx.x + vy.y + vz.z);
+      const T s = rsqrt(t)*T(0.5f);
+      r = t*s;
+      i = (vy.z - vz.y)*s;
+      j = (vz.x - vx.z)*s;
+      k = (vx.y - vy.x)*s;
+    }
+    else if ( vx.x >= max(vy.y, vz.z) )
+    {
+      const T t = (T(one) + vx.x) - (vy.y + vz.z);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vy.z - vz.y)*s;
+      i = t*s;
+      j = (vx.y + vy.x)*s;
+      k = (vz.x + vx.z)*s;
+    }
+    else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) )
+    {
+      const T t = (T(one) + vy.y) - (vz.z + vx.x);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vz.x - vx.z)*s;
+      i = (vx.y + vy.x)*s;
+      j = t*s;
+      k = (vy.z + vz.y)*s;
+    }
+    else //if ( vz.z >= max(vy.y, vx.x) )
+    {
+      const T t = (T(one) + vz.z) - (vx.x + vy.y);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vx.y - vy.x)*s;
+      i = (vz.x + vx.z)*s;
+      j = (vy.z + vz.y)*s;
+      k = t*s;
+    }
+  }
+
+  template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll )
+  {
+    const T cya = cos(yaw  *T(0.5f));
+    const T cpi = cos(pitch*T(0.5f));
+    const T cro = cos(roll *T(0.5f));
+    const T sya = sin(yaw  *T(0.5f));
+    const T spi = sin(pitch*T(0.5f));
+    const T sro = sin(roll *T(0.5f));
+    r = cro*cya*cpi + sro*sya*spi;
+    i = cro*cya*spi + sro*sya*cpi;
+    j = cro*sya*cpi - sro*cya*spi;
+    k = sro*cya*cpi - cro*sya*spi;
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  //////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) {
+    return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }";
+  }
+
+  /*! default template instantiations */
+  typedef QuaternionT<float>  Quaternion3f;
+  typedef QuaternionT<double> Quaternion3d;
+
+  template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>;
+  typedef QuaternionT<vfloat<4>>  Quaternion3vf4;
+  typedef QuaternionT<vfloat<8>>  Quaternion3vf8;
+  typedef QuaternionT<vfloat<16>> Quaternion3vf16;
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Interpolation
+  //////////////////////////////////////////////////////////////////////////////
+  template<typename T>
+  __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0,
+                                   const QuaternionT<T>& q1,
+                                   const T& factor)
+  {
+    QuaternionT<T> q;
+    q.r = lerp(q0.r, q1.r, factor);
+    q.i = lerp(q0.i, q1.i, factor);
+    q.j = lerp(q0.j, q1.j, factor);
+    q.k = lerp(q0.k, q1.k, factor);
+    return q;
+  }
+
+  template<typename T>
+  __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0,
+                                     const QuaternionT<T>& q1_,
+                                     const T& t)
+  {
+    T cosTheta = dot(q0, q1_);
+    QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_);
+    cosTheta          = select(cosTheta < 0.f, -cosTheta, cosTheta);
+    if (unlikely(all(cosTheta > 0.9995f))) {
+      return normalize(lerp(q0, q1, t));
+    }
+    const T phi = t * fastapprox::acos(cosTheta);
+    T sinPhi, cosPhi;
+    fastapprox::sincos(phi, sinPhi, cosPhi);
+    QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1));
+    return msub(cosPhi, q0, qperp);
+  }
+}
diff --git a/thirdparty/embree/common/math/range.h b/thirdparty/embree/common/math/range.h
new file mode 100644
index 0000000000..909fadb995
--- /dev/null
+++ b/thirdparty/embree/common/math/range.h
@@ -0,0 +1,137 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../math/math.h"
+
+namespace embree
+{
+  template<typename Ty>
+    struct range 
+    {
+      __forceinline range() {}
+
+      __forceinline range(const Ty& begin)
+        : _begin(begin), _end(begin+1) {}
+      
+      __forceinline range(const Ty& begin, const Ty& end)
+        : _begin(begin), _end(end) {}
+ 
+      __forceinline range(const range& other)
+        : _begin(other._begin), _end(other._end) {}
+
+      template<typename T1>
+      __forceinline range(const range<T1>& other)
+        : _begin(Ty(other._begin)), _end(Ty(other._end)) {}
+
+      template<typename T1>
+      __forceinline range& operator =(const range<T1>& other) {
+        _begin = other._begin;
+        _end = other._end;
+        return *this;
+      }
+      
+      __forceinline Ty begin() const {
+        return _begin;
+      }
+      
+      __forceinline Ty end() const {
+	return _end;
+      }
+
+      __forceinline range intersect(const range& r) const {
+        return range (max(_begin,r._begin),min(_end,r._end));
+      }
+
+      __forceinline Ty size() const {
+        return _end - _begin;
+      }
+
+      __forceinline bool empty() const { 
+        return _end <= _begin; 
+      }
+
+      __forceinline Ty center() const {
+        return (_begin + _end)/2;
+      }
+
+      __forceinline std::pair<range,range> split() const 
+      {
+        const Ty _center = center();
+        return std::make_pair(range(_begin,_center),range(_center,_end));
+      }
+
+      __forceinline void split(range& left_o, range& right_o) const 
+      {
+        const Ty _center = center();
+        left_o = range(_begin,_center);
+        right_o = range(_center,_end);
+      }
+
+      __forceinline friend bool operator< (const range& r0, const range& r1) {
+        return r0.size() < r1.size();
+      }
+	
+      friend embree_ostream operator<<(embree_ostream cout, const range& r) {
+        return cout << "range [" << r.begin() << ", " << r.end() << "]";
+      }
+      
+      Ty _begin, _end;
+    };
+
+  template<typename Ty>
+    range<Ty> make_range(const Ty& begin, const Ty& end) {
+    return range<Ty>(begin,end);
+  }
+
+  template<typename Ty>
+    struct extended_range : public range<Ty>
+    {
+      __forceinline extended_range () {}
+
+      __forceinline extended_range (const Ty& begin)
+        : range<Ty>(begin), _ext_end(begin+1) {}
+      
+      __forceinline extended_range (const Ty& begin, const Ty& end)
+        : range<Ty>(begin,end), _ext_end(end) {}
+
+      __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end)
+        : range<Ty>(begin,end), _ext_end(ext_end) {}
+      
+      __forceinline Ty ext_end() const {
+	return _ext_end;
+      }
+
+      __forceinline Ty ext_size() const {
+        return _ext_end - range<Ty>::_begin;
+      }
+
+      __forceinline Ty ext_range_size() const {
+        return _ext_end - range<Ty>::_end;
+      }
+
+      __forceinline bool has_ext_range() const {
+        assert(_ext_end >= range<Ty>::_end);
+        return (_ext_end - range<Ty>::_end) > 0;
+      }
+
+      __forceinline void set_ext_range(const size_t ext_end){
+        assert(ext_end >= range<Ty>::_end);
+        _ext_end = ext_end;
+      }
+
+      __forceinline void move_right(const size_t plus){
+        range<Ty>::_begin   += plus;
+        range<Ty>::_end     += plus;
+        _ext_end += plus;
+      }
+
+      friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) {
+        return cout << "extended_range [" << r.begin() << ", " << r.end() <<  " (" << r.ext_end() << ")]";
+      }
+      
+      Ty _ext_end;
+    };
+}
diff --git a/thirdparty/embree/common/math/transcendental.h b/thirdparty/embree/common/math/transcendental.h
new file mode 100644
index 0000000000..fd16c26e81
--- /dev/null
+++ b/thirdparty/embree/common/math/transcendental.h
@@ -0,0 +1,525 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// Transcendental functions from "ispc": https://github.com/ispc/ispc/
+// Most of the transcendental implementations in ispc code come from
+// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
+
+#include "../simd/simd.h"
+
+namespace embree
+{
+
+namespace fastapprox
+{
+
+template <typename T>
+__forceinline T sin(const T &v)
+{
+  static const float piOverTwoVec = 1.57079637050628662109375;
+  static const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+  auto kMod4 = k & 3;
+  auto sinUseCos = (kMod4 == 1 | kMod4 == 3);
+  auto flipSign = (kMod4 > 1);
+
+  // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
+  // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
+  static const float sinC2  = -0.16666667163372039794921875;
+  static const float sinC4  = +8.333347737789154052734375e-3;
+  static const float sinC6  = -1.9842604524455964565277099609375e-4;
+  static const float sinC8  = +2.760012648650445044040679931640625e-6;
+  static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  static const float cosC2  = -0.5;
+  static const float cosC4  = +4.166664183139801025390625e-2;
+  static const float cosC6  = -1.388833043165504932403564453125e-3;
+  static const float cosC8  = +2.47562347794882953166961669921875e-5;
+  static const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto outside = select(sinUseCos, 1., x);
+  auto c2  = select(sinUseCos, T(cosC2),  T(sinC2));
+  auto c4  = select(sinUseCos, T(cosC4),  T(sinC4));
+  auto c6  = select(sinUseCos, T(cosC6),  T(sinC6));
+  auto c8  = select(sinUseCos, T(cosC8),  T(sinC8));
+  auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
+
+  auto x2 = x * x;
+  auto formula = x2 * c10 + c8;
+  formula = x2 * formula + c6;
+  formula = x2 * formula + c4;
+  formula = x2 * formula + c2;
+  formula = x2 * formula + 1.;
+  formula *= outside;
+
+  formula = select(flipSign, -formula, formula);
+  return formula;
+}
+
+template <typename T>
+__forceinline T cos(const T &v)
+{
+  static const float piOverTwoVec = 1.57079637050628662109375;
+  static const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+
+  auto kMod4 = k & 3;
+  auto cosUseCos = (kMod4 == 0 | kMod4 == 2);
+  auto flipSign = (kMod4 == 1 | kMod4 == 2);
+
+  const float sinC2  = -0.16666667163372039794921875;
+  const float sinC4  = +8.333347737789154052734375e-3;
+  const float sinC6  = -1.9842604524455964565277099609375e-4;
+  const float sinC8  = +2.760012648650445044040679931640625e-6;
+  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  const float cosC2  = -0.5;
+  const float cosC4  = +4.166664183139801025390625e-2;
+  const float cosC6  = -1.388833043165504932403564453125e-3;
+  const float cosC8  = +2.47562347794882953166961669921875e-5;
+  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto outside = select(cosUseCos, 1., x);
+  auto c2  = select(cosUseCos, T(cosC2),  T(sinC2));
+  auto c4  = select(cosUseCos, T(cosC4),  T(sinC4));
+  auto c6  = select(cosUseCos, T(cosC6),  T(sinC6));
+  auto c8  = select(cosUseCos, T(cosC8),  T(sinC8));
+  auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
+
+  auto x2 = x * x;
+  auto formula = x2 * c10 + c8;
+  formula = x2 * formula + c6;
+  formula = x2 * formula + c4;
+  formula = x2 * formula + c2;
+  formula = x2 * formula + 1.;
+  formula *= outside;
+
+  formula = select(flipSign, -formula, formula);
+  return formula;
+}
+
+template <typename T>
+__forceinline void sincos(const T &v, T &sinResult, T &cosResult)
+{
+  const float piOverTwoVec = 1.57079637050628662109375;
+  const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+  auto kMod4 = k & 3;
+  auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));
+  auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
+  auto sinFlipSign = (kMod4 > 1);
+  auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));
+
+  const float oneVec = +1.;
+  const float sinC2  = -0.16666667163372039794921875;
+  const float sinC4  = +8.333347737789154052734375e-3;
+  const float sinC6  = -1.9842604524455964565277099609375e-4;
+  const float sinC8  = +2.760012648650445044040679931640625e-6;
+  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  const float cosC2  = -0.5;
+  const float cosC4  = +4.166664183139801025390625e-2;
+  const float cosC6  = -1.388833043165504932403564453125e-3;
+  const float cosC8  = +2.47562347794882953166961669921875e-5;
+  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto x2 = x * x;
+
+  auto sinFormula = x2 * sinC10 + sinC8;
+  auto cosFormula = x2 * cosC10 + cosC8;
+  sinFormula = x2 * sinFormula + sinC6;
+  cosFormula = x2 * cosFormula + cosC6;
+
+  sinFormula = x2 * sinFormula + sinC4;
+  cosFormula = x2 * cosFormula + cosC4;
+
+  sinFormula = x2 * sinFormula + sinC2;
+  cosFormula = x2 * cosFormula + cosC2;
+
+  sinFormula = x2 * sinFormula + oneVec;
+  cosFormula = x2 * cosFormula + oneVec;
+
+  sinFormula *= x;
+
+  sinResult = select(sinUseCos, cosFormula, sinFormula);
+  cosResult = select(cosUseCos, cosFormula, sinFormula);
+
+  sinResult = select(sinFlipSign, -sinResult, sinResult);
+  cosResult = select(cosFlipSign, -cosResult, cosResult);
+}
+
+template <typename T>
+__forceinline T tan(const T &v)
+{
+  const float piOverFourVec = 0.785398185253143310546875;
+  const float fourOverPiVec = 1.27323949337005615234375;
+
+  auto xLt0 = v < 0.;
+  auto y = select(xLt0, -v, v);
+  auto scaled = y * fourOverPiVec;
+
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  auto x = y - kReal * piOverFourVec;
+
+  // If k & 1, x -= Pi/4
+  auto needOffset = (k & 1) != 0;
+  x = select(needOffset, x - piOverFourVec, x);
+
+  // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
+  auto kMod4 = k & 3;
+  auto useCotan = (kMod4 == 1) | (kMod4 == 2);
+
+  const float oneVec = 1.0;
+
+  const float tanC2  = +0.33333075046539306640625;
+  const float tanC4  = +0.13339905440807342529296875;
+  const float tanC6  = +5.3348250687122344970703125e-2;
+  const float tanC8  = +2.46033705770969390869140625e-2;
+  const float tanC10 = +2.892402000725269317626953125e-3;
+  const float tanC12 = +9.500005282461643218994140625e-3;
+
+  const float cotC2  = -0.3333333432674407958984375;
+  const float cotC4  = -2.222204394638538360595703125e-2;
+  const float cotC6  = -2.11752182804048061370849609375e-3;
+  const float cotC8  = -2.0846328698098659515380859375e-4;
+  const float cotC10 = -2.548247357481159269809722900390625e-5;
+  const float cotC12 = -3.5257363606433500535786151885986328125e-7;
+
+  auto x2 = x * x;
+  T z;
+  if (any(useCotan))
+  {
+    auto cotVal = x2 * cotC12 + cotC10;
+    cotVal = x2 * cotVal + cotC8;
+    cotVal = x2 * cotVal + cotC6;
+    cotVal = x2 * cotVal + cotC4;
+    cotVal = x2 * cotVal + cotC2;
+    cotVal = x2 * cotVal + oneVec;
+    // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
+    cotVal /= -x;
+    z = cotVal;
+  }
+  auto useTan = !useCotan;
+  if (any(useTan))
+  {
+    auto tanVal = x2 * tanC12 + tanC10;
+    tanVal = x2 * tanVal + tanC8;
+    tanVal = x2 * tanVal + tanC6;
+    tanVal = x2 * tanVal + tanC4;
+    tanVal = x2 * tanVal + tanC2;
+    tanVal = x2 * tanVal + oneVec;
+    // Equation was for tan(x)/x
+    tanVal *= x;
+    z = select(useTan, tanVal, z);
+  }
+  return select(xLt0, -z, z);
+}
+
+template <typename T>
+__forceinline T asin(const T &x0)
+{
+  auto isneg = (x0 < 0.f);
+  auto x = abs(x0);
+  auto isnan = (x > 1.f);
+
+  // sollya
+  // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
+  //           [1e-20;.9999999999999999]);
+  // avg error: 1.1105439e-06, max error 1.3187528e-06
+  auto v = 1.57079517841339111328125f +
+           x * (-0.21450997889041900634765625f +
+                x * (8.78556668758392333984375e-2f +
+                     x * (-4.489909112453460693359375e-2f +
+                          x * (1.928029954433441162109375e-2f +
+                               x * (-4.3095736764371395111083984375e-3f)))));
+
+  v *= -sqrt(1.f - x);
+  v = v + 1.57079637050628662109375f;
+
+  v = select(v < 0.f, T(0.f), v);
+  v = select(isneg, -v, v);
+  v = select(isnan, T(cast_i2f(0x7fc00000)), v);
+
+  return v;
+}
+
+template <typename T>
+__forceinline T acos(const T &v)
+{
+  return 1.57079637050628662109375f - asin(v);
+}
+
+template <typename T>
+__forceinline T atan(const T &v)
+{
+  const float piOverTwoVec = 1.57079637050628662109375;
+  // atan(-x) = -atan(x) (so flip from negative to positive first)
+  // If x > 1 -> atan(x) = Pi/2 - atan(1/x)
+  auto xNeg = v < 0.f;
+  auto xFlipped = select(xNeg, -v, v);
+
+  auto xGt1 = xFlipped > 1.;
+  auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);
+
+  // These coefficients approximate atan(x)/x
+  const float atanC0  = +0.99999988079071044921875;
+  const float atanC2  = -0.3333191573619842529296875;
+  const float atanC4  = +0.199689209461212158203125;
+  const float atanC6  = -0.14015688002109527587890625;
+  const float atanC8  = +9.905083477497100830078125e-2;
+  const float atanC10 = -5.93664981424808502197265625e-2;
+  const float atanC12 = +2.417283318936824798583984375e-2;
+  const float atanC14 = -4.6721356920897960662841796875e-3;
+
+  auto x2 = x * x;
+  auto result = x2 * atanC14 + atanC12;
+  result = x2 * result + atanC10;
+  result = x2 * result + atanC8;
+  result = x2 * result + atanC6;
+  result = x2 * result + atanC4;
+  result = x2 * result + atanC2;
+  result = x2 * result + atanC0;
+  result *= x;
+
+  result = select(xGt1, piOverTwoVec - result, result);
+  result = select(xNeg, -result, result);
+  return result;
+}
+
+template <typename T>
+__forceinline T atan2(const T &y, const T &x)
+{
+  const float piVec = 3.1415926536;
+  // atan2(y, x) =
+  //
+  // atan2(y > 0, x = +-0) ->  Pi/2
+  // atan2(y < 0, x = +-0) -> -Pi/2
+  // atan2(y = +-0, x < +0) -> +-Pi
+  // atan2(y = +-0, x >= +0) -> +-0
+  //
+  // atan2(y >= 0, x < 0) ->  Pi + atan(y/x)
+  // atan2(y <  0, x < 0) -> -Pi + atan(y/x)
+  // atan2(y, x > 0) -> atan(y/x)
+  //
+  // and then a bunch of code for dealing with infinities.
+  auto yOverX = y * rcpSafe(x);
+  auto atanArg = atan(yOverX);
+  auto xLt0 = x < 0.f;
+  auto yLt0 = y < 0.f;
+  auto offset = select(xLt0,
+                select(yLt0, T(-piVec), T(piVec)), 0.f);
+  return offset + atanArg;
+}
+
+template <typename T>
+__forceinline T exp(const T &v)
+{
+  const float ln2Part1 = 0.6931457519;
+  const float ln2Part2 = 1.4286067653e-6;
+  const float oneOverLn2 = 1.44269502162933349609375;
+
+  auto scaled = v * oneOverLn2;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * ln2Part1;
+  x -= kReal * ln2Part2;
+
+  // These coefficients are for e^x in [0, ln(2)]
+  const float one = 1.;
+  const float c2 = 0.4999999105930328369140625;
+  const float c3 = 0.166668415069580078125;
+  const float c4 = 4.16539050638675689697265625e-2;
+  const float c5 = 8.378830738365650177001953125e-3;
+  const float c6 = 1.304379315115511417388916015625e-3;
+  const float c7 = 2.7555381529964506626129150390625e-4;
+
+  auto result = x * c7 + c6;
+  result = x * result + c5;
+  result = x * result + c4;
+  result = x * result + c3;
+  result = x * result + c2;
+  result = x * result + one;
+  result = x * result + one;
+
+  // Compute 2^k (should differ for float and double, but I'll avoid
+  // it for now and just do floats)
+  const int fpbias = 127;
+  auto biasedN = k + fpbias;
+  auto overflow = kReal > fpbias;
+  // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
+  // we've got underflow. -127 * ln(2) -> -88.02. So the most
+  // negative float input that doesn't result in zero is like -88.
+  auto underflow = kReal <= -fpbias;
+  const int infBits = 0x7f800000;
+  biasedN <<= 23;
+  // Reinterpret this thing as float
+  auto twoToTheN = asFloat(biasedN);
+  // Handle both doubles and floats (hopefully eliding the copy for float)
+  auto elemtype2n = twoToTheN;
+  result *= elemtype2n;
+  result = select(overflow, cast_i2f(infBits), result);
+  result = select(underflow, 0., result);
+  return result;
+}
+
+// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
+// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).
+template <typename T, typename R>
+__forceinline void __rangeReduceLog(const T &input,
+                                    T &reduced,
+                                    R &exponent)
+{
+  auto intVersion = asInt(input);
+  // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
+  // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000
+  //                    0x7  0xF  0x8  0x0  0x0  0x0  0x0  0x0
+  // non-exponent     = 1000 0000 0111 1111 1111 1111 1111 1111
+  //                  = 0x8  0x0  0x7  0xF  0xF  0xF  0xF  0xF
+
+  //const int exponentMask(0x7F800000)
+  static const int nonexponentMask = 0x807FFFFF;
+
+  // We want the reduced version to have an exponent of -1 which is
+  // -1 + 127 after biasing or 126
+  static const int exponentNeg1 = (126l << 23);
+  // NOTE(boulos): We don't need to mask anything out since we know
+  // the sign bit has to be 0. If it's 1, we need to return infinity/nan
+  // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
+  auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
+
+  auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
+  exponent = offsetExponent - 127;          // get the real value
+
+  // Blend the offset_exponent with the original input (do this in
+  // int for now, until I decide if float can have & and &not)
+  auto blended = (intVersion & nonexponentMask) | (exponentNeg1);
+  reduced = asFloat(blended);
+}
+
+template <typename T> struct ExponentType            { };
+template <int N>      struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; };
+template <>           struct ExponentType<float>     { typedef int     Ty; };
+
+template <typename T>
+__forceinline T log(const T &v)
+{
+  T reduced;
+  typename ExponentType<T>::Ty exponent;
+
+  const int nanBits = 0x7fc00000;
+  const int negInfBits = 0xFF800000;
+  const float nan = cast_i2f(nanBits);
+  const float negInf = cast_i2f(negInfBits);
+  auto useNan = v < 0.;
+  auto useInf = v == 0.;
+  auto exceptional = useNan | useInf;
+  const float one = 1.0;
+
+  auto patched = select(exceptional, one, v);
+  __rangeReduceLog(patched, reduced, exponent);
+
+  const float ln2 = 0.693147182464599609375;
+
+  auto x1 = one - reduced;
+  const float c1 = +0.50000095367431640625;
+  const float c2 = +0.33326041698455810546875;
+  const float c3 = +0.2519190013408660888671875;
+  const float c4 = +0.17541764676570892333984375;
+  const float c5 = +0.3424419462680816650390625;
+  const float c6 = -0.599632322788238525390625;
+  const float c7 = +1.98442304134368896484375;
+  const float c8 = -2.4899270534515380859375;
+  const float c9 = +1.7491014003753662109375;
+
+  auto result = x1 * c9 + c8;
+  result = x1 * result + c7;
+  result = x1 * result + c6;
+  result = x1 * result + c5;
+  result = x1 * result + c4;
+  result = x1 * result + c3;
+  result = x1 * result + c2;
+  result = x1 * result + c1;
+  result = x1 * result + one;
+
+  // Equation was for -(ln(red)/(1-red))
+  result *= -x1;
+  result += toFloat(exponent) * ln2;
+
+  return select(exceptional,
+                select(useNan, T(nan), T(negInf)),
+                result);
+}
+
+template <typename T>
+__forceinline T pow(const T &x, const T &y)
+{
+  auto x1 = abs(x);
+  auto z = exp(y * log(x1));
+
+  // Handle special cases
+  const float twoOver23 = 8388608.0f;
+  auto yInt = y == round(y);
+  auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit
+
+  // x == 0
+  z = select(x == 0.0f,
+      select(y < 0.0f, T(inf) | signmsk(x),
+      select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);
+
+  // x < 0
+  auto xNegative = x < 0.0f;
+  if (any(xNegative))
+  {
+    auto z1 = z | asFloat(yOddInt);
+    z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());
+    z = select(xNegative, z1, z);
+  }
+
+  auto xFinite = isfinite(x);
+  auto yFinite = isfinite(y);
+  if (all(xFinite & yFinite))
+    return z;
+
+  // x finite and y infinite
+  z = select(andn(xFinite, yFinite),
+      select(x1 == 1.0f, 1.0f,
+      select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);
+
+  // x infinite
+  z = select(xFinite, z,
+      select(y == 0.0f, 1.0f,
+      select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));
+
+  return z;
+}
+
+template <typename T>
+__forceinline T pow(const T &x, float y)
+{
+  return pow(x, T(y));
+}
+
+} // namespace fastapprox
+
+} // namespace embree
diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h
new file mode 100644
index 0000000000..d62aef51f3
--- /dev/null
+++ b/thirdparty/embree/common/math/vec2.h
@@ -0,0 +1,235 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  struct Vec2fa;
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 2D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec2
+  {
+    enum { N = 2 };
+    union {
+      struct { T x, y; };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N];
+#endif
+    };
+
+    typedef T Scalar;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2( ) {}
+    __forceinline explicit Vec2( const T& a             ) : x(a), y(a) {}
+    __forceinline          Vec2( const T& x, const T& y ) : x(x), y(y) {}
+
+    __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; }
+    __forceinline Vec2( const Vec2fa& other );
+
+    template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {}
+    template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2( ZeroTy   ) : x(zero), y(zero) {}
+    __forceinline Vec2( OneTy    ) : x(one),  y(one) {}
+    __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {}
+    __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {}
+
+#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; }
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 2); return (&x)[axis]; }
+#else
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; }
+	__forceinline       T& operator [](const size_t axis )      { assert(axis < 2); return components[axis]; }
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a ) { return Vec2<T>(+a.x, +a.y); }
+  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a ) { return Vec2<T>(-a.x, -a.y); }
+  template<typename T> __forceinline Vec2<T> abs       ( const Vec2<T>& a ) { return Vec2<T>(abs  (a.x), abs  (a.y)); }
+  template<typename T> __forceinline Vec2<T> rcp       ( const Vec2<T>& a ) { return Vec2<T>(rcp  (a.x), rcp  (a.y)); }
+  template<typename T> __forceinline Vec2<T> rsqrt     ( const Vec2<T>& a ) { return Vec2<T>(rsqrt(a.x), rsqrt(a.y)); }
+  template<typename T> __forceinline Vec2<T> sqrt      ( const Vec2<T>& a ) { return Vec2<T>(sqrt (a.x), sqrt (a.y)); }
+  template<typename T> __forceinline Vec2<T> frac      ( const Vec2<T>& a ) { return Vec2<T>(frac (a.x), frac (a.y)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x + b.x, a.y + b.y); }
+  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x + b  , a.y + b  ); }
+  template<typename T> __forceinline Vec2<T> operator +( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   + b.x, a   + b.y); }
+  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x - b.x, a.y - b.y); }
+  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x - b  , a.y - b  ); }
+  template<typename T> __forceinline Vec2<T> operator -( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   - b.x, a   - b.y); }
+  template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x * b.x, a.y * b.y); }
+  template<typename T> __forceinline Vec2<T> operator *( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   * b.x, a   * b.y); }
+  template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x * b  , a.y * b  ); }
+  template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x / b.x, a.y / b.y); }
+  template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x / b  , a.y / b  ); }
+  template<typename T> __forceinline Vec2<T> operator /( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   / b.x, a   / b.y); }
+
+  template<typename T> __forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(min(a.x, b.x), min(a.y, b.y)); }
+  template<typename T> __forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(max(a.x, b.x), max(a.y, b.y)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> madd  ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> msub  ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmadd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmsub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); }
+
+  template<typename T> __forceinline Vec2<T> madd  ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a,b.x,c.x), madd(a,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> msub  ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a,b.x,c.x), msub(a,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; }
+  template<typename T> __forceinline Vec2<T>& operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; }
+  template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; return a; }
+  template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; }
+  template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; }
+  template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); }
+  template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; }
+  template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { return a.x != b.x || a.y != b.y; }
+  template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    return false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) {
+    return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y));
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidian Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T       dot      ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); }
+  template<typename T> __forceinline Vec2<T> cross    ( const Vec2<T>& a )                   { return Vec2<T>(-a.y,a.x); } 
+  template<typename T> __forceinline T       length   ( const Vec2<T>& a )                   { return sqrt(dot(a,a)); }
+  template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a )                   { return a*rsqrt(dot(a,a)); }
+  template<typename T> __forceinline T       distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); }
+  template<typename T> __forceinline T       det      ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; }
+
+  template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) {
+    const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) );
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) {
+    return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
+  }
+
+  template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const Vec2<T>& f ) {
+    return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y));
+  }
+
+  template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) {
+    return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
+  }
+
+  template<typename T>
+    __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) {
+    return madd(Vec2<T>(T(1.0f)-t),v0,t*v1);
+  }
+
+  template<typename T> __forceinline int maxDim ( const Vec2<T>& a )
+  {
+    const Vec2<T> b = abs(a);
+    if (b.x > b.y) return 0;
+    else return 1;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) {
+    return cout << "(" << a.x << ", " << a.y << ")";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Default template instantiations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef Vec2<bool > Vec2b;
+  typedef Vec2<int  > Vec2i;
+  typedef Vec2<float> Vec2f;
+}
+
+#include "vec2fa.h"
+
+#if defined __SSE__
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+
+#if defined(__SSE__)
+  template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+
+#if defined(__AVX__)
+  template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+
+#if defined(__AVX512F__)
+  template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+}
diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h
new file mode 100644
index 0000000000..a51fb68fd0
--- /dev/null
+++ b/thirdparty/embree/common/math/vec2fa.h
@@ -0,0 +1,301 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec2fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec2fa
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 2 };
+    union {
+      __m128 m128;
+      struct { float x,y,az,aw; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ) {}
+    __forceinline Vec2fa( const __m128 a ) : m128(a) {}
+
+    __forceinline Vec2fa            ( const Vec2<float>& other  ) { x = other.x; y = other.y; }
+    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline Vec2fa            ( const Vec2fa& other ) { m128 = other.m128; }
+    __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
+    __forceinline          Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}
+
+    __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec2fa load( const void* const a ) {
+      return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
+    }
+
+    static __forceinline Vec2fa loadu( const void* const a ) {
+      return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
+    }
+
+    static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
+      _mm_storeu_ps((float*)ptr,v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec2fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
+  __forceinline Vec2fa operator -( const Vec2fa& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return _mm_xor_ps(a.m128, mask);
+  }
+  __forceinline Vec2fa abs  ( const Vec2fa& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+    return _mm_and_ps(a.m128, mask);
+  }
+  __forceinline Vec2fa sign ( const Vec2fa& a ) {
+    return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
+  }
+
+  __forceinline Vec2fa rcp  ( const Vec2fa& a )
+  {
+#if defined(__AVX512VL__)
+    const Vec2fa r = _mm_rcp14_ps(a.m128);
+#else
+    const Vec2fa r = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+    const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
+#else
+    const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return res;
+  }
+
+  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
+  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }
+
+  __forceinline Vec2fa rsqrt( const Vec2fa& a )
+  {
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+  }
+
+  __forceinline Vec2fa zero_fix(const Vec2fa& a) {
+    return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+  }
+  __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec2fa log ( const Vec2fa& a ) {
+    return Vec2fa(logf(a.x),logf(a.y));
+  }
+
+  __forceinline Vec2fa exp ( const Vec2fa& a ) {
+    return Vec2fa(expf(a.x),expf(a.y));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
+  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+  __forceinline Vec2fa operator /( const        float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+
+  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
+  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+#if defined(__SSE4_1__)
+    __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+    __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
+      return Vec2fa(powf(a.x,b),powf(a.y,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
+#else
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
+#endif
+
+  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
+  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
+  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
+  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
+  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; }
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
+  __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
+  __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
+  __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
+  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidian Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
+    return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
+  }
+#else
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
+    return reduce_add(a*b);
+  }
+#endif
+
+  __forceinline Vec2fa cross ( const Vec2fa& a ) {
+    return Vec2fa(-a.y,a.x);
+  }
+
+  __forceinline float  sqr_length ( const Vec2fa& a )                { return dot(a,a); }
+  __forceinline float  rcp_length ( const Vec2fa& a )                { return rsqrt(dot(a,a)); }
+  __forceinline float  rcp_length2( const Vec2fa& a )                { return rcp(dot(a,a)); }
+  __forceinline float  length   ( const Vec2fa& a )                  { return sqrt(dot(a,a)); }
+  __forceinline Vec2fa normalize( const Vec2fa& a )                  { return a*rsqrt(dot(a,a)); }
+  __forceinline float  distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
+    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
+    return blendv_ps(f, t, mask);
+  }
+
+  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec2fa& a )
+  {
+    const Vec2fa b = abs(a);
+    if (b.x > b.y) return 0;
+    else return 1;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+  //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
+  __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
+  __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
+#elif defined (__SSE4_1__)
+  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+#else
+  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
+    return cout << "(" << a.x << ", " << a.y << ")";
+  }
+
+  typedef Vec2fa Vec2fa_t;
+}
diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h
new file mode 100644
index 0000000000..ce94eff327
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3.h
@@ -0,0 +1,337 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  struct Vec3fa;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 3D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec3
+  {
+    enum { N  = 3 };
+
+    union {
+      struct {
+	T x, y, z;
+      };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N];
+#endif
+    };
+
+    typedef T Scalar;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3( ) {}
+    __forceinline explicit Vec3( const T& a                         ) : x(a), y(a), z(a) {}
+    __forceinline          Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {}
+
+    __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; }
+    __forceinline Vec3( const Vec3fa& other );
+
+    template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {}
+    template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; }
+	
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3( ZeroTy   ) : x(zero), y(zero), z(zero) {}
+    __forceinline Vec3( OneTy    ) : x(one),  y(one),  z(one) {}
+    __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {}
+    __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {}
+
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+    __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; }
+    __forceinline       T& operator []( const size_t axis )       { assert(axis < 3); return (&x)[axis]; }
+#else
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; }
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 3); return components[axis]; }
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); }
+  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); }
+  template<typename T> __forceinline Vec3<T> abs       ( const Vec3<T>& a ) { return Vec3<T>(abs  (a.x), abs  (a.y), abs  (a.z)); }
+  template<typename T> __forceinline Vec3<T> rcp       ( const Vec3<T>& a ) { return Vec3<T>(rcp  (a.x), rcp  (a.y), rcp  (a.z)); }
+  template<typename T> __forceinline Vec3<T> rsqrt     ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); }
+  template<typename T> __forceinline Vec3<T> sqrt      ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); }
+
+  template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a )
+  {
+    return Vec3<T>(select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x),
+                   select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y),
+                   select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z));
+  }
+  template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a) { return rcp(zero_fix(a)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); }
+  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); }
+  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); }
+  template<typename T> __forceinline Vec3<T> operator *( const       T& a, const Vec3<T>& b ) { return Vec3<T>(a   * b.x, a   * b.y, a   * b.z); }
+  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const       T& b ) { return Vec3<T>(a.x * b  , a.y * b  , a.z * b  ); }
+  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const       T& b ) { return Vec3<T>(a.x / b  , a.y / b  , a.z / b  ); }
+  template<typename T> __forceinline Vec3<T> operator /( const       T& a, const Vec3<T>& b ) { return Vec3<T>(a   / b.x, a   / b.y, a   / b.z); }
+  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z); }
+
+  template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); }
+  template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); }
+
+  template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); }
+  template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> madd  ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> msub  ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));}
+  template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); }
+
+  template<typename T> __forceinline Vec3<T> madd  ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> msub  ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));}
+  template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T        b ) { a.x += b;   a.y += b;   a.z += b;   return a; }
+  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+  template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+  template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; a.z *= b  ; return a; }
+  template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; a.z /= b  ; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; }
+  template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; }
+  template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) { return min(a.x, a.y, a.z); }
+  template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; }
+  template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; }
+  template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    return false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) {
+    return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) {
+    return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
+  }
+
+  template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) {
+    return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z));
+  }
+
+  template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) {
+    return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
+  }
+
+  template<typename T>
+    __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) {
+    return madd(Vec3<T>(T(1.0f)-t),v0,t*v1);
+  }
+
+  template<typename T> __forceinline int maxDim ( const Vec3<T>& a )
+  {
+    const Vec3<T> b = abs(a);
+    if (b.x > b.y) {
+      if (b.x > b.z) return 0; else return 2;
+    } else {
+      if (b.y > b.z) return 1; else return 2;
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); }
+  template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); }
+  template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); }
+  template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); }
+  template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); }
+  template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidian Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T       sqr      ( const Vec3<T>& a )                   { return dot(a,a); }
+  template<typename T> __forceinline T       dot      ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); }
+  template<typename T> __forceinline T       length   ( const Vec3<T>& a )                   { return sqrt(sqr(a)); }
+  template<typename T> __forceinline T       rcp_length( const Vec3<T>& a )                  { return rsqrt(sqr(a)); }
+  template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a )                   { return a*rsqrt(sqr(a)); }
+  template<typename T> __forceinline T       distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
+  template<typename T> __forceinline Vec3<T> cross    ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); }
+
+  template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
+  {
+    const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
+    const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x;
+    const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z));
+    const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z));
+    const auto sx = abs(ab_x) < abs(bc_x);
+    const auto sy = abs(ab_y) < abs(bc_y);
+    const auto sz = abs(ab_z) < abs(bc_z);
+    return Vec3<T>(select(sx,cross_ab.x,cross_bc.x),
+                   select(sy,cross_ab.y,cross_bc.y),
+                   select(sz,cross_ab.z,cross_bc.z));
+  }
+
+  template<typename T> __forceinline T       sum      ( const Vec3<T>& a )                   { return a.x+a.y+a.z; }
+
+  template<typename T> __forceinline      T  halfArea ( const Vec3<T>& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  template<typename T> __forceinline      T  area     ( const Vec3<T>& d )                  { return 2.0f*halfArea(d); }
+
+  template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) {
+    const T d = dot(a,a); return select(d == T( zero ), a ,  a*rsqrt(d) );
+  }
+
+  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1)
+  {
+    const Vec3<T> N = cross(P-Q0,Q1-Q0);
+    const Vec3<T> D = Q1-Q0;
+    return dot(N,N)*rcp(dot(D,D));
+  }
+
+  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0)
+  {
+    const Vec3<T> N = cross(PmQ0,Q1mQ0);
+    const Vec3<T> D = Q1mQ0;
+    return dot(N,N)*rcp(dot(D,D));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+
+  typedef Vec3<bool > Vec3b;
+  typedef Vec3<int  > Vec3i;
+  typedef Vec3<float> Vec3f;
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined __SSE__
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<typename Out, typename In>
+  __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k) {
+    return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k]));
+  }
+
+  template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; }
+
+#if defined(__AVX__)
+  template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#elif defined(__SSE__)
+  template<>
+  __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
+  }
+#endif
+
+#if defined(__SSE__)
+  template<>
+  __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+    return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) {
+    return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
+  }
+#endif
+
+#if defined(__AVX__)
+  template<>
+  __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+
+  template<>
+  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+  }
+  template<>
+  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) {
+    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) {
+    return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
+  }
+#endif
+
+#if defined(__AVX512F__)
+  template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {}
+#endif
+}
diff --git a/thirdparty/embree/common/math/vec3ba.h b/thirdparty/embree/common/math/vec3ba.h
new file mode 100644
index 0000000000..a021b522dc
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3ba.h
@@ -0,0 +1,120 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3ba Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3ba
+  {
+    ALIGNED_STRUCT_(16);
+    
+    union {
+      __m128 m128;
+      struct { int x,y,z; };
+    };
+
+    typedef int Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ba( ) {}
+    __forceinline Vec3ba( const __m128  input ) : m128(input) {}
+    __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {}
+    __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3ba( bool a )
+      : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+    __forceinline Vec3ba( bool a, bool b, bool c)
+      : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec3ba( TrueTy  ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); }
+  __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); }
+  __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; }
+  __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; }
+  __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { 
+    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; 
+  }
+  __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { 
+    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; 
+  }
+  __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    return false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; }
+  __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; }
+
+  __forceinline bool all       ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; }
+  __forceinline bool any       ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; }
+  __forceinline bool none      ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; }
+
+  __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) {
+    return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")";
+  }
+}
diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h
new file mode 100644
index 0000000000..586039741d
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3fa.h
@@ -0,0 +1,727 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fa
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 3 };
+    union {
+      __m128 m128;
+      struct { float x,y,z; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fa( ) {}
+    __forceinline Vec3fa( const __m128 a ) : m128(a) {}
+
+    __forceinline Vec3fa            ( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); }
+    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+    __forceinline Vec3fa            ( const Vec3fa& other ) { m128 = other.m128; }
+    __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
+    __forceinline          Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
+
+    __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
+    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
+    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+    
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec3fa load( const void* const a ) {
+      return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+    }
+
+    static __forceinline Vec3fa loadu( const void* const a ) {
+      return Vec3fa(_mm_loadu_ps((float*)a));
+    }
+
+    static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
+      _mm_storeu_ps((float*)ptr,v.m128);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec3fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
+  __forceinline Vec3fa operator -( const Vec3fa& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return _mm_xor_ps(a.m128, mask);
+  }
+  __forceinline Vec3fa abs  ( const Vec3fa& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+    return _mm_and_ps(a.m128, mask);
+  }
+  __forceinline Vec3fa sign ( const Vec3fa& a ) {
+    return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
+  }
+
+  __forceinline Vec3fa rcp  ( const Vec3fa& a )
+  {
+#if defined(__AVX512VL__)
+    const Vec3fa r = _mm_rcp14_ps(a.m128);
+#else
+    const Vec3fa r = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+    const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
+#else
+    const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return res;
+  }
+
+  __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
+  __forceinline Vec3fa sqr  ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); }
+
+  __forceinline Vec3fa rsqrt( const Vec3fa& a )
+  {
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+  }
+
+  __forceinline Vec3fa zero_fix(const Vec3fa& a) {
+    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+  }
+  __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec3fa log ( const Vec3fa& a ) {
+    return Vec3fa(logf(a.x),logf(a.y),logf(a.z));
+  }
+
+  __forceinline Vec3fa exp ( const Vec3fa& a ) {
+    return Vec3fa(expf(a.x),expf(a.y),expf(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }
+  __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
+  __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+  __forceinline Vec3fa operator /( const        float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+
+  __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
+  __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+#if defined(__SSE4_1__)
+    __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 ai = _mm_castps_si128(a.m128);
+      const vint4 bi = _mm_castps_si128(b.m128);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 ai = _mm_castps_si128(a.m128);
+      const vint4 bi = _mm_castps_si128(b.m128);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+    __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
+      return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
+#else
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
+#endif
+
+  __forceinline Vec3fa madd  ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
+  __forceinline Vec3fa msub  ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
+  __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
+  __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }
+  __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const float   b ) { return a = a * b; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const float   b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec3fa& v) { 
+    const vfloat4 a(v.m128);
+    const vfloat4 b = shuffle<1>(a);
+    const vfloat4 c = shuffle<2>(a);
+    return _mm_cvtss_f32(a+b+c); 
+  }
+
+  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
+  __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
+  __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
+  __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
+
+  __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
+  __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
+  __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
+  __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
+  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
+  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+
+  __forceinline bool isvalid ( const Vec3fa& v ) {
+    return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite ( const Vec3fa& a ) {
+    return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
+  }
+
+  __forceinline bool isvalid4 ( const Vec3fa& v ) {
+    return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite4 ( const Vec3fa& a ) {
+    return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidian Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
+  }
+#else
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+    return reduce_add(a*b);
+  }
+#endif
+
+  __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
+  {
+    vfloat4 a0 = vfloat4(a.m128);
+    vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
+    vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
+    vfloat4 b1 = vfloat4(b.m128);
+    return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
+  }
+
+  __forceinline float  sqr_length ( const Vec3fa& a )                { return dot(a,a); }
+  __forceinline float  rcp_length ( const Vec3fa& a )                { return rsqrt(dot(a,a)); }
+  __forceinline float  rcp_length2( const Vec3fa& a )                { return rcp(dot(a,a)); }
+  __forceinline float  length   ( const Vec3fa& a )                  { return sqrt(dot(a,a)); }
+  __forceinline Vec3fa normalize( const Vec3fa& a )                  { return a*rsqrt(dot(a,a)); }
+  __forceinline float  distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
+  __forceinline float  halfArea ( const Vec3fa& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  __forceinline float  area     ( const Vec3fa& d )                  { return 2.0f*halfArea(d); }
+
+  __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
+    const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+  }
+
+  /*! differentiated normalization */
+  __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
+  {
+    const float pp  = dot(p,p);
+    const float pdp = dot(p,dp);
+    return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
+    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
+    return blendv_ps(f.m128, t.m128, mask);
+  }
+
+  __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
+    return blendv_ps(f.m128, t.m128, s);
+  }
+
+  __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec3fa& a )
+  {
+    const Vec3fa b = abs(a);
+    if (b.x > b.y) {
+      if (b.x > b.z) return 0; else return 2;
+    } else {
+      if (b.y > b.z) return 1; else return 2;
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__SSE4_1__)
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); }
+#else
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+
+  typedef Vec3fa Vec3fa_t;
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3fx Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fx
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 3 };
+    union {
+      __m128 m128;
+      struct { float x,y,z; union { int a; unsigned u; float w; }; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fx( ) {}
+    __forceinline Vec3fx( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
+    __forceinline operator Vec3fa () const { return Vec3fa(m128); }
+        
+    __forceinline explicit Vec3fx            ( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); }
+    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+    __forceinline Vec3fx            ( const Vec3fx& other ) { m128 = other.m128; }
+
+    __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}
+    __forceinline          Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
+
+    __forceinline Vec3fx( const Vec3fa& other, const int      a1) { m128 = other.m128; a = a1; }
+    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
+    __forceinline Vec3fx( const Vec3fa& other, const float    w1) {      
+#if defined (__SSE4_1__)
+      m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
+#else
+      const vint4 mask(-1,-1,-1,0);
+      m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));
+#endif
+    }
+    //__forceinline Vec3fx( const float x, const float y, const float z, const int      a) : x(x), y(y), z(z), a(a) {} // not working properly!
+    //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
+    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}
+    
+    //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
+    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
+    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+    
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec3fx load( const void* const a ) {
+      return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+    }
+
+    static __forceinline Vec3fx loadu( const void* const a ) {
+      return Vec3fx(_mm_loadu_ps((float*)a));
+    }
+
+    static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
+      _mm_storeu_ps((float*)ptr,v.m128);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fx( ZeroTy   ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec3fx( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
+  __forceinline Vec3fx operator -( const Vec3fx& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return _mm_xor_ps(a.m128, mask);
+  }
+  __forceinline Vec3fx abs  ( const Vec3fx& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+    return _mm_and_ps(a.m128, mask);
+  }
+  __forceinline Vec3fx sign ( const Vec3fx& a ) {
+    return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));
+  }
+
+  __forceinline Vec3fx rcp  ( const Vec3fx& a )
+  {
+#if defined(__AVX512VL__)
+    const Vec3fx r = _mm_rcp14_ps(a.m128);
+#else
+    const Vec3fx r = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+    const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
+#else
+    const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return res;
+  }
+
+  __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); }
+  __forceinline Vec3fx sqr  ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); }
+
+  __forceinline Vec3fx rsqrt( const Vec3fx& a )
+  {
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+  }
+
+  __forceinline Vec3fx zero_fix(const Vec3fx& a) {
+    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+  }
+  __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec3fx log ( const Vec3fx& a ) {
+    return Vec3fx(logf(a.x),logf(a.y),logf(a.z));
+  }
+
+  __forceinline Vec3fx exp ( const Vec3fx& a ) {
+    return Vec3fx(expf(a.x),expf(a.y),expf(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); }
+  __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
+  __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+  __forceinline Vec3fx operator /( const        float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+
+  __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
+  __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+#if defined(__SSE4_1__)
+    __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 ai = _mm_castps_si128(a.m128);
+      const vint4 bi = _mm_castps_si128(b.m128);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 ai = _mm_castps_si128(a.m128);
+      const vint4 bi = _mm_castps_si128(b.m128);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+    __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
+      return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
+#else
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; }
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }
+#endif
+
+  __forceinline Vec3fx madd  ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }
+  __forceinline Vec3fx msub  ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); }
+  __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }
+  __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; }
+  __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const float   b ) { return a = a * b; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const float   b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec3fx& v) { 
+    const vfloat4 a(v.m128);
+    const vfloat4 b = shuffle<1>(a);
+    const vfloat4 c = shuffle<2>(a);
+    return _mm_cvtss_f32(a+b+c); 
+  }
+
+  __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
+  __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); }
+  __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
+  __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
+
+  __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
+  __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
+  __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
+  __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); }
+  __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
+  __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+
+  __forceinline bool isvalid ( const Vec3fx& v ) {
+    return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite ( const Vec3fx& a ) {
+    return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX)));
+  }
+
+  __forceinline bool isvalid4 ( const Vec3fx& v ) {
+    return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite4 ( const Vec3fx& a ) {
+    return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidian Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
+  }
+#else
+  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+    return reduce_add(a*b);
+  }
+#endif
+
+  __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
+  {
+    vfloat4 a0 = vfloat4(a.m128);
+    vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
+    vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
+    vfloat4 b1 = vfloat4(b.m128);
+    return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
+  }
+
+  __forceinline float  sqr_length ( const Vec3fx& a )                { return dot(a,a); }
+  __forceinline float  rcp_length ( const Vec3fx& a )                { return rsqrt(dot(a,a)); }
+  __forceinline float  rcp_length2( const Vec3fx& a )                { return rcp(dot(a,a)); }
+  __forceinline float  length   ( const Vec3fx& a )                  { return sqrt(dot(a,a)); }
+  __forceinline Vec3fx normalize( const Vec3fx& a )                  { return a*rsqrt(dot(a,a)); }
+  __forceinline float  distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); }
+  __forceinline float  halfArea ( const Vec3fx& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  __forceinline float  area     ( const Vec3fx& d )                  { return 2.0f*halfArea(d); }
+
+  __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
+    const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+  }
+
+  /*! differentiated normalization */
+  __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
+  {
+    const float pp  = dot(p,p);
+    const float pdp = dot(p,dp);
+    return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
+    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
+    return blendv_ps(f.m128, t.m128, mask);
+  }
+
+  __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
+    return blendv_ps(f.m128, t.m128, s);
+  }
+
+  __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec3fx& a )
+  {
+    const Vec3fx b = abs(a);
+    if (b.x > b.y) {
+      if (b.x > b.z) return 0; else return 2;
+    } else {
+      if (b.y > b.z) return 1; else return 2;
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+  __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); }
+  __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); }
+  __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); }
+#elif defined (__SSE4_1__)
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); }
+#else
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); }
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+
+  
+  typedef Vec3fx Vec3ff;
+}
diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h
new file mode 100644
index 0000000000..694804c40d
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3ia.h
@@ -0,0 +1,186 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3ia Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3ia
+  {
+    ALIGNED_STRUCT_(16);
+
+    union {
+      __m128i m128;
+      struct { int x,y,z; };
+    };
+
+    typedef int Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ) {}
+    __forceinline Vec3ia( const __m128i a ) : m128(a) {}
+    __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {}
+    __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {}
+    __forceinline          Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {}
+    __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {}
+
+    __forceinline operator const __m128i&() const { return m128; }
+    __forceinline operator       __m128i&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ZeroTy   ) : m128(_mm_setzero_si128()) {}
+    __forceinline Vec3ia( OneTy    ) : m128(_mm_set1_epi32(1)) {}
+    __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {}
+    __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; }
+  __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); }
+#if defined(__SSSE3__)
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); }
+  __forceinline Vec3ia operator +( const Vec3ia& a, const int     b ) { return a+Vec3ia(b); }
+  __forceinline Vec3ia operator +( const int     a, const Vec3ia& b ) { return Vec3ia(a)+b; }
+
+  __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); }
+  __forceinline Vec3ia operator -( const Vec3ia& a, const int     b ) { return a-Vec3ia(b); }
+  __forceinline Vec3ia operator -( const int     a, const Vec3ia& b ) { return Vec3ia(a)-b; }
+
+#if defined(__SSE4_1__)
+  __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); }
+  __forceinline Vec3ia operator *( const Vec3ia& a, const int     b ) { return a * Vec3ia(b); }
+  __forceinline Vec3ia operator *( const int     a, const Vec3ia& b ) { return Vec3ia(a) * b; }
+#endif
+
+  __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); }
+  __forceinline Vec3ia operator &( const Vec3ia& a, const int     b ) { return a & Vec3ia(b); }
+  __forceinline Vec3ia operator &( const int     a, const Vec3ia& b ) { return Vec3ia(a) & b; }
+
+  __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); }
+  __forceinline Vec3ia operator |( const Vec3ia& a, const int     b ) { return a | Vec3ia(b); }
+  __forceinline Vec3ia operator |( const int     a, const Vec3ia& b ) { return Vec3ia(a) | b; }
+
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); }
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const int     b ) { return a ^ Vec3ia(b); }
+  __forceinline Vec3ia operator ^( const int     a, const Vec3ia& b ) { return Vec3ia(a) ^ b; }
+
+  __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); }
+  __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); }
+
+  __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); }
+  __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); }
+  __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; }
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const int&   b ) { return a = a + b; }
+  
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; }
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const int&   b ) { return a = a - b; }
+  
+#if defined(__SSE4_1__)
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; }
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const int&    b ) { return a = a * b; }
+#endif
+  
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; }
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const int&    b ) { return a = a & b; }
+  
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; }
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const int&    b ) { return a = a | b; }
+  
+  __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; }
+  __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
+  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+  __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
+  __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; }
+  __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; }
+  __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    return false;
+  }
+
+  __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); }
+  __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); }
+  __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
+#if defined(__SSE4_1__)
+    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
+#else
+    return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); 
+#endif
+  }
+
+#if defined(__SSE4_1__)
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); }
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); }
+#else
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); }
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+}
diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h
new file mode 100644
index 0000000000..0ed107928a
--- /dev/null
+++ b/thirdparty/embree/common/math/vec4.h
@@ -0,0 +1,243 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+#include "vec3.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 4D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec4
+  {
+    enum { N = 4 };    
+    union {
+      struct { T x, y, z, w; };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N];
+#endif
+    };
+
+    typedef T Scalar;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec4( ) {}
+    __forceinline explicit Vec4( const T& a                                     ) : x(a), y(a), z(a), w(a) {}
+    __forceinline          Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {}
+    __forceinline          Vec4( const Vec3<T>& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
+
+    __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; }
+    __forceinline Vec4( const Vec3fx& other );
+
+    template<typename T1> __forceinline Vec4( const Vec4<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {}
+    template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; }
+
+    __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; }
+
+    __forceinline operator Vec3<T> () const { return Vec3<T>(x,y,z); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec4( ZeroTy   ) : x(zero), y(zero), z(zero), w(zero) {}
+    __forceinline Vec4( OneTy    ) : x(one),  y(one),  z(one),  w(one) {}
+    __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {}
+    __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {}
+
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; }
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 4); return (&x)[axis]; }
+#else
+	__forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; }
+	__forceinline       T& operator [](const size_t axis)        { assert(axis < 4); return components[axis]; }
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Swizzles
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3<T> xyz() const { return Vec3<T>(x, y, z); }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) { return Vec4<T>(+a.x, +a.y, +a.z, +a.w); }
+  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a ) { return Vec4<T>(-a.x, -a.y, -a.z, -a.w); }
+  template<typename T> __forceinline Vec4<T> abs       ( const Vec4<T>& a ) { return Vec4<T>(abs  (a.x), abs  (a.y), abs  (a.z), abs  (a.w)); }
+  template<typename T> __forceinline Vec4<T> rcp       ( const Vec4<T>& a ) { return Vec4<T>(rcp  (a.x), rcp  (a.y), rcp  (a.z), rcp  (a.w)); }
+  template<typename T> __forceinline Vec4<T> rsqrt     ( const Vec4<T>& a ) { return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); }
+  template<typename T> __forceinline Vec4<T> sqrt      ( const Vec4<T>& a ) { return Vec4<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+  template<typename T> __forceinline Vec4<T> operator *( const       T& a, const Vec4<T>& b ) { return Vec4<T>(a   * b.x, a   * b.y, a   * b.z, a   * b.w); }
+  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const       T& b ) { return Vec4<T>(a.x * b  , a.y * b  , a.z * b  , a.w * b  ); }
+  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
+  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const       T& b ) { return Vec4<T>(a.x / b  , a.y / b  , a.z / b  , a.w / b  ); }
+  template<typename T> __forceinline Vec4<T> operator /( const       T& a, const Vec4<T>& b ) { return Vec4<T>(a   / b.x, a   / b.y, a   / b.z, a   / b.w); }
+
+  template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); }
+  template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> madd  ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> msub  ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmadd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmsub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); }
+
+  template<typename T> __forceinline Vec4<T> madd  ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> msub  ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmadd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmsub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+  template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+  template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; a.z *= b  ; a.w *= b  ; return a; }
+  template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; a.z /= b  ; a.w /= b  ; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) { return a.x + a.y + a.z + a.w; }
+  template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) { return a.x * a.y * a.z * a.w; }
+  template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) { return min(a.x, a.y, a.z, a.w); }
+  template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) { return max(a.x, a.y, a.z, a.w); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; }
+  template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; }
+  template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    if (a.w != b.w) return a.w < b.w;
+    return false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) {
+    return Vec4<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidian Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T       dot      ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); }
+
+  template<typename T> __forceinline T       length   ( const Vec4<T>& a )                   { return sqrt(dot(a,a)); }
+  template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a )                   { return a*rsqrt(dot(a,a)); }
+  template<typename T> __forceinline T       distance ( const Vec4<T>& a, const Vec4<T>& b ) { return length(a-b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) {
+    return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
+  }
+
+  template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) {
+    return Vec4<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w));
+  }
+
+  template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, const Vec4<T>& t, const Vec4<T>& f ) {
+    return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
+  }
+
+  template<typename T>
+    __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t) {
+    return madd(Vec4<T>(T(1.0f)-t),v0,t*v1);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Default template instantiations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef Vec4<bool         > Vec4b;
+  typedef Vec4<unsigned char> Vec4uc;
+  typedef Vec4<int          > Vec4i;
+  typedef Vec4<float        > Vec4f;
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined __SSE__
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined __AVX512F__
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; }
+
+#if defined(__AVX__)
+  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#elif defined(__SSE__)
+  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
+  }
+#endif
+
+#if defined(__AVX__)
+  template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#endif
+
+#if defined(__AVX512F__)
+  template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {}
+#endif
+}
diff --git a/thirdparty/embree/common/simd/arm/emulation.h b/thirdparty/embree/common/simd/arm/emulation.h
new file mode 100644
index 0000000000..1c3875fb27
--- /dev/null
+++ b/thirdparty/embree/common/simd/arm/emulation.h
@@ -0,0 +1,50 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+/* Make precision match SSE, at the cost of some performance */
+#if !defined(__aarch64__)
+#  define SSE2NEON_PRECISE_DIV 1
+#  define SSE2NEON_PRECISE_SQRT 1
+#endif
+
+#include "sse2neon.h"
+
+__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
+   __m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c)));
+  return _mm_fmadd_ps(a, b, neg_c);
+}
+
+__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c),
+                                            vreinterpretq_f32_m128(b),
+                                            vreinterpretq_f32_m128(a)));
+#else
+    return _mm_sub_ps(c, _mm_mul_ps(a, b));
+#endif
+}
+
+__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
+  return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c))));
+}
+
+
+/* Dummy defines for floating point control */
+#define _MM_MASK_MASK 0x1f80
+#define _MM_MASK_DIV_ZERO 0x200
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_MASK_DENORM 0x100
+#define _MM_SET_EXCEPTION_MASK(x)
+#define _MM_SET_FLUSH_ZERO_MODE(x)
+
+__forceinline int _mm_getcsr()
+{
+  return 0;
+}
+
+__forceinline void _mm_mfence()
+{
+  __sync_synchronize();
+}
diff --git a/thirdparty/embree/common/simd/arm/sse2neon.h b/thirdparty/embree/common/simd/arm/sse2neon.h
new file mode 100644
index 0000000000..7eb25cf2c5
--- /dev/null
+++ b/thirdparty/embree/common/simd/arm/sse2neon.h
@@ -0,0 +1,6996 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
+//
+// This header file does not yet translate all of the SSE intrinsics.
+//
+// Contributors to this work are:
+//   John W. Ratcliff <jratcliffscarab@gmail.com>
+//   Brandon Rowlett <browlett@nvidia.com>
+//   Ken Fast <kfast@gdeb.com>
+//   Eric van Beurden <evanbeurden@nvidia.com>
+//   Alexander Potylitsin <apotylitsin@nvidia.com>
+//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
+//   Jim Huang <jserv@biilabs.io>
+//   Mark Cheng <marktwtn@biilabs.io>
+//   Malcolm James MacLeod <malcolm@gulden.com>
+//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+//   Sebastian Pop <spop@amazon.com>
+//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+//   Danila Kutenin <danilak@google.com>
+//   François Turban (JishinMaster) <francois.turban@gmail.com>
+//   Pei-Hsuan Hung <afcidk@gmail.com>
+//   Yang-Hao Yuan <yanghau@biilabs.io>
+//   Syoyo Fujita <syoyo@lighttransport.com>
+//   Brecht Van Lommel <brecht@blender.org>
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Tunable configurations */
+
+/* Enable precise implementation of math operations
+ * This would slow down the computation a bit, but gives consistent result with
+ * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
+ */
+/* _mm_min_ps and _mm_max_ps */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+/* _mm_rcp_ps and _mm_div_ps */
+#ifndef SSE2NEON_PRECISE_DIV
+#define SSE2NEON_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2NEON_PRECISE_SQRT
+#define SSE2NEON_PRECISE_SQRT (0)
+#endif
+#ifndef SSE2NEON_PRECISE_RSQRT
+#define SSE2NEON_PRECISE_RSQRT (0)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#ifndef likely
+#define likely(x) __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+#else
+#error "Macro name collisions may happen with unsupported compiler."
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#endif
+#ifndef likely
+#define likely(x) (x)
+#endif
+#ifndef unlikely
+#define unlikely(x) (x)
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__)
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+
+/* Rounding functions require either Aarch64 instructions or libm failback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if __GNUC__ <= 9
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
+
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an _m128 struct
+// directly.  It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides.  This union should really only be used when
+// trying to access the members of the vector as integer values.  GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit.  If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components.  The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) &&   \
+    ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \
+     (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) ||  \
+     (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7)
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors cantain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *                      than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ *
+ * Data (Number, Binary, Byte Index):
+    +------+------+-------------+------+------+-------------+
+    |      1      |      2      |      3      |      4      | Number
+    +------+------+------+------+------+------+------+------+
+    | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
+    +------+------+------+------+------+------+------+------+
+    |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
+    +------+------+------+------+------+------+------+------+
+
+    +------+------+------+------+------+------+------+------+
+    |      5      |      6      |      7      |      8      | Number
+    +------+------+------+------+------+------+------+------+
+    | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
+    +------+------+------+------+------+------+------+------+
+    |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
+    +------+------+------+------+------+------+------+------+
+ * Index (Byte Index):
+    +------+------+------+------+------+------+------+------+
+    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
+    +------+------+------+------+------+------+------+------+
+
+    +------+------+------+------+------+------+------+------+
+    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
+    +------+------+------+------+------+------+------+------+
+ * Result:
+    +------+------+------+------+------+------+------+------+
+    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
+    +------+------+------+------+------+------+------+------+
+    | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
+    +------+------+------+------+------+------+------+------+
+    |     256     |      2      |      5      |      6      | Number
+    +------+------+------+------+------+------+------+------+
+
+    +------+------+------+------+------+------+------+------+
+    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
+    +------+------+------+------+------+------+------+------+
+    | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
+    +------+------+------+------+------+------+------+------+
+    |      3      |      7      |      4      |      8      | Number
+    +------+------+------+------+------+------+-------------+
+ */
+
+/* Set/get methods */
+
+/* Constants for use with _mm_prefetch.  */
+enum _mm_hint {
+    _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,   /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
+    _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
+    _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
+    _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
+    _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
+};
+
+// Loads one cache line of data from address p to a location closer to the
+// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
+FORCE_INLINE void _mm_prefetch(const void *p, int i)
+{
+    (void) i;
+    __builtin_prefetch(p);
+}
+
+// Pause the processor. This is typically used in spin-wait loops and depending
+// on the x86 processor typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most
+// Arm cores. Experience with several databases has shown has shown an 'isb' is
+// a reasonable approximation.
+FORCE_INLINE void _mm_pause()
+{
+    __asm__ __volatile__("isb\n");
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+//
+//   dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+//   dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+//   dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+//
+//   dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+//
+//   dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
+FORCE_INLINE int _mm_cvtss_si64(__m128 a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s64(
+        vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0);
+#else
+    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t diff = data - floor(data);
+    if (diff > 0.5)
+        return (int64_t) ceil(data);
+    if (unlikely(diff == 0.5)) {
+        int64_t f = (int64_t) floor(data);
+        int64_t c = (int64_t) ceil(data);
+        return c & 1 ? f : c;
+    }
+    return (int64_t) floor(data);
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//      i := 32*j
+//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//      i := 32*j
+//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+//   dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+    return vgetq_lane_s64(
+        vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0);
+}
+
+// Sets the 128-bit value to zero
+// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_setzero_si128(void)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+}
+
+// Clears the four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Return vector of type __m128d with all elements set to zero.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
+FORCE_INLINE __m128d _mm_setzero_pd(void)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
+#else
+    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
+#endif
+}
+
+// Sets the four single-precision, floating-point values to w.
+//
+//   r0 := r1 := r2 := r3 := w
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Sets the four single-precision, floating-point values to w.
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Sets the four single-precision, floating-point values to the four inputs.
+// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+    float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Sets the four single-precision, floating-point values to the four inputs in
+// reverse order.
+// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Sets the 8 signed 16-bit integer values in reverse order.
+//
+// Return Value
+//   r0 := w0
+//   r1 := w1
+//   ...
+//   r7 := w7
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+                                    short w1,
+                                    short w2,
+                                    short w3,
+                                    short w4,
+                                    short w5,
+                                    short w6,
+                                    short w7)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+}
+
+// Sets the 4 signed 32-bit integer values in reverse order
+// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
+FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
+{
+    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
+}
+
+// Sets the 16 signed 8-bit integer values to b.
+//
+//   r0 := b
+//   r1 := b
+//   ...
+//   r15 := b
+//
+// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
+{
+    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
+FORCE_INLINE __m128d _mm_set1_pd(double d)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+#endif
+}
+
+// Sets the 8 signed 16-bit integer values to w.
+//
+//   r0 := w
+//   r1 := w
+//   ...
+//   r7 := w
+//
+// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
+{
+    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+}
+
+// Sets the 16 signed 8-bit integer values.
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Sets the 8 signed 16-bit integer values.
+// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Sets the 16 signed 8-bit integer values in reverse order.
+// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Sets the 4 signed 32-bit integer values to i.
+//
+//   r0 := i
+//   r1 := i
+//   r2 := i
+//   r3 := I
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Sets the 4 signed 32-bit integer values.
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+    double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+{
+    return _mm_set_pd(e0, e1);
+}
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
+{
+    return _mm_set_pd(0, a);
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+//   MEM[mem_addr+31:mem_addr] := a[31:0]
+//   MEM[mem_addr+63:mem_addr+32] := a[31:0]
+//   MEM[mem_addr+95:mem_addr+64] := a[31:0]
+//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+//   MEM[mem_addr+31:mem_addr] := a[31:0]
+//   MEM[mem_addr+63:mem_addr+32] := a[31:0]
+//   MEM[mem_addr+95:mem_addr+64] := a[31:0]
+//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   MEM[mem_addr+31:mem_addr] := a[127:96]
+//   MEM[mem_addr+63:mem_addr+32] := a[95:64]
+//   MEM[mem_addr+95:mem_addr+64] := a[63:32]
+//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
+}
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores four 32-bit integer values as (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores four 32-bit integer values as (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores the lower single - precision, floating - point value.
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+//
+//   MEM[mem_addr+63:mem_addr] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+//
+//   MEM[mem_addr+63:mem_addr] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   MEM[mem_addr+63:mem_addr] := a[127:64]
+//   MEM[mem_addr+127:mem_addr+64] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+{
+    float32x4_t f = vreinterpretq_f32_m128d(a);
+    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
+FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
+    vst1q_f64((float64_t *) mem_addr,
+              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
+#else
+    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
+    vst1q_f32((float32_t *) mem_addr,
+              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
+#define _mm_store1_pd _mm_store_pd1
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
+FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
+{
+    _mm_store_pd(mem_addr, a);
+}
+
+// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
+// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
+{
+    uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
+    uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
+    *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
+}
+
+// Stores the lower two single-precision floating point values of a to the
+// address p.
+//
+//   *p0 := a0
+//   *p1 := a1
+//
+// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Stores the upper two single-precision, floating-point values of a to the
+// address p.
+//
+//   *p0 := a2
+//   *p1 := a3
+//
+// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Loads a single single-precision, floating-point value, copying it into all
+// four words
+// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+//
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[63:32] := MEM[mem_addr+31:mem_addr]
+//   dst[95:64] := MEM[mem_addr+31:mem_addr]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
+#define _mm_load_ps1 _mm_load1_ps
+
+// Sets the lower two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the upper two values are passed
+// through from a.
+//
+// Return Value
+//   r0 := *p0
+//   r1 := *p1
+//   r2 := a2
+//   r3 := a3
+//
+// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
+}
+
+// Load 4 single-precision (32-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
+//   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
+//   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
+FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
+{
+    float32x4_t v = vrev64q_f32(vld1q_f32(p));
+    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
+}
+
+// Sets the upper two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the lower two values are passed
+// through from a.
+//
+//   r0 := a0
+//   r1 := a1
+//   r2 := *p0
+//   r3 := *p1
+//
+// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
+}
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
+{
+    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+    // equivalent for neon
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load unaligned 16-bit integer from memory into the first element of dst.
+//
+//   dst[15:0] := MEM[mem_addr+15:mem_addr]
+//   dst[MAX:16] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
+FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
+{
+    return vreinterpretq_m128i_s16(
+        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+}
+
+// Load unaligned 64-bit integer from memory into the first element of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[MAX:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
+FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision from 16-byte aligned memory, floating-point
+// values.
+//
+//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision from unaligned memory, floating-point values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Loads an single - precision, floating - point value into the low word and
+// clears the upper three words.
+// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vld1_f32((const float *) p),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Sets the low word to the single-precision, floating-point value of b
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the lower double-precision (64-bit) floating-point element from b to the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+//   dst[63:0] := b[63:0]
+//   dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+}
+
+// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
+// upper element.
+//
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
+FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
+}
+
+// Return vector of type __m128 with undefined elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
+FORCE_INLINE __m128 _mm_undefined_ps(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128 a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+/* Logic/Binary operations */
+
+// Computes the bitwise AND-NOT of the four single-precision, floating-point
+// values of a and b.
+//
+//   r0 := ~a0 & b0
+//   r1 := ~a1 & b1
+//   r2 := ~a2 & b2
+//   r3 := ~a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vbicq_s32(vreinterpretq_s32_m128(b),
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
+}
+
+// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+//
+//   FOR j := 0 to 1
+// 	     i := j*64
+// 	     dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
+{
+    // *NOTE* argument swap
+    return vreinterpretq_m128d_s64(
+        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
+}
+
+// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
+// 128-bit value in a.
+//
+//   r := (~a) & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vbicq_s32(vreinterpretq_s32_m128i(b),
+                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
+}
+
+// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
+// b.
+//
+//   r := a & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the bitwise AND of the four single-precision, floating-point values
+// of a and b.
+//
+//   r0 := a0 & b0
+//   r1 := a1 & b1
+//   r2 := a2 & b2
+//   r3 := a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*64
+//     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Computes the bitwise OR of the four single-precision, floating-point values
+// of a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Computes bitwise EXOR (exclusive-or) of the four single-precision,
+// floating-point values of a and b.
+// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//      i := j*64
+//      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
+FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
+FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
+//
+//   r := a | b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
+// b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Duplicate the low double-precision (64-bit) floating-point element from a,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
+FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
+{
+#if (__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_u64(
+        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
+#endif
+}
+
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
+{
+#if __has_builtin(__builtin_shufflevector)
+    return vreinterpretq_m128_f32(__builtin_shufflevector(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#else
+    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
+    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Duplicate even-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
+{
+#if __has_builtin(__builtin_shufflevector)
+    return vreinterpretq_m128_f32(__builtin_shufflevector(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
+#else
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
+    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Moves the upper two values of B into the lower two values of A.
+//
+//   r3 := a3
+//   r2 := a2
+//   r1 := b3
+//   r0 := b2
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+}
+
+// Moves the lower two values of B into the upper two values of A.
+//
+//   r3 := b1
+//   r2 := b0
+//   r1 := a1
+//   r0 := a0
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     dst[i+31:i] := ABS(a[i+31:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     dst[i+15:i] := ABS(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+{
+    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 15
+//     i := j*8
+//     dst[i+7:i] := ABS(a[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+{
+    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*32
+//     dst[i+31:i] := ABS(a[i+31:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
+FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
+{
+    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := ABS(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
+FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
+{
+    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := ABS(a[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
+FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
+{
+    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
+}
+
+// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 16 bytes in dst.
+//
+//   tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
+//   dst[127:0] := tmp[127:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
+#define _mm_alignr_epi8(a, b, imm)                                            \
+    __extension__({                                                           \
+        __m128i ret;                                                          \
+        if (unlikely((imm) >= 32)) {                                          \
+            ret = _mm_setzero_si128();                                        \
+        } else {                                                              \
+            uint8x16_t tmp_low, tmp_high;                                     \
+            if (imm >= 16) {                                                  \
+                const int idx = imm - 16;                                     \
+                tmp_low = vreinterpretq_u8_m128i(a);                          \
+                tmp_high = vdupq_n_u8(0);                                     \
+                ret =                                                         \
+                    vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
+            } else {                                                          \
+                const int idx = imm;                                          \
+                tmp_low = vreinterpretq_u8_m128i(b);                          \
+                tmp_high = vreinterpretq_u8_m128i(a);                         \
+                ret =                                                         \
+                    vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
+            }                                                                 \
+        }                                                                     \
+        ret;                                                                  \
+    })
+
+// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 8 bytes in dst.
+//
+//   tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
+//   dst[63:0] := tmp[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
+#define _mm_alignr_pi8(a, b, imm)                                           \
+    __extension__({                                                         \
+        __m64 ret;                                                          \
+        if (unlikely((imm) >= 16)) {                                        \
+            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
+        } else {                                                            \
+            uint8x8_t tmp_low, tmp_high;                                    \
+            if (imm >= 8) {                                                 \
+                const int idx = imm - 8;                                    \
+                tmp_low = vreinterpret_u8_m64(a);                           \
+                tmp_high = vdup_n_u8(0);                                    \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            } else {                                                        \
+                const int idx = imm;                                        \
+                tmp_low = vreinterpret_u8_m64(b);                           \
+                tmp_high = vreinterpret_u8_m64(a);                          \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            }                                                               \
+        }                                                                   \
+        ret;                                                                \
+    })
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+    float32x2_t a21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+    float32x2_t a03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
+// high
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+    float32x2_t a33 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// NEON does not support a general purpose permute intrinsic
+// Selects four specific single-precision, floating-point values from a and b,
+// based on the mask i.
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
+#define _mm_shuffle_ps_default(a, b, imm)                                  \
+    __extension__({                                                        \
+        float32x4_t ret;                                                   \
+        ret = vmovq_n_f32(                                                 \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                       \
+        vreinterpretq_m128_f32(ret);                                       \
+    })
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_ps(a, b, imm)                                \
+    __extension__({                                              \
+        float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
+        float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
+        float32x4_t _shuf = __builtin_shufflevector(             \
+            _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+            (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+        vreinterpretq_m128_f32(_shuf);                           \
+    })
+#else  // generic
+#define _mm_shuffle_ps(a, b, imm)                          \
+    __extension__({                                        \
+        __m128 ret;                                        \
+        switch (imm) {                                     \
+        case _MM_SHUFFLE(1, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_1032((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 3, 0, 1):                      \
+            ret = _mm_shuffle_ps_2301((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 3, 2, 1):                      \
+            ret = _mm_shuffle_ps_0321((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 1, 0, 3):                      \
+            ret = _mm_shuffle_ps_2103((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 1, 0):                      \
+            ret = _mm_movelh_ps((a), (b));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_1001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 1, 0, 1):                      \
+            ret = _mm_shuffle_ps_0101((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 1, 0):                      \
+            ret = _mm_shuffle_ps_3210((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 1, 1):                      \
+            ret = _mm_shuffle_ps_0011((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 2, 2):                      \
+            ret = _mm_shuffle_ps_0022((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 2, 0, 0):                      \
+            ret = _mm_shuffle_ps_2200((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 0, 2):                      \
+            ret = _mm_shuffle_ps_3202((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 3, 2):                      \
+            ret = _mm_movehl_ps((b), (a));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 1, 3, 3):                      \
+            ret = _mm_shuffle_ps_1133((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 1, 0):                      \
+            ret = _mm_shuffle_ps_2010((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_2001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_2032((a), (b));           \
+            break;                                         \
+        default:                                           \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break;                                         \
+        }                                                  \
+        ret;                                               \
+    })
+#endif
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most signficant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least signficant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
+    uint8x16_t idx_masked =
+        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    int8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // use this line if testing on aarch64
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+#define _mm_shuffle_epi32_default(a, imm)                                   \
+    __extension__({                                                         \
+        int32x4_t ret;                                                      \
+        ret = vmovq_n_s32(                                                  \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                        \
+        vreinterpretq_m128i_s32(ret);                                       \
+    })
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm)                          \
+    __extension__({                                              \
+        vreinterpretq_m128i_s32(                                 \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
+#else
+#define _mm_shuffle_epi32_splat(a, imm)                                      \
+    __extension__({                                                          \
+        vreinterpretq_m128i_s32(                                             \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
+#endif
+
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_epi32(a, imm)                              \
+    __extension__({                                            \
+        int32x4_t _input = vreinterpretq_s32_m128i(a);         \
+        int32x4_t _shuf = __builtin_shufflevector(             \
+            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
+        vreinterpretq_m128i_s32(_shuf);                        \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm)                        \
+    __extension__({                                      \
+        __m128i ret;                                     \
+        switch (imm) {                                   \
+        case _MM_SHUFFLE(1, 0, 3, 2):                    \
+            ret = _mm_shuffle_epi_1032((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 3, 0, 1):                    \
+            ret = _mm_shuffle_epi_2301((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 3, 2, 1):                    \
+            ret = _mm_shuffle_epi_0321((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 1, 0, 3):                    \
+            ret = _mm_shuffle_epi_2103((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 1, 0):                    \
+            ret = _mm_shuffle_epi_1010((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 0, 1):                    \
+            ret = _mm_shuffle_epi_1001((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 0, 1):                    \
+            ret = _mm_shuffle_epi_0101((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 1, 1):                    \
+            ret = _mm_shuffle_epi_2211((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 2, 2):                    \
+            ret = _mm_shuffle_epi_0122((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 2):                    \
+            ret = _mm_shuffle_epi_3332((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 0, 0, 0):                    \
+            ret = _mm_shuffle_epi32_splat((a), 0);       \
+            break;                                       \
+        case _MM_SHUFFLE(1, 1, 1, 1):                    \
+            ret = _mm_shuffle_epi32_splat((a), 1);       \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 2, 2):                    \
+            ret = _mm_shuffle_epi32_splat((a), 2);       \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 3):                    \
+            ret = _mm_shuffle_epi32_splat((a), 3);       \
+            break;                                       \
+        default:                                         \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break;                                       \
+        }                                                \
+        ret;                                             \
+    })
+#endif
+
+// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
+// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
+//                                                   __constrange(0,255) int
+//                                                   imm)
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
+    __extension__({                                                           \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
+        int16x4_t lowBits = vget_low_s16(ret);                                \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3);                                              \
+        vreinterpretq_m128i_s16(ret);                                         \
+    })
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflelo_epi16(a, imm)                                  \
+    __extension__({                                                  \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
+        int16x8_t _shuf = __builtin_shufflevector(                   \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf);                              \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
+//                                                   __constrange(0,255) int
+//                                                   imm)
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
+    __extension__({                                                            \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
+        int16x4_t highBits = vget_high_s16(ret);                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7);                                               \
+        vreinterpretq_m128i_s16(ret);                                          \
+    })
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflehi_epi16(a, imm)                             \
+    __extension__({                                             \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);          \
+        int16x8_t _shuf = __builtin_shufflevector(              \
+            _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
+            (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+            (((imm) >> 6) & 0x3) + 4);                          \
+        vreinterpretq_m128i_s16(_shuf);                         \
+    })
+#else  // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// Shuffle double-precision (64-bit) floating-point elements using the control
+// in imm8, and store the results in dst.
+//
+//   dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+//   dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_pd(a, b, imm8)                                          \
+    vreinterpretq_m128d_s64(__builtin_shufflevector(                        \
+        vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
+        ((imm8 & 0x2) >> 1) + 2))
+#else
+#define _mm_shuffle_pd(a, b, imm8)                                     \
+    _mm_castsi128_pd(_mm_set_epi64x(                                   \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#endif
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+//
+//   FOR j := 0 to 7
+//       i := j*16
+//       IF imm8[j]
+//           dst[i+15:i] := b[i+15:i]
+//       ELSE
+//           dst[i+15:i] := a[i+15:i]
+//       FI
+//   ENDFOR
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm)                                        \
+    __extension__({                                                       \
+        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,  \
+                                   ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);                          \
+        uint16x8_t _a = vreinterpretq_u16_m128i(a);                       \
+        uint16x8_t _b = vreinterpretq_u16_m128i(b);                       \
+        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));            \
+    })
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
+//
+//   FOR j := 0 to 15
+//       i := j*8
+//       IF mask[i+7]
+//           dst[i+7:i] := b[i+7:i]
+//       ELSE
+//           dst[i+7:i] := a[i+7:i]
+//       FI
+//   ENDFOR
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+/* Shifts */
+
+
+// Shift packed 16-bit integers in a right by imm while shifting in sign
+// bits, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+    const int count = (imm & ~15) ? 15 : imm;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
+// shifting in zeros.
+//
+//   r0 := a0 << count
+//   r1 := a1 << count
+//   ...
+//   r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
+#define _mm_slli_epi16(a, imm)                                   \
+    __extension__({                                              \
+        __m128i ret;                                             \
+        if (unlikely((imm)) <= 0) {                              \
+            ret = a;                                             \
+        }                                                        \
+        if (unlikely((imm) > 15)) {                              \
+            ret = _mm_setzero_si128();                           \
+        } else {                                                 \
+            ret = vreinterpretq_m128i_s16(                       \
+                vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
+        }                                                        \
+        ret;                                                     \
+    })
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
+// shifting in zeros. :
+// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
+        return a;
+    if (unlikely(imm > 31))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s32(
+        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
+        return a;
+    if (unlikely(imm > 63))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s64(
+        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 7
+//     i := j*16
+//     IF imm8[7:0] > 15
+//       dst[i+15:i] := 0
+//     ELSE
+//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if (unlikely(imm) == 0) {                                          \
+            ret = a;                                                       \
+        }                                                                  \
+        if (likely(0 < (imm) && (imm) < 16)) {                             \
+            ret = vreinterpretq_m128i_u16(                                 \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
+        } else {                                                           \
+            ret = _mm_setzero_si128();                                     \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF imm8[7:0] > 31
+//       dst[i+31:i] := 0
+//     ELSE
+//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if (unlikely((imm) == 0)) {                                        \
+            ret = a;                                                       \
+        }                                                                  \
+        if (likely(0 < (imm) && (imm) < 32)) {                             \
+            ret = vreinterpretq_m128i_u32(                                 \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
+        } else {                                                           \
+            ret = _mm_setzero_si128();                                     \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*64
+//     IF imm8[7:0] > 63
+//       dst[i+63:i] := 0
+//     ELSE
+//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if (unlikely((imm) == 0)) {                                        \
+            ret = a;                                                       \
+        }                                                                  \
+        if (likely(0 < (imm) && (imm) < 64)) {                             \
+            ret = vreinterpretq_m128i_u64(                                 \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
+        } else {                                                           \
+            ret = _mm_setzero_si128();                                     \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
+// and store the results in dst.
+//
+//   FOR j := 0 to 3
+//     i := j*32
+//     IF imm8[7:0] > 31
+//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+//     ELSE
+//       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
+//     FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm)                                             \
+    __extension__({                                                        \
+        __m128i ret;                                                       \
+        if (unlikely((imm) == 0)) {                                        \
+            ret = a;                                                       \
+        }                                                                  \
+        if (likely(0 < (imm) && (imm) < 32)) {                             \
+            ret = vreinterpretq_m128i_s32(                                 \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
+        } else {                                                           \
+            ret = vreinterpretq_m128i_s32(                                 \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));              \
+        }                                                                  \
+        ret;                                                               \
+    })
+
+// Shifts the 128 - bit value in a right by imm bytes while shifting in
+// zeros.imm must be an immediate.
+//
+//   r := srl(a, imm*8)
+//
+// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
+// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_si128(a, imm)                                              \
+    __extension__({                                                         \
+        __m128i ret;                                                        \
+        if (unlikely((imm) <= 0)) {                                         \
+            ret = a;                                                        \
+        }                                                                   \
+        if (unlikely((imm) > 15)) {                                         \
+            ret = _mm_setzero_si128();                                      \
+        } else {                                                            \
+            ret = vreinterpretq_m128i_s8(                                   \
+                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
+        }                                                                   \
+        ret;                                                                \
+    })
+
+// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
+// must be an immediate.
+//
+//   r := a << (imm * 8)
+//
+// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_slli_si128(a, imm)                                          \
+    __extension__({                                                     \
+        __m128i ret;                                                    \
+        if (unlikely((imm) <= 0)) {                                     \
+            ret = a;                                                    \
+        }                                                               \
+        if (unlikely((imm) > 15)) {                                     \
+            ret = _mm_setzero_si128();                                  \
+        } else {                                                        \
+            ret = vreinterpretq_m128i_s8(vextq_s8(                      \
+                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
+        }                                                               \
+        ret;                                                            \
+    })
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
+// shifting in zeros.
+//
+//   r0 := a0 << count
+//   r1 := a1 << count
+//   ...
+//   r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16((int16_t) c);
+    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
+// shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// r2 := a2 << count
+// r3 := a3 << count
+//
+// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32((int32_t) c);
+    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
+// shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+//
+// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64((int64_t) c);
+    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
+// while shifting in zeros.
+//
+// r0 := srl(a0, count)
+// r1 := srl(a1, count)
+// ...
+// r7 := srl(a7, count)
+//
+// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+}
+
+// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
+// while shifting in zeros.
+//
+// r0 := srl(a0, count)
+// r1 := srl(a1, count)
+// r2 := srl(a2, count)
+// r3 := srl(a3, count)
+//
+// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+}
+
+// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
+// while shifting in zeros.
+//
+// r0 := srl(a0, count)
+// r1 := srl(a1, count)
+//
+// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+}
+
+// NEON does not provide a version of this function.
+// Creates a 16-bit mask from the most significant bits of the 16 signed or
+// unsigned 8-bit integers in a and zero extends the upper bits.
+// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+    // Use increasingly wide shifts+adds to collect the sign bits
+    // together.
+    // Since the widening shifts would be rather confusing to follow in little
+    // endian, everything will be illustrated in big endian order instead. This
+    // has a different result - the bits would actually be reversed on a big
+    // endian machine.
+
+    // Starting input (only half the elements are shown):
+    // 89 ff 1d c0 00 10 99 33
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+    // Shift out everything but the sign bits with an unsigned shift right.
+    //
+    // Bytes of the vector::
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
+    //     14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+    // lanes. xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
+    //            28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+//
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four
+// single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+    return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+           ~(uint64_t) 0;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+    int64x2_t a_and_mask =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+    return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
+                                                                           : 1;
+}
+
+/* Math operations */
+
+// Subtracts the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+//
+//   dst[31:0] := a[31:0] - b[31:0]
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
+// and store the results in dst.
+//    r0 := a0 - b0
+//    r1 := a1 - b1
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
+// unsigned 32-bit integers of a.
+//
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+//
+//   dst[63:0] := a[63:0] - b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
+// integers of a and saturates..
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
+// integers of a and saturates.
+//
+//   r0 := UnsignedSaturate(a0 - b0)
+//   r1 := UnsignedSaturate(a1 - b1)
+//   ...
+//   r15 := UnsignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
+// of a and saturates.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r15 := SignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
+// of a and saturates.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r7 := SignedSaturate(a7 - b7)
+//
+// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store the
+// results in dst.
+//
+//   FOR j := 0 to 1
+//     i := j*64
+//     dst[i+63:i] := a[i+63:i] - b[i+63:i]
+//   ENDFOR
+//
+//  https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] - db[0];
+    c[1] = da[1] - db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Subtract the lower double-precision (64-bit) floating-point element in b from
+// the lower double-precision (64-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..15
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a')
+    // based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..7
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..3
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      IF b[i+15:i] < 0
+//        dst[i+15:i] := -(a[i+15:i])
+//      ELSE IF b[i+15:i] == 0
+//        dst[i+15:i] := 0
+//      ELSE
+//        dst[i+15:i] := a[i+15:i]
+//      FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
+    // based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+//   FOR j := 0 to 1
+//      i := j*32
+//      IF b[i+31:i] < 0
+//        dst[i+31:i] := -(a[i+31:i])
+//      ELSE IF b[i+31:i] == 0
+//        dst[i+31:i] := 0
+//      ELSE
+//        dst[i+31:i] := a[i+31:i]
+//      FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a')
+    // based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Element in dst are zeroed out
+// when the corresponding element in b is zero.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      IF b[i+7:i] < 0
+//        dst[i+7:i] := -(a[i+7:i])
+//      ELSE IF b[i+7:i] == 0
+//        dst[i+7:i] := 0
+//      ELSE
+//        dst[i+7:i] := a[i+7:i]
+//      FI
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a')
+    // based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+//   FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+//   FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+//   FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+//   FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Computes the average of the 16 unsigned 8-bit integers in a and the 16
+// unsigned 8-bit integers in b and rounds.
+//
+//   r0 := (a0 + b0) / 2
+//   r1 := (a1 + b1) / 2
+//   ...
+//   r15 := (a15 + b15) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the average of the 8 unsigned 16-bit integers in a and the 8
+// unsigned 16-bit integers in b and rounds.
+//
+//   r0 := (a0 + b0) / 2
+//   r1 := (a1 + b1) / 2
+//   ...
+//   r7 := (a7 + b7) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+{
+    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+                                 vreinterpretq_u16_m128i(b));
+}
+
+// Adds the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element from
+// a to the upper element of dst.
+//
+//   dst[63:0] := a[63:0] + b[63:0]
+//   dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+//
+//   dst[63:0] := a[63:0] + b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// adds the scalar single-precision floating point values of a and b.
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+//
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
+// unsigned 16-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
+// and saturates.
+//
+//   r0 := SignedSaturate(a0 + b0)
+//   r1 := SignedSaturate(a1 + b1)
+//   ...
+//   r7 := SignedSaturate(a7 + b7)
+//
+// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
+//
+//   FOR j := 0 to 15
+//     i := j*8
+//     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
+// b and saturates..
+// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
+// unsigned 16-bit integers from b.
+//
+//   r0 := (a0 * b0)[15:0]
+//   r1 := (a1 * b1)[15:0]
+//   ...
+//   r7 := (a7 * b7)[15:0]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
+// unsigned 32-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      tmp[31:0] := a[i+15:i] * b[i+15:i]
+//      dst[i+15:i] := tmp[31:16]
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Multiplies the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 * b0
+//   r1 := a1 * b1
+//   r2 := a2 * b2
+//   r3 := a3 * b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Multiply packed double-precision (64-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] * db[0];
+    c[1] = da[1] * db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Multiply the lower double-precision (64-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+//
+//   dst[31:0] := a[31:0] * b[31:0]
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+//
+//   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
+//   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+//
+//   dst[63:0] := a[31:0] * b[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u64(vget_low_u64(
+        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+//
+//   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
+//   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+//   r0 := (a0 * b0) + (a1 * b1)
+//   r1 := (a2 * b2) + (a3 * b3)
+//   r2 := (a4 * b4) + (a5 * b5)
+//   r3 := (a6 * b6) + (a7 * b7)
+// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+//
+//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
+//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
+//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
+//   ...
+//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*16
+//      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
+//      a[i+7:i]*b[i+7:i] )
+//   ENDFOR
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+                             vmovl_s8(vget_low_s8(b)));
+    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+                             vmovl_s8(vget_high_s8(b)));
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+    // This would be much simpler if x86 would choose to zero extend OR sign
+    // extend, not both. This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Computes the fused multiple add product of 32-bit floating point numbers.
+//
+// Return Value
+// Multiplies A and B, and adds C to the temporary result before returning it.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
+FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
+                                            vreinterpretq_f32_m128(b),
+                                            vreinterpretq_f32_m128(a)));
+#else
+    return _mm_add_ps(_mm_mul_ps(a, b), c);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+    __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+    return _mm_fmadd_ps(b, mask, a);
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[] = {da[0] + da[1], db[0] + db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
+    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
+    return (__m128i) vsetq_lane_u16(r4, r, 4);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint16x4_t t =
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+    return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+//   ENDFOR
+//   dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
+//   tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Divides the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 / b0
+//   r1 := a1 / b1
+//   r2 := a2 / b2
+//   r3 := a3 / b3
+//
+// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divides the scalar single-precision floating point value of a by b.
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Divide packed double-precision (64-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+//
+//  FOR j := 0 to 1
+//    i := 64*j
+//    dst[i+63:i] := a[i+63:i] / b[i+63:i]
+//  ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
+FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] / db[0];
+    c[1] = da[1] / db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Divide the lower double-precision (64-bit) floating-point element in a by the
+// lower double-precision (64-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper element from a to the upper
+// element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
+FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    float64x2_t tmp =
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
+#else
+    return _mm_move_sd(a, _mm_div_pd(a, b));
+#endif
+}
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+//
+//   dst[31:0] := (1.0 / a[31:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Computes the approximations of square roots of the four single-precision,
+// floating-point values of a. First computes reciprocal square roots and then
+// reciprocals of the four values.
+//
+//   r0 := sqrt(a0)
+//   r1 := sqrt(a1)
+//   r2 := sqrt(a2)
+//   r3 := sqrt(a3)
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if SSE2NEON_PRECISE_SQRT
+    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Test for vrsqrteq_f32(0) -> positive infinity case.
+    // Change to zero, so that s * 1/sqrt(s) result is zero too.
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Computes the approximation of the square root of the scalar single-precision
+// floating point value of in.
+// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Computes the approximations of the reciprocal square roots of the four
+// single-precision floating point values of in.
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_RSQRT
+    // Additional Netwon-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+    return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Computes the maximums of the four single-precision, floating-point values of
+// a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
+#else
+    return vreinterpretq_m128_f32(
+        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
+FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Computes the minima of the four single-precision, floating-point values of a
+// and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
+#else
+    return vreinterpretq_m128_f32(
+        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
+FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*8
+//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Computes the maximum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Computes the minimum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// epi versions of min/max
+// Computes the pariwise maximums of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 > b0) ? a0 : b0
+//   r1 := (a1 > b1) ? a1 : b1
+//   r2 := (a2 > b2) ? a2 : b2
+//   r3 := (a3 > b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the pariwise minima of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 < b0) ? a0 : b0
+//   r1 := (a1 < b1) ? a1 : b1
+//   r2 := (a2 < b2) ? a2 : b2
+//   r3 := (a3 < b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+//   r0 := (a0 * b0)[31:16]
+//   r1 := (a1 * b1)[31:16]
+//   ...
+//   r7 := (a7 * b7)[31:16]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+    uint32x4_t ab7654 =
+        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+                              vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r);
+#else
+    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
+
+// Computes pairwise add of each argument as single-precision, floating-point
+// values a and b.
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally substract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsubq_f32(
+        vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
+        vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
+#else
+    float32x4x2_t c =
+        vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Computes pairwise difference of each argument as a 16-bit signed or unsigned
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Subtract
+    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+}
+
+// Computes saturated pairwise sub of each argument as a 16-bit signed
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes saturated pairwise difference of each argument as a 16-bit signed
+// integer values a and b.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated subtract
+    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+}
+
+// Computes pairwise difference of each argument as a 32-bit signed or unsigned
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int64x2_t a = vreinterpretq_s64_m128i(_a);
+    int64x2_t b = vreinterpretq_s64_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|b0|b2]
+    // [a1|a2|b1|b3]
+    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
+    // Subtract
+    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+    }
+    if (imm == 0x7F) {
+        float32x4_t m = _mm_mul_ps(a, b);
+        m[3] = 0;
+        return _mm_set1_ps(vaddvq_f32(m));
+    }
+#endif
+
+    float s = 0, c = 0;
+    float32x4_t f32a = vreinterpretq_f32_m128(a);
+    float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+    /* To improve the accuracy of floating-point summation, Kahan algorithm
+     * is used for each operation.
+     */
+    if (imm & (1 << 4))
+        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+    if (imm & (1 << 5))
+        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+    if (imm & (1 << 6))
+        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+    if (imm & (1 << 7))
+        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+    s += c;
+
+    float32x4_t res = {
+        (imm & 0x1) ? s : 0,
+        (imm & 0x2) ? s : 0,
+        (imm & 0x4) ? s : 0,
+        (imm & 0x8) ? s : 0,
+    };
+    return vreinterpretq_m128_f32(res);
+}
+
+/* Compare operations */
+
+// Compares for less than
+// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compares for greater than.
+//
+//   r0 := (a0 > b0) ? 0xffffffff : 0x0
+//   r1 := (a1 > b1) ? 0xffffffff : 0x0
+//   r2 := (a2 > b2) ? 0xffffffff : 0x0
+//   r3 := (a3 > b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compares for greater than or equal.
+// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compares for less than or equal.
+//
+//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
+//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
+//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
+//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compares for equality.
+// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for equality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+}
+
+// Compares for inequality.
+// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compares for inequality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+    return _mm_cmplt_ps(a, b);
+}
+
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+    return _mm_cmplt_ss(a, b);
+}
+
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+    return _mm_cmple_ps(a, b);
+}
+
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+    return _mm_cmple_ss(a, b);
+}
+
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+    return _mm_cmpgt_ps(a, b);
+}
+
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+    return _mm_cmpgt_ss(a, b);
+}
+
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+    return _mm_cmpge_ps(a, b);
+}
+
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+    return _mm_cmpge_ss(a, b);
+}
+
+// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
+// unsigned 8-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
+// unsigned 16-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for lesser than.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for greater than.
+//
+//   r0 := (a0 > b0) ? 0xff : 0x0
+//   r1 := (a1 > b1) ? 0xff : 0x0
+//   ...
+//   r15 := (a15 > b15) ? 0xff : 0x0
+//
+// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for less than.
+//
+//   r0 := (a0 < b0) ? 0xffff : 0x0
+//   r1 := (a1 < b1) ? 0xffff : 0x0
+//   ...
+//   r7 := (a7 < b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for greater than.
+//
+//   r0 := (a0 > b0) ? 0xffff : 0x0
+//   r1 := (a1 > b1) ? 0xffff : 0x0
+//   ...
+//   r7 := (a7 > b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for greater than.
+// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    // ARMv7 lacks vcgtq_s64.
+    // This is based off of Clang's SSE2 polyfill:
+    // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
+
+    // Mask the sign bit out since we need a signed AND an unsigned comparison
+    // and it is ugly to try and split them.
+    int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
+    int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
+    int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
+    // Check if a > b
+    int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
+    // Copy upper mask to lower mask
+    // a_hi > b_hi
+    int64x2_t gt_hi = vshrq_n_s64(greater, 63);
+    // Copy lower mask to upper mask
+    // a_lo > b_lo
+    int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
+    // Compare for equality
+    int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
+    // Copy upper mask to lower mask
+    // a_hi == b_hi
+    int64x2_t eq_hi = vshrq_n_s64(equal, 63);
+    // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
+    int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
+    return vreinterpretq_m128i_s64(ret);
+#endif
+}
+
+// Compares the four 32-bit floats in a and b to check if any values are NaN.
+// Ordered compare between each value returns true for "orderable" and false for
+// "not orderable" (NaN).
+// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
+// also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+    // Note: NEON does not have ordered compare builtin
+    // Need to compare a eq a and b eq b to check for NaN
+    // Do AND of results to get final
+    uint32x4_t ceqaa =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compares for ordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+    uint32x4_t f32a =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t f32b =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than operation. :
+// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
+// note!! The documentation on MSDN is incorrect!  If either of the values is a
+// NAN the docs say you will get a one, but in fact, it will return a zero!!
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_lt_b =
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than operation. :
+// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_gt_b =
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_le_b =
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_ge_b =
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an equality operation. :
+// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an inequality operation. :
+// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+    // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
+    uint32x4_t a_neq_b = vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
+}
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions.  We'll just alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+/* Conversions */
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+//   dst[95:64] := a[95:64]
+//   dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t diff = data - floor(data);
+    if (diff > 0.5)
+        return (int32_t) ceil(data);
+    if (unlikely(diff == 0.5)) {
+        int32_t f = (int32_t) floor(data);
+        int32_t c = (int32_t) ceil(data);
+        return c & 1 ? f : c;
+    }
+    return (int32_t) floor(data);
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      m := j*32
+//      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+//   dst[95:64] := a[95:64]
+//   dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then covert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point element, and store the results in
+// the upper 2 elements of dst.
+//
+//   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
+//   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
+//   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
+//   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
+FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
+}
+
+// Convert the lower packed 8-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*8
+//      m := j*32
+//      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
+FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
+}
+
+// Convert packed unsigned 16-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+//
+//   FOR j := 0 to 3
+//      i := j*16
+//      m := j*32
+//      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
+FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
+}
+
+// Convert the lower packed unsigned 8-bit integers in a to packed
+// single-precision (32-bit) floating-point elements, and store the results in
+// dst.
+//
+//   FOR j := 0 to 3
+//      i := j*8
+//      m := j*32
+//      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
+FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_u32(
+        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values using truncate.
+// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
+FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    double ret = *((double *) &a);
+    return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Converts the four signed 32-bit integer values of a to single-precision,
+// floating-point values
+// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Converts the four unsigned 8-bit integers in the lower 16 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Converts the two unsigned 8-bit integers in the lower 16 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 16 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Converts the two signed 8-bit integers in the lower 32 bits to four
+// signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Converts the two signed 16-bit integers in the lower 32 bits two signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four unsigned 16-bit integers in the lower 64 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Converts the two unsigned 16-bit integers in the lower 32 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the two unsigned 32-bit integers in the lower 64 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_u64(
+        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Converts the two signed 32-bit integers in the lower 64 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values.
+//
+//   r0 := (int) a0
+//   r1 := (int) a1
+//   r2 := (int) a2
+//   r3 := (int) a3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
+#else
+    uint32x4_t signmask = vdupq_n_u32(0x80000000);
+    float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                 vdupq_n_f32(0.5f)); /* +/- 0.5 */
+    int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+        vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+    int32x4_t r_trunc =
+        vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+    int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+        vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+    int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+    float32x4_t delta = vsubq_f32(
+        vreinterpretq_f32_m128(a),
+        vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+    uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
+    return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 16-bit integers, and store the results in dst. Note: this intrinsic
+// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
+// 0x7FFFFFFF.
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
+FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
+{
+    return vreinterpret_m64_s16(
+        vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
+}
+
+// Copy the lower 32-bit integer in a to dst.
+//
+//   dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+{
+    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
+// zero extending the upper bits.
+//
+//   r0 := a
+//   r1 := 0x0
+//   r2 := 0x0
+//   r3 := 0x0
+//
+// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
+// zero extending the upper bits.
+//
+//   r0 := a
+//   r1 := 0x0
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Applies a type cast to reinterpret four 32-bit floating point values passed
+// in as a 128-bit parameter as packed 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+#else
+    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+#endif
+}
+
+// Applies a type cast to reinterpret four 32-bit integers passed in as a
+// 128-bit parameter as packed 32-bit floating point values.
+// https://msdn.microsoft.com/en-us/library/bb514029.aspx
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+//   dst[63:0] := a[63:0]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+    return vreinterpretq_m128d_f32(vcombine_f32(
+        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+//
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[MAX:32] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+//
+//   FOR j := 0 to 1
+//     i := 32*j
+//     k := 64*j
+//     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
+//   ENDFOR
+//   dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+//
+//   dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+    return ((double *) &a)[0];
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+//
+//   FOR j := 0 to 1
+//     i := 64*j
+//     k := 32*j
+//     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint32x4_t mask =
+        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
+FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
+{
+    const uint32_t ALIGN_STRUCT(16)
+        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    uint32x4_t mask = vld1q_u32(data);
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
+FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
+{
+    uint64x2_t mask =
+        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
+#else
+    uint64x2_t a = vreinterpretq_u64_m128d(_a);
+    uint64x2_t b = vreinterpretq_u64_m128d(_b);
+    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
+#endif
+}
+
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t res2;
+#if defined(__aarch64__)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    switch (rounding) {
+    case _MM_ROUND_TOWARD_ZERO:
+        r.field.bit22 = 1;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_DOWN:
+        r.field.bit22 = 0;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_UP:
+        r.field.bit22 = 1;
+        r.field.bit23 = 0;
+        break;
+    default:  //_MM_ROUND_NEAREST
+        r.field.bit22 = 0;
+        r.field.bit23 = 0;
+    }
+
+#if defined(__aarch64__)
+    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+FORCE_INLINE void _mm_setcsr(unsigned int a)
+{
+    _MM_SET_ROUNDING_MODE(a);
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed single-precision
+// floating-point elements in dst.
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+    }
+#else
+    float *v_float = (float *) &a;
+    __m128 zero, neg_inf, pos_inf;
+
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return (__m128){floorf(v_float[0]), floorf(v_float[1]),
+                        floorf(v_float[2]), floorf(v_float[3])};
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
+                        ceilf(v_float[3])};
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
+        neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
+                             floorf(v_float[2]), floorf(v_float[3]));
+        pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
+                             ceilf(v_float[2]), ceilf(v_float[3]));
+        return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return (__m128){roundf(v_float[0]), roundf(v_float[1]),
+                        roundf(v_float[2]), roundf(v_float[3])};
+    }
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//       i := 32*j
+//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
+#else
+    return vreinterpret_m64_s32(
+        vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
+            _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+//   FOR j := 0 to 1
+//       i := 32*j
+//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
+
+// Round the packed single-precision (32-bit) floating-point elements in a up to
+// an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+    return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+//
+//   dst[31:0] := CEIL(b[31:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(
+        a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+    return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b down to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+//
+//   dst[31:0] := FLOOR(b[31:0])
+//   dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(
+        a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+/* Miscellaneous Operations */
+
+// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+//   r0 := a0 >> count
+//   r1 := a1 >> count
+//   ...
+//   r7 := a7 >> count
+//
+// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (unlikely(c > 15))
+        return _mm_cmplt_epi16(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+//   r0 := a0 >> count
+//   r1 := a1 >> count
+//   r2 := a2 >> count
+//   r3 := a3 >> count
+//
+// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (unlikely(c > 31))
+        return _mm_cmplt_epi32(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
+// saturates.
+// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
+// integers and saturates.
+//
+//   r0 := UnsignedSaturate(a0)
+//   r1 := UnsignedSaturate(a1)
+//   ...
+//   r7 := UnsignedSaturate(a7)
+//   r8 := UnsignedSaturate(b0)
+//   r9 := UnsignedSaturate(b1)
+//   ...
+//   r15 := UnsignedSaturate(b7)
+//
+// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
+// and saturates.
+//
+//   r0 := SignedSaturate(a0)
+//   r1 := SignedSaturate(a1)
+//   r2 := SignedSaturate(a2)
+//   r3 := SignedSaturate(a3)
+//   r4 := SignedSaturate(b0)
+//   r5 := SignedSaturate(b1)
+//   r6 := SignedSaturate(b2)
+//   r7 := SignedSaturate(b3)
+//
+// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
+// integers and saturates.
+//
+//   r0 := UnsignedSaturate(a0)
+//   r1 := UnsignedSaturate(a1)
+//   r2 := UnsignedSaturate(a2)
+//   r3 := UnsignedSaturate(a3)
+//   r4 := UnsignedSaturate(b0)
+//   r5 := UnsignedSaturate(b1)
+//   r6 := UnsignedSaturate(b2)
+//   r7 := UnsignedSaturate(b3)
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
+// 8 signed or unsigned 8-bit integers in b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//   ...
+//   r14 := a7
+//   r15 := b7
+//
+// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
+// lower 4 signed or unsigned 16-bit integers in b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//   r4 := a2
+//   r5 := b2
+//   r6 := a3
+//   r7 := b3
+//
+// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
+// lower 2 signed or unsigned 32 - bit integers in b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+}
+
+// Selects and interleaves the lower two single-precision, floating-point values
+// from a and b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+//
+//   DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+//     dst[63:0] := src1[63:0]
+//     dst[127:64] := src2[63:0]
+//     RETURN dst[127:0]
+//   }
+//   dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
+FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
+                     vget_low_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the high half of a and b, and store the results in dst.
+//
+//   DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+//     dst[63:0] := src1[127:64]
+//     dst[127:64] := src2[127:64]
+//     RETURN dst[127:0]
+//   }
+//   dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
+FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
+                     vget_high_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Selects and interleaves the upper two single-precision, floating-point values
+// from a and b.
+//
+//   r0 := a2
+//   r1 := b2
+//   r2 := a3
+//   r3 := b3
+//
+// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
+// 8 signed or unsigned 8-bit integers in b.
+//
+//   r0 := a8
+//   r1 := b8
+//   r2 := a9
+//   r3 := b9
+//   ...
+//   r14 := a15
+//   r15 := b15
+//
+// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
+// upper 4 signed or unsigned 16-bit integers in b.
+//
+//   r0 := a4
+//   r1 := b4
+//   r2 := a5
+//   r3 := b5
+//   r4 := a6
+//   r5 := b6
+//   r6 := a7
+//   r7 := b7
+//
+// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
+// upper 2 signed or unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper signed or unsigned 64-bit integer in a with the
+// upper signed or unsigned 64-bit integer in b.
+//
+//   r0 := a1
+//   r1 := b1
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
+//
+//   index[2:0] := 0
+//   min[15:0] := a[15:0]
+//   FOR j := 0 to 7
+//       i := j*16
+//       IF a[i+15:i] < min[15:0]
+//           index[2:0] := j
+//           min[15:0] := a[i+15:i]
+//       FI
+//   ENDFOR
+//   dst[15:0] := min[15:0]
+//   dst[18:16] := index[2:0]
+//   dst[127:19] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+    __m128i dst;
+    uint16_t min, idx = 0;
+    // Find the minimum value
+#if defined(__aarch64__)
+    min = vminvq_u16(vreinterpretq_u16_m128i(a));
+#else
+    __m64 tmp;
+    tmp = vreinterpret_m64_u16(
+        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+                 vget_high_u16(vreinterpretq_u16_m128i(a))));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+#endif
+    // Get the index of the minimum value
+    int i;
+    for (i = 0; i < 8; i++) {
+        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+            idx = (uint16_t) i;
+            break;
+        }
+        a = _mm_srli_si128(a, 2);
+    }
+    // Generate result
+    dst = _mm_setzero_si128();
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+    return dst;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the CF value.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
+FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
+                  vreinterpretq_s64_m128i(b));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the ZF value.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
+FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Extracts the selected signed or unsigned 8-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Inserts the least significant 8 bits of b into the selected 8-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
+//                                      __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        vreinterpretq_m128i_s8(                                    \
+            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+    })
+
+// Extracts the selected signed or unsigned 16-bit integer from a and zero
+// extends.
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Inserts the least significant 16 bits of b into the selected 16-bit integer
+// of a.
+// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s16(                                     \
+            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+    })
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm)                               \
+    __extension__({                                              \
+        vreinterpret_m64_s16(                                    \
+            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
+    })
+
+// Extracts the selected signed or unsigned 32-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extracts the selected single-precision (32-bit) floating-point from a.
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Inserts the least significant 32 bits of b into the selected 32-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
+//                                       __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s32(                                     \
+            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+    })
+
+// Extracts the selected signed or unsigned 64-bit integer from a and zero
+// extends.
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Inserts the least significant 64 bits of b into the selected 64-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
+//                                       __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s64(                                     \
+            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+    })
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcount)
+    return __builtin_popcount(a);
+#else
+    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
+#endif
+#else
+    uint32_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+
+    vst1_u32(&count, count32x2_val);
+    return count;
+#endif
+}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcountll)
+    return __builtin_popcountll(a);
+#else
+    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
+#endif
+#else
+    uint64_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+    uint64x1_t count64x1_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+    count64x1_val = vpaddl_u32(count32x2_val);
+    vst1_u64(&count, count64x1_val);
+    return count;
+#endif
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
+    do {                                                  \
+        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
+        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
+        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
+                            vget_low_f32(ROW23.val[0]));  \
+        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
+                            vget_low_f32(ROW23.val[1]));  \
+        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
+                            vget_high_f32(ROW23.val[0])); \
+        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
+                            vget_high_f32(ROW23.val[1])); \
+    } while (0)
+
+/* Crypto Extensions */
+
+#if defined(__ARM_FEATURE_CRYPTO)
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else  // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+                                    vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+                                    vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
+    uint8x16_t e =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
+    uint8x16_t f =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
+    uint8x16_t g =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
+    uint8x16_t h =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
+    uint8x16_t i =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
+    uint8x16_t j =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
+    uint8x16_t k =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+#endif  // ARMv7 polyfill
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+{
+    uint64x2_t a = vreinterpretq_u64_m128i(_a);
+    uint64x2_t b = vreinterpretq_u64_m128i(_b);
+    switch (imm & 0x11) {
+    case 0x00:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
+    case 0x01:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+    case 0x10:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
+    case 0x11:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+    default:
+        abort();
+    }
+}
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_DATA(w)                                           \
+    {                                                                  \
+        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+// In the absence of crypto extensions, implement aesenc using regular neon
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
+// for more information Reproduced with permission of the author.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
+                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
+                                         0xc, 0x1, 0x6, 0xb};
+    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
+
+    // shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
+
+    // mix columns
+    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    //  add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
+    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
+     (b0))
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
+    };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
+    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
+    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
+    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
+
+    __m128i out = _mm_set_epi32(
+        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+    return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    /* FIXME: optimized for NEON */
+    uint8_t v[4][4] = {
+        [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
+        [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
+        [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
+        [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
+               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
+    };
+    for (int i = 0; i < 16; i++)
+        vreinterpretq_nth_u8_m128i(a, i) =
+            v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
+    return a;
+}
+
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+//
+// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
+{
+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
+    for (int i = 0; i < 4; ++i) {
+        ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
+        ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
+    }
+    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+}
+#undef SSE2NEON_AES_DATA
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimize this away for repeated calls however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
+        vreinterpretq_u8_m128i(b));
+}
+
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
+                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+                         RoundKey);
+}
+
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE does ShiftRows and SubBytes on A
+    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+    uint8x16_t dest = {
+        // Undo ShiftRows step from AESE and extract X1 and X3
+        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
+        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
+        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
+        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
+    };
+    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+}
+#endif
+
+/* Streaming Extensions */
+
+// Guarantees that every preceding store is globally visible before any
+// subsequent store.
+// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_sfence(void)
+{
+    __sync_synchronize();
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Stores the data in a to the address p without polluting the caches.  If the
+// cache line containing address p is already in the cache, the cache will be
+// updated.
+// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, p);
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+#endif
+}
+
+// Load 128-bits of integer data from memory into dst using a non-temporal
+// memory hint. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+//   dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    return __builtin_nontemporal_load(p);
+#else
+    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+#endif
+}
+
+// Cache line containing p is flushed and invalidated from all caches in the
+// coherency domain. :
+// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+    // no corollary for Neon?
+}
+
+// Allocate aligned blocks of memory.
+// https://software.intel.com/en-us/
+//         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
+{
+    void *ptr;
+    if (align == 1)
+        return malloc(size);
+    if (align == 2 || (sizeof(void *) == 8 && align == 4))
+        align = sizeof(void *);
+    if (!posix_memalign(&ptr, align, size))
+        return ptr;
+    return NULL;
+}
+
+// Free aligned memory that was allocated with _mm_malloc.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
+FORCE_INLINE void _mm_free(void *addr)
+{
+    free(addr);
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc ^= v;
+    for (int bit = 0; bit < 8; bit++) {
+        if (crc & 1)
+            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
+        else
+            crc = (crc >> 1);
+    }
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 16-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u8(crc, v & 0xff);
+    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 32-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
+#endif
+    return crc;
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+#endif
diff --git a/thirdparty/embree/common/simd/avx.h b/thirdparty/embree/common/simd/avx.h
new file mode 100644
index 0000000000..d3100306ee
--- /dev/null
+++ b/thirdparty/embree/common/simd/avx.h
@@ -0,0 +1,34 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "sse.h"
+
+#if defined(__AVX512VL__)
+#include "vboolf8_avx512.h"
+#include "vboold4_avx512.h"
+#else
+#include "vboolf8_avx.h"
+#include "vboold4_avx.h"
+#endif
+
+#if defined(__AVX2__)
+#include "vint8_avx2.h"
+#include "vuint8_avx2.h"
+#if defined(__X86_64__)
+#include "vllong4_avx2.h"
+#endif
+#else
+#include "vint8_avx.h"
+#include "vuint8_avx.h"
+#endif
+#include "vfloat8_avx.h"
+#if defined(__X86_64__)
+#include "vdouble4_avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "avx512.h"
+#endif
+
diff --git a/thirdparty/embree/common/simd/avx512.h b/thirdparty/embree/common/simd/avx512.h
new file mode 100644
index 0000000000..d43bbacea1
--- /dev/null
+++ b/thirdparty/embree/common/simd/avx512.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../math/constants.h"
+#include "../sys/alloc.h"
+#include "varying.h"
+
+#include "vboolf16_avx512.h"
+#include "vint16_avx512.h"
+#include "vuint16_avx512.h"
+#include "vfloat16_avx512.h"
+
+#include "vboold8_avx512.h"
+#include "vllong8_avx512.h"
+#include "vdouble8_avx512.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Prefetching
+  ////////////////////////////////////////////////////////////////////////////////
+
+#define PFHINT_L1   0
+#define PFHINT_L2   1
+#define PFHINT_NT   2
+
+  template<const unsigned int mode>
+    __forceinline void prefetch(const void * __restrict__ const m)
+  {
+    if (mode == PFHINT_L1)
+      _mm_prefetch((const char*)m,_MM_HINT_T0); 
+    else if (mode == PFHINT_L2) 
+      _mm_prefetch((const char*)m,_MM_HINT_T1); 
+    else if (mode == PFHINT_NT) 
+      _mm_prefetch((const char*)m,_MM_HINT_NTA); 
+  }
+}
diff --git a/thirdparty/embree/common/simd/simd.h b/thirdparty/embree/common/simd/simd.h
new file mode 100644
index 0000000000..195506b530
--- /dev/null
+++ b/thirdparty/embree/common/simd/simd.h
@@ -0,0 +1,110 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+/* include SSE wrapper classes */
+#if defined(__SSE__)
+#  include "sse.h"
+#endif
+
+/* include AVX wrapper classes */
+#if defined(__AVX__)
+#  include "avx.h"
+#endif
+
+/* include AVX512 wrapper classes */
+#if defined (__AVX512F__)
+#  include "avx512.h"
+#endif
+
+namespace embree
+{
+  template <int N>
+  __forceinline vbool<N> isfinite(const vfloat<N>& v)
+  {
+    return (v >= vfloat<N>(-std::numeric_limits<float>::max()))
+         & (v <= vfloat<N>( std::numeric_limits<float>::max()));
+  }
+  
+  /* foreach unique */
+  template<typename vbool, typename vint, typename Closure>
+  __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure)
+  {
+    vbool valid1 = valid0;
+    while (any(valid1)) {
+      const int j = int(bsf(movemask(valid1)));
+      const int i = vi[j];
+      const vbool valid2 = valid1 & (i == vi);
+      valid1 = andn(valid1, valid2);
+      closure(valid2, i);
+    }
+  }
+
+  /* returns the next unique value i in vi and the corresponding valid_i mask */
+  template<typename vbool, typename vint>
+  __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+  {
+    assert(any(valid));
+    const int j = int(bsf(movemask(valid)));
+    const int i = vi[j];
+    valid_i = valid & (i == vi);
+    valid = andn(valid, valid_i);
+    return i;
+  }
+
+  /* foreach unique index */
+  template<typename vbool, typename vint, typename Closure>
+  __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure)
+  {
+    vbool valid1 = valid0;
+    while (any(valid1)) {
+      const int j = int(bsf(movemask(valid1)));
+      const int i = vi[j];
+      const vbool valid2 = valid1 & (i == vi);
+      valid1 = andn(valid1, valid2);
+      closure(valid2, i, j);
+    }
+  }
+
+  /* returns the index of the next unique value i in vi and the corresponding valid_i mask */
+  template<typename vbool, typename vint>
+  __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+  {
+    assert(any(valid));
+    const int j = int(bsf(movemask(valid)));
+    const int i = vi[j];
+    valid_i = valid & (i == vi);
+    valid = andn(valid, valid_i);
+    return j;
+  }
+
+  template<typename Closure>
+  __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure)
+  {
+    __aligned(64) int U[2*VSIZEX];
+    __aligned(64) int V[2*VSIZEX];
+    int index = 0;
+    for (int y=y0; y<y1; y++) {
+      const bool lasty = y+1>=y1;
+      const vintx vy = y;
+      for (int x=x0; x<x1; ) { //x+=VSIZEX) {
+        const bool lastx = x+VSIZEX >= x1;
+        vintx vx = x+vintx(step);
+        vintx::storeu(&U[index], vx);
+        vintx::storeu(&V[index], vy);
+        const int dx = min(x1-x,VSIZEX);
+        index += dx;
+        x += dx;
+        if (index >= VSIZEX || (lastx && lasty)) {
+          const vboolx valid = vintx(step) < vintx(index);
+          closure(valid, vintx::load(U), vintx::load(V));
+          x-= max(0, index-VSIZEX);
+          index = 0;
+        }
+      }
+    }
+  }
+}
diff --git a/thirdparty/embree/common/simd/sse.cpp b/thirdparty/embree/common/simd/sse.cpp
new file mode 100644
index 0000000000..535d6943d8
--- /dev/null
+++ b/thirdparty/embree/common/simd/sse.cpp
@@ -0,0 +1,34 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sse.h"
+
+namespace embree 
+{
+  const __m128 mm_lookupmask_ps[16] = {
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1))
+  };
+
+  const __m128d mm_lookupmask_pd[4] = {
+    _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)),
+    _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)),
+    _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)),
+    _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1))
+  };
+
+}
diff --git a/thirdparty/embree/common/simd/sse.h b/thirdparty/embree/common/simd/sse.h
new file mode 100644
index 0000000000..1465fb4fb0
--- /dev/null
+++ b/thirdparty/embree/common/simd/sse.h
@@ -0,0 +1,35 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../sys/alloc.h"
+#include "../math/constants.h"
+#include "varying.h"
+
+namespace embree 
+{
+#if defined(__SSE4_1__)
+  __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { 
+    return _mm_blendv_ps(f,t,mask);
+  }
+#else
+  __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { 
+    return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f)); 
+  }
+#endif
+
+  extern const __m128  mm_lookupmask_ps[16];
+  extern const __m128d mm_lookupmask_pd[4];
+}
+
+#if defined(__AVX512VL__)
+#include "vboolf4_avx512.h"
+#else
+#include "vboolf4_sse2.h"
+#endif
+#include "vint4_sse2.h"
+#include "vuint4_sse2.h"
+#include "vfloat4_sse2.h"
diff --git a/thirdparty/embree/common/simd/varying.h b/thirdparty/embree/common/simd/varying.h
new file mode 100644
index 0000000000..9b98d326be
--- /dev/null
+++ b/thirdparty/embree/common/simd/varying.h
@@ -0,0 +1,145 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+namespace embree
+{
+  /* Varying numeric types */
+  template<int N>
+  struct vfloat_impl
+  {
+    union { float f[N]; int i[N]; };
+    __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; }
+    __forceinline       float& operator [](size_t index)       { assert(index < N); return f[index]; }
+  };
+
+  template<int N>
+  struct vdouble_impl
+  {
+    union { double f[N]; long long i[N]; };
+    __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; }
+    __forceinline       double& operator [](size_t index)       { assert(index < N); return f[index]; }
+  };
+
+  template<int N>
+  struct vint_impl
+  {
+    int i[N];
+    __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; }
+    __forceinline       int& operator [](size_t index)       { assert(index < N); return i[index]; }
+  };
+
+  template<int N>
+  struct vuint_impl
+  {
+    unsigned int i[N];
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < N); return i[index]; }
+  };
+
+  template<int N>
+  struct vllong_impl
+  {
+    long long i[N];
+    __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; }
+    __forceinline       long long& operator [](size_t index)       { assert(index < N); return i[index]; }
+  };
+
+  /* Varying bool types */
+  template<int N> struct vboolf_impl { int       i[N]; }; // for float/int
+  template<int N> struct vboold_impl { long long i[N]; }; // for double/long long
+ 
+  /* Varying size constants */
+#if defined(__AVX512VL__) // SKX
+  const int VSIZEX = 8;  // default size
+  const int VSIZEL = 16; // large size
+#elif defined(__AVX__)
+  const int VSIZEX = 8;
+  const int VSIZEL = 8;
+#else
+  const int VSIZEX = 4;
+  const int VSIZEL = 4;
+#endif
+
+  template<int N>
+  struct vtypes {
+    using vbool = vboolf_impl<N>;
+    using vboolf = vboolf_impl<N>;
+    using vboold = vboold_impl<N>;
+    using vint = vint_impl<N>;
+    using vuint = vuint_impl<N>;
+    using vllong = vllong_impl<N>;
+    using vfloat = vfloat_impl<N>;
+    using vdouble = vdouble_impl<N>;
+  };
+
+  template<>
+  struct vtypes<1> {
+    using vbool = bool;
+    using vboolf = bool;
+    using vboold = bool;
+    using vint = int;
+    using vuint = unsigned int;
+    using vllong = long long;
+    using vfloat = float;
+    using vdouble = double;
+  };
+
+  /* Aliases to default types */
+  template<int N> using vbool = typename vtypes<N>::vbool;
+  template<int N> using vboolf = typename vtypes<N>::vboolf;
+  template<int N> using vboold = typename vtypes<N>::vboold;
+  template<int N> using vint = typename vtypes<N>::vint;
+  template<int N> using vuint = typename vtypes<N>::vuint;
+  template<int N> using vllong = typename vtypes<N>::vllong;
+  template<int N> using vreal = typename vtypes<N>::vfloat;
+  template<int N> using vfloat = typename vtypes<N>::vfloat;
+  template<int N> using vdouble = typename vtypes<N>::vdouble;
+
+  /* 4-wide shortcuts */
+  typedef vfloat<4>  vfloat4;
+  typedef vdouble<4> vdouble4;
+  typedef vreal<4>   vreal4;
+  typedef vint<4>    vint4;
+  typedef vuint<4>  vuint4;
+  typedef vllong<4>  vllong4;
+  typedef vbool<4>   vbool4;
+  typedef vboolf<4>  vboolf4;
+  typedef vboold<4>  vboold4;
+
+  /* 8-wide shortcuts */
+  typedef vfloat<8>  vfloat8;
+  typedef vdouble<8> vdouble8;
+  typedef vreal<8>   vreal8;
+  typedef vint<8>    vint8;
+  typedef vuint<8>    vuint8;
+  typedef vllong<8>  vllong8;
+  typedef vbool<8>   vbool8;
+  typedef vboolf<8>  vboolf8;
+  typedef vboold<8>  vboold8;
+
+  /* 16-wide shortcuts */
+  typedef vfloat<16>  vfloat16;
+  typedef vdouble<16> vdouble16;
+  typedef vreal<16>   vreal16;
+  typedef vint<16>    vint16;
+  typedef vuint<16>   vuint16;
+  typedef vllong<16>  vllong16;
+  typedef vbool<16>   vbool16;
+  typedef vboolf<16>  vboolf16;
+  typedef vboold<16>  vboold16;
+
+  /* Default shortcuts */
+  typedef vfloat<VSIZEX>  vfloatx;
+  typedef vdouble<VSIZEX> vdoublex;
+  typedef vreal<VSIZEX>   vrealx;
+  typedef vint<VSIZEX>    vintx;
+  typedef vuint<VSIZEX>   vuintx;
+  typedef vllong<VSIZEX>  vllongx;
+  typedef vbool<VSIZEX>   vboolx;
+  typedef vboolf<VSIZEX>  vboolfx;
+  typedef vboold<VSIZEX>  vbooldx;
+}
diff --git a/thirdparty/embree/common/simd/vboold4_avx.h b/thirdparty/embree/common/simd/vboold4_avx.h
new file mode 100644
index 0000000000..7db0d1c5c1
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboold4_avx.h
@@ -0,0 +1,169 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide AVX bool type for 64bit data types*/
+  template<>
+  struct vboold<4>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboold4 Bool;
+
+    enum  { size = 4 };       // number of SIMD elements
+    union {                   // data
+      __m256d v;
+      struct { __m128d vl,vh; };
+      long long i[4];
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold4& a) { v = a.v; }
+    __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; }
+
+    __forceinline vboold(__m256d a) : v(a) {}
+    __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {}
+
+    __forceinline operator const __m256() const { return _mm256_castpd_ps(v); }
+    __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); }
+    __forceinline operator const __m256d() const { return v; }
+
+    __forceinline vboold(int a)
+    {
+      assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+      const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1);
+      const __m256i b = _mm256_set1_epi64x(a);
+      const __m256i c = _mm256_and_si256(b,mask);
+      v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask));
+#else
+      vl = mm_lookupmask_pd[a & 0x3];
+      vh = mm_lookupmask_pd[a >> 2];
+#endif
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {}
+    __forceinline vboold(TrueTy)  : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool       operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; }
+    __forceinline long long& operator [](size_t index)       { assert(index < 4); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); }
+  __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); }
+  __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); }
+
+  __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); }
+
+  __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; }
+  __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; }
+  __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); }
+  __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); }
+
+  __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) {
+    return _mm256_blendv_pd(f, t, mask); 
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); }
+  __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); }
+
+
+#if defined(__AVX2__)
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboold4 shuffle(const vboold4& v) {
+    return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i>
+  __forceinline vboold4 shuffle(const vboold4& v) {
+    return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i));
+  }
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; }
+  __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); }
+
+  __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; }
+  __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); }
+  __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; }
+
+  __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); }
+  __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); }
+  __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); }
+
+  __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); }
+  __forceinline size_t       popcnt  (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold4& a, size_t index) { return a[index]; }
+  __forceinline void set  (vboold4& a, size_t index)     { a[index] = -1; }
+  __forceinline void clear(vboold4& a, size_t index)     { a[index] =  0; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", "
+                       << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboold4_avx512.h b/thirdparty/embree/common/simd/vboold4_avx512.h
new file mode 100644
index 0000000000..ceaad7bba5
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboold4_avx512.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide AVX-512 bool type */
+  template<>
+  struct vboold<4>
+  {
+    typedef vboold4 Bool;
+    typedef vint4   Int;
+
+    enum { size = 4 }; // number of SIMD elements
+    __mmask8 v;        // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold4& t) { v = t.v; }
+    __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; }
+
+    __forceinline vboold(const __mmask8 &t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboold(bool b) { v = b ? 0xf : 0x0; }
+    __forceinline vboold(int t)  { v = (__mmask8)t; }
+    __forceinline vboold(unsigned int t) { v = (__mmask8)t; }
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask */
+    __forceinline __m128i mask32() const {
+      return _mm_movm_epi32(v);
+    }
+
+    /* return int64 mask */
+    __forceinline __m256i mask64() const {
+      return _mm256_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(0x0) {}
+    __forceinline vboold(TrueTy)  : v(0xf) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 4); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); }
+  __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); }
+  __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; }
+  __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; }
+  __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); }
+  __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); }
+
+  __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) {
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboold4& a) { return a.v == 0xf; }
+  __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; }
+  __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; }
+
+  __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); }
+  __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); }
+  __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); }
+
+  __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); }
+  __forceinline size_t popcnt  (const vboold4& a) { return popcnt(a.v); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; }
+  __forceinline void set(vboold4& a, size_t index)       { assert(index < 4); a |= 1 << index; }
+  __forceinline void clear(vboold4& a, size_t index)     { assert(index < 4); a = andn(a, 1 << index); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a)
+  {
+    cout << "<";
+    for (size_t i=0; i<4; i++) {
+      if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+    }
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboold8_avx512.h b/thirdparty/embree/common/simd/vboold8_avx512.h
new file mode 100644
index 0000000000..66d2054872
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboold8_avx512.h
@@ -0,0 +1,151 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX-512 bool type */
+  template<>
+  struct vboold<8>
+  {
+    typedef vboold8 Bool;
+    typedef vint8   Int;
+
+    enum { size = 8 }; // number of SIMD elements
+    __mmask8 v;        // data
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold8& t) { v = t.v; }
+    __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; }
+
+    __forceinline vboold(const __mmask8& t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+    
+    __forceinline vboold(bool b) { v = b ? 0xff : 0x00; }
+    __forceinline vboold(int t)  { v = (__mmask8)t; }
+    __forceinline vboold(unsigned int t) { v = (__mmask8)t; }
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int64 mask */
+    __forceinline __m512i mask64() const { 
+      return _mm512_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(0x00) {}
+    __forceinline vboold(TrueTy)  : v(0xff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 8); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); }
+  __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); }
+  __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; }
+  __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; }
+  __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }
+  __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); }
+  
+  __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) {
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline int all (const vboold8& a) { return a.v == 0xff; }
+  __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; }
+  __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; }
+
+  __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); }
+  __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); }
+  __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); }
+  
+  __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); }
+  __forceinline size_t popcnt  (const vboold8& a) { return popcnt(a.v); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; }
+  __forceinline void set(vboold8& a, size_t index)       { assert(index < 8); a |= 1 << index; }
+  __forceinline void clear(vboold8& a, size_t index)     { assert(index < 8); a = andn(a, 1 << index); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a)
+  {
+    cout << "<";
+    for (size_t i=0; i<8; i++) {
+      if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+    }
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf16_avx512.h b/thirdparty/embree/common/simd/vboolf16_avx512.h
new file mode 100644
index 0000000000..19841dcea8
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf16_avx512.h
@@ -0,0 +1,153 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 16-wide AVX-512 bool type */
+  template<>
+  struct vboolf<16>
+  {
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum { size = 16 }; // number of SIMD elements
+    __mmask16 v;        // data
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboolf() {}
+    __forceinline vboolf(const vboolf16& t) { v = t.v; }
+    __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask16& t) { v = t; }
+    __forceinline operator __mmask16() const { return v; }
+    
+    __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; }
+    __forceinline vboolf(int t) { v = (__mmask16)t; }
+    __forceinline vboolf(unsigned int t) { v = (__mmask16)t; }
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask */
+    __forceinline __m512i mask32() const {
+      return _mm512_movm_epi32(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x0000) {}
+    __forceinline vboolf(TrueTy)  : v(0xffff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+  
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 16); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); }
+  
+   ////////////////////////////////////////////////////////////////////////////////
+   /// Binary Operators
+   ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); }
+  __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); }
+  __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); }
+
+  __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; }
+  __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; }
+  __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); }
+  __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); }
+  
+  __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) {
+    return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline int all (const vboolf16& a) { return  _mm512_kortestc(a,a) != 0; }
+  __forceinline int any (const vboolf16& a) { return  _mm512_kortestz(a,a) == 0; }
+  __forceinline int none(const vboolf16& a) { return  _mm512_kortestz(a,a) != 0; }
+
+  __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); }
+  __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); }
+  __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); }
+  
+  __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); }
+  __forceinline size_t popcnt  (const vboolf16& a) { return popcnt(a.v); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Convertion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); }
+  __forceinline vboolf16     toMask(const int& a)      { return mm512_int2mask(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; }
+  __forceinline void set(vboolf16& a, size_t index)       { assert(index < 16); a |= 1 << index; }
+  __forceinline void clear(vboolf16& a, size_t index)     { assert(index < 16); a = andn(a, 1 << index); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a)
+  {
+    cout << "<";
+    for (size_t i=0; i<16; i++) {
+      if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+    }
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf4_avx512.h b/thirdparty/embree/common/simd/vboolf4_avx512.h
new file mode 100644
index 0000000000..e65f66b025
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf4_avx512.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide AVX-512 bool type */
+  template<>
+  struct vboolf<4>
+  {
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+
+    enum { size = 4 }; // number of SIMD elements
+    __mmask8 v;        // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {}
+    __forceinline vboolf(const vboolf4& t) { v = t.v; }
+    __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask8 &t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboolf(bool b) { v = b ? 0xf : 0x0; }
+    __forceinline vboolf(int t)  { v = (__mmask8)t; }
+    __forceinline vboolf(unsigned int t) { v = (__mmask8)t; }
+
+    __forceinline vboolf(bool a, bool b, bool c, bool d)
+      : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {}
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask */
+    __forceinline __m128i mask32() const {
+      return _mm_movm_epi32(v);
+    }
+
+    /* return int64 mask */
+    __forceinline __m256i mask64() const {
+      return _mm256_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x0) {}
+    __forceinline vboolf(TrueTy)  : v(0xf) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 4); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); }
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); }
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; }
+  __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; }
+  __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); }
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); }
+
+  __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) {
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboolf4& a) { return a.v == 0xf; }
+  __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; }
+  __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; }
+
+  __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); }
+  __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); }
+  __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }
+
+  __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); }
+  __forceinline size_t popcnt  (const vboolf4& a) { return popcnt(a.v); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; }
+  __forceinline void set(vboolf4& a, size_t index)       { assert(index < 4); a |= 1 << index; }
+  __forceinline void clear(vboolf4& a, size_t index)     { assert(index < 4); a = andn(a, 1 << index); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a)
+  {
+    cout << "<";
+    for (size_t i=0; i<4; i++) {
+      if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+    }
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h
new file mode 100644
index 0000000000..fa84b1b6ee
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf4_sse2.h
@@ -0,0 +1,189 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide SSE bool type */
+  template<>
+  struct vboolf<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };            // number of SIMD elements
+    union { __m128 v; int i[4]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboolf() {}
+    __forceinline vboolf(const vboolf4& other) { v = other.v; }
+    __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; }
+
+    __forceinline vboolf(__m128 input) : v(input) {}
+    __forceinline operator const __m128&() const { return v; }
+    __forceinline operator const __m128i() const { return _mm_castps_si128(v); }
+    __forceinline operator const __m128d() const { return _mm_castps_pd(v); }
+    
+    __forceinline vboolf(bool a)
+      : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+    __forceinline vboolf(bool a, bool b)
+      : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+    __forceinline vboolf(bool a, bool b, bool c, bool d)
+      : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+    __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; }
+    __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; }
+
+    /* return int32 mask */
+    __forceinline __m128i mask32() const { 
+      return _mm_castps_si128(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {}
+    __forceinline vboolf(TrueTy)  : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; }
+    __forceinline int& operator [](size_t index)       { assert(index < 4); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); }
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); }
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); }
+
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; }
+  __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; }
+  __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); }
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+  
+  __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
+#if defined(__SSE4_1__)
+    return _mm_blendv_ps(f, t, m); 
+#else
+    return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); 
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); }
+  __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& v) {
+    return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
+    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0>
+  __forceinline vboolf4 shuffle(const vboolf4& v) {
+    return shuffle<i0,i0,i0,i0>(v);
+  }
+
+#if defined(__SSE3__)
+  template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); }
+  template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); }
+  template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); }
+#endif
+
+#if defined(__SSE4_1__)
+  template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+  template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); }
+  template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); }
+#endif
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; }
+  __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; }
+
+  __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; }
+  __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; }
+  __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; }
+
+  __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); }
+  __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); }
+  __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }
+  
+  __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); }
+#if defined(__SSE4_2__)
+  __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); }
+#else
+  __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; }
+  __forceinline void set(vboolf4& a, size_t index)       { a[index] = -1; }
+  __forceinline void clear(vboolf4& a, size_t index)     { a[index] =  0; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf8_avx.h b/thirdparty/embree/common/simd/vboolf8_avx.h
new file mode 100644
index 0000000000..ba77cc3c5e
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf8_avx.h
@@ -0,0 +1,202 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX bool type */
+  template<>
+  struct vboolf<8>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };       // number of SIMD elements
+    union {                   // data
+      __m256 v;
+      struct { __m128 vl,vh; };
+      int i[8];
+    };  
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {}
+    __forceinline vboolf(const vboolf8& a) { v = a.v; }
+    __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; }
+
+    __forceinline vboolf(__m256 a) : v(a) {}
+    __forceinline operator const __m256&() const { return v; }
+    __forceinline operator const __m256i() const { return _mm256_castps_si256(v); }
+    __forceinline operator const __m256d() const { return _mm256_castps_pd(v); }
+
+    __forceinline vboolf(int a)
+    {
+      assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+      const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1);
+      const __m256i b = _mm256_set1_epi32(a);
+      const __m256i c = _mm256_and_si256(b,mask);
+      v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask));
+#else
+      vl = mm_lookupmask_ps[a & 0xF];
+      vh = mm_lookupmask_ps[a >> 4];
+#endif
+    }
+
+    __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
+    __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+    __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {}
+
+    __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {}
+    __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {}
+    __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {}
+    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {}
+
+    /* return int32 mask */
+    __forceinline __m256i mask32() const { 
+      return _mm256_castps_si256(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {}
+    __forceinline vboolf(TrueTy)  : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; }
+    __forceinline int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); }
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); }
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); }
+
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); }
+
+  __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; }
+  __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; }
+  __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); }
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); }
+
+  __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) {
+    return _mm256_blendv_ps(f, t, mask); 
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); }
+  __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); }
+
+  template<int i>
+  __forceinline vboolf8 shuffle(const vboolf8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1>
+  __forceinline vboolf8 shuffle4(const vboolf8& v) {
+    return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) {
+    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf8 shuffle(const vboolf8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) {
+    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); }
+  template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); }
+  template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
+
+  template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); }
+  template<int i> __forceinline vboolf4 extract4   (const vboolf8& a) { return _mm256_extractf128_ps(a, i); }
+  template<>      __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a);   }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; }
+  __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); }
+
+  __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; }
+  __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); }
+  __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; }
+
+  __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); }
+  __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); }
+  __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); }
+
+  __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); }
+  __forceinline size_t       popcnt  (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; }
+  __forceinline void set(vboolf8& a, size_t index)       { a[index] = -1; }
+  __forceinline void clear(vboolf8& a, size_t index)     { a[index] =  0; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", "
+                       << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vboolf8_avx512.h b/thirdparty/embree/common/simd/vboolf8_avx512.h
new file mode 100644
index 0000000000..73ff5666e1
--- /dev/null
+++ b/thirdparty/embree/common/simd/vboolf8_avx512.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX-512 bool type */
+  template<>
+  struct vboolf<8>
+  {
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+
+    enum { size = 8 }; // number of SIMD elements
+    __mmask8 v;        // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {}
+    __forceinline vboolf(const vboolf8& t) { v = t.v; }
+    __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask8 &t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; }
+    __forceinline vboolf(int t)  { v = (__mmask8)t; }
+    __forceinline vboolf(unsigned int t) { v = (__mmask8)t; }
+
+    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h)
+      : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {}
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask */
+    __forceinline __m256i mask32() const {
+      return _mm256_movm_epi32(v);
+    }
+
+    /* return int64 mask */
+    __forceinline __m512i mask64() const {
+      return _mm512_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x00) {}
+    __forceinline vboolf(TrueTy)  : v(0xff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 8); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); }
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); }
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; }
+  __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; }
+  __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); }
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); }
+
+  __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) {
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboolf8& a) { return a.v == 0xff; }
+  __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; }
+  __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; }
+
+  __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); }
+  __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); }
+  __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); }
+
+  __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); }
+  __forceinline size_t popcnt  (const vboolf8& a) { return popcnt(a.v); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; }
+  __forceinline void set(vboolf8& a, size_t index)       { assert(index < 8); a |= 1 << index; }
+  __forceinline void clear(vboolf8& a, size_t index)     { assert(index < 8); a = andn(a, 1 << index); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a)
+  {
+    cout << "<";
+    for (size_t i=0; i<8; i++) {
+      if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+    }
+    return cout << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vdouble4_avx.h b/thirdparty/embree/common/simd/vdouble4_avx.h
new file mode 100644
index 0000000000..55326de7dd
--- /dev/null
+++ b/thirdparty/embree/common/simd/vdouble4_avx.h
@@ -0,0 +1,321 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 4-wide AVX 64-bit double type */
+  template<>
+  struct vdouble<4>
+  {
+    ALIGNED_STRUCT_(32);
+            
+    typedef vboold4 Bool;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union {             // data
+      __m256d v; 
+      double i[4]; 
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vdouble() {}
+    __forceinline vdouble(const vdouble4& t) { v = t.v; }
+    __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; }
+
+    __forceinline vdouble(const __m256d& t) { v = t; }
+    __forceinline operator __m256d() const { return v; }
+
+    __forceinline vdouble(double i) {
+      v = _mm256_set1_pd(i);
+    }
+    
+    __forceinline vdouble(double a, double b, double c, double d) {
+      v = _mm256_set_pd(d,c,b,a);      
+    }
+   
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {}
+    __forceinline vdouble(OneTy)  : v(_mm256_set1_pd(1)) {}
+    __forceinline vdouble(StepTy) : v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {}
+    __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) {
+      _mm256_stream_pd(ptr, a);
+    }
+
+    static __forceinline vdouble4 loadu(const double* addr) {
+      return _mm256_loadu_pd(addr);
+    }
+
+    static __forceinline vdouble4 load(const vdouble4* addr) {
+      return _mm256_load_pd((double*)addr);
+    }
+
+    static __forceinline vdouble4 load(const double* addr) {
+      return _mm256_load_pd(addr);
+    }
+
+    static __forceinline void store(double* ptr, const vdouble4& v) {
+      _mm256_store_pd(ptr, v);
+    }
+
+    static __forceinline void storeu(double* ptr, const vdouble4& v) {
+      _mm256_storeu_pd(ptr, v);
+    }
+
+    static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       double& operator [](size_t index)       { assert(index < 4); return i[index]; }
+    __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; }
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline vdouble4 asDouble(const vllong4&  a) { return _mm256_castsi256_pd(a); }
+  __forceinline vllong4  asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); }
+#endif
+
+  __forceinline vdouble4 operator +(const vdouble4& a) { return a; }
+  __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); }
+  __forceinline vdouble4 operator +(const vdouble4& a, double          b) { return a + vdouble4(b); }
+  __forceinline vdouble4 operator +(double          a, const vdouble4& b) { return vdouble4(a) + b; }
+
+  __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); }
+  __forceinline vdouble4 operator -(const vdouble4& a, double          b) { return a - vdouble4(b); }
+  __forceinline vdouble4 operator -(double          a, const vdouble4& b) { return vdouble4(a) - b; }
+
+  __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); }
+  __forceinline vdouble4 operator *(const vdouble4& a, double          b) { return a * vdouble4(b); }
+  __forceinline vdouble4 operator *(double          a, const vdouble4& b) { return vdouble4(a) * b; }
+
+  __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); }
+  __forceinline vdouble4 operator &(const vdouble4& a, double          b) { return a & vdouble4(b); }
+  __forceinline vdouble4 operator &(double          a, const vdouble4& b) { return vdouble4(a) & b; }
+
+  __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); }
+  __forceinline vdouble4 operator |(const vdouble4& a, double          b) { return a | vdouble4(b); }
+  __forceinline vdouble4 operator |(double          a, const vdouble4& b) { return vdouble4(a) | b; }
+
+  __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); }
+  __forceinline vdouble4 operator ^(const vdouble4& a, double          b) { return a ^ vdouble4(b); }
+  __forceinline vdouble4 operator ^(double          a, const vdouble4& b) { return vdouble4(a) ^ b; }
+  
+  __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); }
+  __forceinline vdouble4 min(const vdouble4& a, double          b) { return min(a,vdouble4(b)); }
+  __forceinline vdouble4 min(double          a, const vdouble4& b) { return min(vdouble4(a),b); }
+
+  __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); }
+  __forceinline vdouble4 max(const vdouble4& a, double          b) { return max(a,vdouble4(b)); }
+  __forceinline vdouble4 max(double          a, const vdouble4& b) { return max(vdouble4(a),b); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__FMA__)
+  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); }
+  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); }
+  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); }
+  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); }
+#else
+  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; }
+  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; }
+  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;}
+  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; }
+  __forceinline vdouble4& operator +=(vdouble4& a, double          b) { return a = a + b; }
+  
+  __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; }
+  __forceinline vdouble4& operator -=(vdouble4& a, double          b) { return a = a - b; }
+
+  __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; }
+  __forceinline vdouble4& operator *=(vdouble4& a, double          b) { return a = a * b; }
+  
+  __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; }
+  __forceinline vdouble4& operator &=(vdouble4& a, double          b) { return a = a & b; }
+  
+  __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; }
+  __forceinline vdouble4& operator |=(vdouble4& a, double          b) { return a = a | b; }
+  
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);  }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS);  }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS);  }
+#endif
+
+  __forceinline vboold4 operator ==(const vdouble4& a, double          b) { return a == vdouble4(b); }
+  __forceinline vboold4 operator ==(double          a, const vdouble4& b) { return vdouble4(a) == b; }
+
+  __forceinline vboold4 operator !=(const vdouble4& a, double          b) { return a != vdouble4(b); }
+  __forceinline vboold4 operator !=(double          a, const vdouble4& b) { return vdouble4(a) != b; }
+
+  __forceinline vboold4 operator < (const vdouble4& a, double          b) { return a <  vdouble4(b); }
+  __forceinline vboold4 operator < (double          a, const vdouble4& b) { return vdouble4(a) <  b; }
+
+  __forceinline vboold4 operator >=(const vdouble4& a, double          b) { return a >= vdouble4(b); }
+  __forceinline vboold4 operator >=(double          a, const vdouble4& b) { return vdouble4(a) >= b; }
+
+  __forceinline vboold4 operator > (const vdouble4& a, double          b) { return a >  vdouble4(b); }
+  __forceinline vboold4 operator > (double          a, const vdouble4& b) { return vdouble4(a) >  b; }
+
+  __forceinline vboold4 operator <=(const vdouble4& a, double          b) { return a <= vdouble4(b); }
+  __forceinline vboold4 operator <=(double          a, const vdouble4& b) { return vdouble4(a) <= b; }
+
+  __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; }
+  __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; }
+  __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a <  b; }
+  __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; }
+  __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a >  b; }
+  __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); }
+  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); }
+  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <  b); }
+  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); }
+  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >  b); }
+  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); }
+#endif
+ 
+  __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) {
+#if defined(__AVX512VL__)
+    return _mm256_mask_blend_pd(m, f, t);
+#else
+    return _mm256_blendv_pd(f, t, m);
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1>
+  __forceinline vdouble4 shuffle(const vdouble4& v) {
+    return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
+  }
+
+  template<int i>
+  __forceinline vdouble4 shuffle(const vdouble4& v) {
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1>
+  __forceinline vdouble4 shuffle2(const vdouble4& v) {
+    return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0);
+  }
+
+  __forceinline double toScalar(const vdouble4& v) {
+    return _mm_cvtsd_f64(_mm256_castpd256_pd128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); }
+  __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 x = vreduce_min2(y); return min(x, shuffle2<1,0>(x)); }
+
+  __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x,shuffle<1,0>(x)); }
+  __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 x = vreduce_max2(y); return max(x, shuffle2<1,0>(x)); }
+
+  __forceinline vdouble4 vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); }
+  __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); }
+
+  __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); }
+  __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); }
+
+  __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); }
+  __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); }
+
+  __forceinline double reduce_add(const vdouble4& a) { return toScalar(vreduce_add(a)); }
+  __forceinline double reduce_min(const vdouble4& a) { return toScalar(vreduce_min(a)); }
+  __forceinline double reduce_max(const vdouble4& a) { return toScalar(vreduce_max(a)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v)
+  {
+    cout << "<" << v[0];
+    for (size_t i=1; i<4; i++) cout << ", " << v[i];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vdouble8_avx512.h b/thirdparty/embree/common/simd/vdouble8_avx512.h
new file mode 100644
index 0000000000..98d21bfe4a
--- /dev/null
+++ b/thirdparty/embree/common/simd/vdouble8_avx512.h
@@ -0,0 +1,351 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX-512 64-bit double type */
+  template<>
+  struct vdouble<8>
+  {
+    ALIGNED_STRUCT_(64);
+    
+    typedef vboold8 Bool;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {              // data
+      __m512d v;
+      double i[8];
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble() {}
+    __forceinline vdouble(const vdouble8& t) { v = t.v; }
+    __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; }
+
+    __forceinline vdouble(const __m512d& t) { v = t; }
+    __forceinline operator __m512d() const { return v; }
+    __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); }
+
+    __forceinline vdouble(double i) {
+      v = _mm512_set1_pd(i);
+    }
+
+    __forceinline vdouble(double a, double b, double c, double d) {
+      v = _mm512_set4_pd(d,c,b,a);
+    }
+
+    __forceinline vdouble(double a0, double a1, double a2, double a3,
+                          double a4, double a5, double a6, double a7)
+    {
+      v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0);
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {}
+    __forceinline vdouble(OneTy)  : v(_mm512_set1_pd(1)) {}
+    __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {}
+    __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) {
+      _mm512_stream_pd((double*)ptr, a);
+    }
+
+    static __forceinline vdouble8 loadu(const void* addr) {
+      return _mm512_loadu_pd((double*)addr);
+    }
+
+    static __forceinline vdouble8 load(const vdouble8* addr) {
+      return _mm512_load_pd((double*)addr);
+    }
+
+    static __forceinline vdouble8 load(const double* addr) {
+      return _mm512_load_pd(addr);
+    }
+
+    static __forceinline void store(void* ptr, const vdouble8& v) {
+      _mm512_store_pd(ptr, v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vdouble8& v) {
+      _mm512_storeu_pd(ptr, v);
+    }
+
+    static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) {
+      _mm512_mask_storeu_pd(ptr, mask, f);
+    }
+
+    static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) {
+      _mm512_mask_store_pd(addr, mask, v2);
+    }
+
+    static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) {
+      return _mm512_mask_compress_pd(v, mask, v);
+    }
+
+    static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) {
+      return _mm512_mask_compress_pd(a, mask, b);
+    }
+
+    static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline       double& operator [](size_t index)       { assert(index < 8); return i[index]; }
+    __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; }
+
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 asDouble(const vllong8&  a) { return _mm512_castsi512_pd(a); }
+  __forceinline vllong8  asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); }
+
+  __forceinline vdouble8 operator +(const vdouble8& a) { return a; }
+  __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); }
+  __forceinline vdouble8 operator +(const vdouble8& a, double          b) { return a + vdouble8(b); }
+  __forceinline vdouble8 operator +(double          a, const vdouble8& b) { return vdouble8(a) + b; }
+
+  __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); }
+  __forceinline vdouble8 operator -(const vdouble8& a, double          b) { return a - vdouble8(b); }
+  __forceinline vdouble8 operator -(double          a, const vdouble8& b) { return vdouble8(a) - b; }
+
+  __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); }
+  __forceinline vdouble8 operator *(const vdouble8& a, double          b) { return a * vdouble8(b); }
+  __forceinline vdouble8 operator *(double          a, const vdouble8& b) { return vdouble8(a) * b; }
+
+  __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); }
+  __forceinline vdouble8 operator &(const vdouble8& a, double          b) { return a & vdouble8(b); }
+  __forceinline vdouble8 operator &(double          a, const vdouble8& b) { return vdouble8(a) & b; }
+
+  __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); }
+  __forceinline vdouble8 operator |(const vdouble8& a, double          b) { return a | vdouble8(b); }
+  __forceinline vdouble8 operator |(double          a, const vdouble8& b) { return vdouble8(a) | b; }
+
+  __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); }
+  __forceinline vdouble8 operator ^(const vdouble8& a, double          b) { return a ^ vdouble8(b); }
+  __forceinline vdouble8 operator ^(double          a, const vdouble8& b) { return vdouble8(a) ^ b; }
+
+  __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); }
+  __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); }
+
+  __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); }
+  __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); }
+
+  __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return  _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); }
+  __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return  _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); }
+  __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return  _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); }
+
+  __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); }
+  __forceinline vdouble8 min(const vdouble8& a, double          b) { return min(a,vdouble8(b)); }
+  __forceinline vdouble8 min(double          a, const vdouble8& b) { return min(vdouble8(a),b); }
+
+  __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); }
+  __forceinline vdouble8 max(const vdouble8& a, double          b) { return max(a,vdouble8(b)); }
+  __forceinline vdouble8 max(double          a, const vdouble8& b) { return max(vdouble8(a),b); }
+
+  __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); }
+  __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); }
+
+  __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); }
+  __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); }
+  __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); }
+  __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); }
+  __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; }
+  __forceinline vdouble8& operator +=(vdouble8& a, double          b) { return a = a + b; }
+
+  __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; }
+  __forceinline vdouble8& operator -=(vdouble8& a, double          b) { return a = a - b; }
+
+  __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; }
+  __forceinline vdouble8& operator *=(vdouble8& a, double          b) { return a = a * b; }
+
+  __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; }
+  __forceinline vdouble8& operator &=(vdouble8& a, double          b) { return a = a & b; }
+
+  __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; }
+  __forceinline vdouble8& operator |=(vdouble8& a, double          b) { return a = a | b; }
+
+  __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { return a = a << b; }
+  __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { return a = a >> b; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(const vdouble8& a, double          b) { return a == vdouble8(b); }
+  __forceinline vboold8 operator ==(double          a, const vdouble8& b) { return vdouble8(a) == b; }
+
+  __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vdouble8& a, double          b) { return a != vdouble8(b); }
+  __forceinline vboold8 operator !=(double          a, const vdouble8& b) { return vdouble8(a) != b; }
+
+  __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vdouble8& a, double          b) { return a <  vdouble8(b); }
+  __forceinline vboold8 operator < (double          a, const vdouble8& b) { return vdouble8(a) <  b; }
+
+  __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vdouble8& a, double          b) { return a >= vdouble8(b); }
+  __forceinline vboold8 operator >=(double          a, const vdouble8& b) { return vdouble8(a) >= b; }
+
+  __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vdouble8& a, double          b) { return a >  vdouble8(b); }
+  __forceinline vboold8 operator > (double          a, const vdouble8& b) { return vdouble8(a) >  b; }
+
+  __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vdouble8& a, double          b) { return a <= vdouble8(b); }
+  __forceinline vboold8 operator <=(double          a, const vdouble8& b) { return vdouble8(a) <= b; }
+
+  __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
+
+  __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); }
+
+  __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) {
+    return _mm512_mask_or_pd(f,m,t,t);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
+  }
+
+  template<int i>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vdouble8 shuffle4(const vdouble8& v) {
+    return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+  }
+
+  template<int i>
+  __forceinline vdouble8 shuffle4(const vdouble8& v) {
+    return shuffle4<i, i>(v);
+  }
+  
+  template<int i>
+  __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) {
+    return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i));
+  }
+
+  __forceinline double toScalar(const vdouble8& v) {
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 vreduce_add2(vdouble8 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }
+
+  __forceinline vdouble8 vreduce_min2(vdouble8 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }
+
+  __forceinline vdouble8 vreduce_max2(vdouble8 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }
+
+  __forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); }
+  __forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); }
+  __forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) {
+    return _mm512_permutexvar_pd(index, v);
+  }
+
+  __forceinline vdouble8 reverse(const vdouble8& a) {
+    return permute(a, vllong8(reverse_step));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v)
+  {
+    cout << "<" << v[0];
+    for (size_t i=1; i<8; i++) cout << ", " << v[i];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vfloat16_avx512.h b/thirdparty/embree/common/simd/vfloat16_avx512.h
new file mode 100644
index 0000000000..9f1e2459c4
--- /dev/null
+++ b/thirdparty/embree/common/simd/vfloat16_avx512.h
@@ -0,0 +1,615 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 16-wide AVX-512 float type */
+  template<>
+  struct vfloat<16>
+  {
+    ALIGNED_STRUCT_(64);
+    
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512 v; 
+      float f[16];
+      int i[16];
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+        
+    __forceinline vfloat() {}
+    __forceinline vfloat(const vfloat16& t) { v = t; }
+    __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; }
+
+    __forceinline vfloat(const __m512& t) { v = t; }
+    __forceinline operator __m512() const { return v; }
+    __forceinline operator __m256() const { return _mm512_castps512_ps256(v); }
+    __forceinline operator __m128() const { return _mm512_castps512_ps128(v); }
+
+    __forceinline vfloat(float f) {
+      v = _mm512_set1_ps(f);
+    }
+
+    __forceinline vfloat(float a, float b, float c, float d) {
+      v = _mm512_set4_ps(a, b, c, d);
+    }
+
+    __forceinline vfloat(const vfloat4& i) {
+      v = _mm512_broadcast_f32x4(i);
+    }
+
+    __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) {
+      v = _mm512_castps128_ps512(a);
+      v = _mm512_insertf32x4(v, b, 1);
+      v = _mm512_insertf32x4(v, c, 2);
+      v = _mm512_insertf32x4(v, d, 3);
+    }
+
+    __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) {
+      v = _mm512_broadcast_f32x4(a);
+      v = _mm512_mask_broadcast_f32x4(v,mask,b);
+    }
+
+    __forceinline vfloat(const vfloat8& i) {
+      v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i)));
+    }
+
+    __forceinline vfloat(const vfloat8& a, const vfloat8& b) {
+      v = _mm512_castps256_ps512(a);
+#if defined(__AVX512DQ__)
+      v = _mm512_insertf32x8(v, b, 1);
+#else
+      v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1));
+#endif
+    }
+
+    /* WARNING: due to f64x4 the mask is considered as an 8bit mask */
+    /*__forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) {
+      __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a));
+      aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b));
+      v = _mm512_castpd_ps(aa);
+      }*/
+    
+    __forceinline explicit vfloat(const vint16& a) {
+      v = _mm512_cvtepi32_ps(a);
+    }
+
+    __forceinline explicit vfloat(const vuint16& a) {
+      v = _mm512_cvtepu32_ps(a);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm512_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm512_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+    __forceinline vfloat(NaNTy)    : v(_mm512_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr);  }
+    static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); }
+
+    static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); }
+    static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); }
+
+    static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); }
+
+    static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); }
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) {
+      _mm512_stream_ps((float*)ptr,a);
+    }
+
+    static __forceinline vfloat16 broadcast(const float* f) {
+      return _mm512_set1_ps(*f);
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat16 gather(const float* ptr, const vint16& index) {
+      return _mm512_i32gather_ps(index, ptr, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) {
+      vfloat16 r = zero;
+      return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) {
+      _mm512_i32scatter_ps(ptr, index, v, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) {
+      _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       float& operator [](size_t index)       { assert(index < 16); return f[index]; }
+    __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 asFloat(const vint16&   a) { return _mm512_castsi512_ps(a); }
+  __forceinline vint16   asInt  (const vfloat16& a) { return _mm512_castps_si512(a); }
+  __forceinline vuint16  asUInt (const vfloat16& a) { return _mm512_castps_si512(a); }
+
+  __forceinline vint16   toInt  (const vfloat16& a) { return vint16(a); }
+  __forceinline vfloat16 toFloat(const vint16&   a) { return vfloat16(a); }
+
+  __forceinline vfloat16 operator +(const vfloat16& a) { return a; }
+  __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); }
+
+  __forceinline vfloat16 abs    (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); }
+  __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); }
+
+  __forceinline vfloat16 rcp(const vfloat16& a) {
+    const vfloat16 r = _mm512_rcp14_ps(a);
+    return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f)));
+  }
+
+  __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); }
+  __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); }
+
+  __forceinline vfloat16 rsqrt(const vfloat16& a)
+  {
+    const vfloat16 r = _mm512_rsqrt14_ps(a);
+    return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r,
+                           _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r))); 
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); }
+  __forceinline vfloat16 operator +(const vfloat16& a, float           b) { return a + vfloat16(b); }
+  __forceinline vfloat16 operator +(float           a, const vfloat16& b) { return vfloat16(a) + b; }
+
+  __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); }
+  __forceinline vfloat16 operator -(const vfloat16& a, float           b) { return a - vfloat16(b); }
+  __forceinline vfloat16 operator -(float           a, const vfloat16& b) { return vfloat16(a) - b; }
+
+  __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); }
+  __forceinline vfloat16 operator *(const vfloat16& a, float           b) { return a * vfloat16(b); }
+  __forceinline vfloat16 operator *(float           a, const vfloat16& b) { return vfloat16(a) * b; }
+
+  __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); }
+  __forceinline vfloat16 operator /(const vfloat16& a, float           b) { return a/vfloat16(b); }
+  __forceinline vfloat16 operator /(float           a, const vfloat16& b) { return vfloat16(a)/b; }
+  
+  __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); }
+  __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); }
+  __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) {
+    return  _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); 
+  }
+  
+  __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { return _mm512_min_ps(a,b);  }
+  __forceinline vfloat16 min(const vfloat16& a, float           b) { return _mm512_min_ps(a,vfloat16(b)); }
+  __forceinline vfloat16 min(const float&    a, const vfloat16& b) { return _mm512_min_ps(vfloat16(a),b); }
+
+  __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { return _mm512_max_ps(a,b); }
+  __forceinline vfloat16 max(const vfloat16& a, float           b) { return _mm512_max_ps(a,vfloat16(b)); }
+  __forceinline vfloat16 max(const float&    a, const vfloat16& b) { return _mm512_max_ps(vfloat16(a),b); }
+
+  __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) {
+    const vint16 ai = _mm512_castps_si512(a);
+    const vint16 bi = _mm512_castps_si512(b);
+    const vint16 ci = _mm512_min_epi32(ai,bi);
+    return _mm512_castsi512_ps(ci);
+  }
+
+  __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) {
+    const vint16 ai = _mm512_castps_si512(a);
+    const vint16 bi = _mm512_castps_si512(b);
+    const vint16 ci = _mm512_max_epi32(ai,bi);
+    return _mm512_castsi512_ps(ci);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); }
+  __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); }
+  __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); }
+  __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; }
+  __forceinline vfloat16& operator +=(vfloat16& a, float           b) { return a = a + b; }
+  
+  __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; }
+  __forceinline vfloat16& operator -=(vfloat16& a, float           b) { return a = a - b; }
+  
+  __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; }
+  __forceinline vfloat16& operator *=(vfloat16& a, float           b) { return a = a * b; }
+
+  __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; }
+  __forceinline vfloat16& operator /=(vfloat16& a, float           b) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 operator ==(const vfloat16& a, float           b) { return a == vfloat16(b); }
+  __forceinline vboolf16 operator ==(float           a, const vfloat16& b) { return vfloat16(a) == b; }
+
+  __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vfloat16& a, float           b) { return a != vfloat16(b); }
+  __forceinline vboolf16 operator !=(float           a, const vfloat16& b) { return vfloat16(a) != b; }
+
+  __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (const vfloat16& a, float           b) { return a <  vfloat16(b); }
+  __forceinline vboolf16 operator < (float           a, const vfloat16& b) { return vfloat16(a) <  b; }
+
+  __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vfloat16& a, float           b) { return a >= vfloat16(b); }
+  __forceinline vboolf16 operator >=(float           a, const vfloat16& b) { return vfloat16(a) >= b; }
+
+  __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vfloat16& a, float           b) { return a >  vfloat16(b); }
+  __forceinline vboolf16 operator > (float           a, const vfloat16& b) { return vfloat16(a) >  b; }
+
+  __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vfloat16& a, float           b) { return a <= vfloat16(b); }
+  __forceinline vboolf16 operator <=(float           a, const vfloat16& b) { return vfloat16(a) <= b; }
+
+  __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }
+
+  __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); }
+  
+  __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) {
+    return _mm512_mask_blend_ps(s, f, t);
+  }
+
+  __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) {
+    return madd(t,b-a,a);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vfloat16 floor(const vfloat16& a) {
+    return _mm512_floor_ps(a);
+  }
+  __forceinline vfloat16 ceil (const vfloat16& a) {
+    return _mm512_ceil_ps(a);
+  }
+  __forceinline vfloat16 round (const vfloat16& a) {
+    return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  }
+  __forceinline vint16 floori (const vfloat16& a) {
+    return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); }
+  __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); }
+
+  template<int i>
+  __forceinline vfloat16 shuffle(const vfloat16& v) {
+    return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat16 shuffle(const vfloat16& v) {
+    return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i>
+  __forceinline vfloat16 shuffle4(const vfloat16& v) {
+    return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat16 shuffle4(const vfloat16& v) {
+    return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e));
+  }
+
+  __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e));
+  }
+
+  __forceinline vfloat16 permute(vfloat16 v, __m512i index) {
+    return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v)));
+  }
+
+  __forceinline vfloat16 reverse(const vfloat16& v) {
+    return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
+  }
+
+  template<int i>
+  __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); 
+  };
+
+  template<int i>
+  __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); 
+  };
+ 
+  __forceinline vfloat16 shift_left_1(const vfloat16& a) {
+    vfloat16 z = zero;
+    return mask_align_shift_right<15>(0xfffe,z,a,a);
+  }
+
+  __forceinline vfloat16 shift_right_1(const vfloat16& x) {
+    return align_shift_right<1>(zero,x);
+  }
+
+  __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); }
+
+
+  template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); }
+
+  template<int N, int i>
+  vfloat<N> extractN(const vfloat16& v);
+
+  template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v);    }
+  template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); }
+  template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); }
+  template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); }
+
+  template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    }
+  template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); }
+
+  template<int i> __forceinline vfloat4 extract4   (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); }
+  template<>      __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v);    }
+
+  template<int i> __forceinline vfloat8 extract8   (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); }
+  template<>      __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
+  {
+    vfloat16 a0a2_b0b2 = unpacklo(r0, r2);
+    vfloat16 c0c2_d0d2 = unpackhi(r0, r2);
+    vfloat16 a1a3_b1b3 = unpacklo(r1, r3);
+    vfloat16 c1c3_d1d3 = unpackhi(r1, r3);
+
+    c0 = unpacklo(a0a2_b0b2, a1a3_b1b3);
+    c1 = unpackhi(a0a2_b0b2, a1a3_b1b3);
+    c2 = unpacklo(c0c2_d0d2, c1c3_d1d3);
+    c3 = unpackhi(c0c2_d0d2, c1c3_d1d3);
+  }
+
+  __forceinline void transpose(const vfloat4& r0,  const vfloat4& r1,  const vfloat4& r2,  const vfloat4& r3,
+                               const vfloat4& r4,  const vfloat4& r5,  const vfloat4& r6,  const vfloat4& r7,
+                               const vfloat4& r8,  const vfloat4& r9,  const vfloat4& r10, const vfloat4& r11,
+                               const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
+  {
+    return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15),
+                     c0, c1, c2, c3);
+  }
+
+  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
+                               const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
+                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+  {
+    vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3;
+    transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3);
+
+    vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7;
+    transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7);
+
+    c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7);
+    c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);
+    c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);
+    c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);
+    c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7);
+    c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);
+    c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);
+    c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);
+  }
+
+  __forceinline void transpose(const vfloat8& r0,  const vfloat8& r1,  const vfloat8& r2,  const vfloat8& r3,
+                               const vfloat8& r4,  const vfloat8& r5,  const vfloat8& r6,  const vfloat8& r7,
+                               const vfloat8& r8,  const vfloat8& r9,  const vfloat8& r10, const vfloat8& r11,
+                               const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
+                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+  {
+    return transpose(vfloat16(r0, r8),  vfloat16(r1, r9),  vfloat16(r2, r10), vfloat16(r3, r11),
+                     vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15),
+                     c0, c1, c2, c3, c4, c5, c6, c7);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 vreduce_add2(vfloat16 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }
+  __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }
+
+  __forceinline vfloat16 vreduce_min2(vfloat16 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vfloat16 vreduce_max2(vfloat16 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); }
+  __forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); }
+  __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); }
+ 
+  __forceinline size_t select_min(const vfloat16& v) { 
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ)));
+  }
+
+  __forceinline size_t select_max(const vfloat16& v) { 
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ)));
+  }
+
+  __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v) 
+  { 
+    const vfloat16 a = select(valid,v,vfloat16(pos_inf)); 
+    const vbool16 valid_min = valid & (a == vreduce_min(a));
+    return bsf(movemask(any(valid_min) ? valid_min : valid)); 
+  }
+
+  __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v) 
+  { 
+    const vfloat16 a = select(valid,v,vfloat16(neg_inf)); 
+    const vbool16 valid_max = valid & (a == vreduce_max(a));
+    return bsf(movemask(any(valid_max) ? valid_max : valid)); 
+  }
+  
+  __forceinline vfloat16 prefix_sum(const vfloat16& a) 
+  {
+    const vfloat16 z(zero);
+    vfloat16 v = a;
+    v = v + align_shift_right<16-1>(v,z);
+    v = v + align_shift_right<16-2>(v,z);
+    v = v + align_shift_right<16-4>(v,z);
+    v = v + align_shift_right<16-8>(v,z);
+    return v;  
+  }
+
+  __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a) 
+  {
+    const vfloat16 z(zero);
+    vfloat16 v = a;
+    v = v + align_shift_right<1>(z,v);
+    v = v + align_shift_right<2>(z,v);
+    v = v + align_shift_right<4>(z,v);
+    v = v + align_shift_right<8>(z,v);
+    return v;  
+  }
+
+  __forceinline vfloat16 prefix_min(const vfloat16& a)
+  {
+    const vfloat16 z(pos_inf);
+    vfloat16 v = a;
+    v = min(v,align_shift_right<16-1>(v,z));
+    v = min(v,align_shift_right<16-2>(v,z));
+    v = min(v,align_shift_right<16-4>(v,z));
+    v = min(v,align_shift_right<16-8>(v,z));
+    return v;  
+  }
+
+  __forceinline vfloat16 prefix_max(const vfloat16& a)
+  {
+    const vfloat16 z(neg_inf);
+    vfloat16 v = a;
+    v = max(v,align_shift_right<16-1>(v,z));
+    v = max(v,align_shift_right<16-2>(v,z));
+    v = max(v,align_shift_right<16-4>(v,z));
+    v = max(v,align_shift_right<16-8>(v,z));
+    return v;  
+  }
+
+
+  __forceinline vfloat16 reverse_prefix_min(const vfloat16& a)
+  {
+    const vfloat16 z(pos_inf);
+    vfloat16 v = a;
+    v = min(v,align_shift_right<1>(z,v));
+    v = min(v,align_shift_right<2>(z,v));
+    v = min(v,align_shift_right<4>(z,v));
+    v = min(v,align_shift_right<8>(z,v));
+    return v;  
+  }
+
+  __forceinline vfloat16 reverse_prefix_max(const vfloat16& a)
+  {
+    const vfloat16 z(neg_inf);
+    vfloat16 v = a;
+    v = max(v,align_shift_right<1>(z,v));
+    v = max(v,align_shift_right<2>(z,v));
+    v = max(v,align_shift_right<4>(z,v));
+    v = max(v,align_shift_right<8>(z,v));
+    return v;  
+  }
+
+  __forceinline vfloat16 rcp_safe(const vfloat16& a) {
+    return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v)
+  {
+    cout << "<" << v[0];
+    for (int i=1; i<16; i++) cout << ", " << v[i];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h
new file mode 100644
index 0000000000..5215bf9730
--- /dev/null
+++ b/thirdparty/embree/common/simd/vfloat4_sse2.h
@@ -0,0 +1,722 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide SSE float type */
+  template<>
+  struct vfloat<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+    
+    enum  { size = 4 };                        // number of SIMD elements
+    union { __m128 v; float f[4]; int i[4]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vfloat() {}
+    __forceinline vfloat(const vfloat4& other) { v = other.v; }
+    __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; }
+
+    __forceinline vfloat(__m128 a) : v(a) {}
+    __forceinline operator const __m128&() const { return v; }
+    __forceinline operator       __m128&()       { return v; }
+
+    __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {}
+    __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {}
+
+    __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {}
+    __forceinline explicit vfloat(const vuint4& x) {
+      const __m128i a   = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF));
+      const __m128i b   = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 
+      const __m128  af  = _mm_cvtepi32_ps(a);
+      const __m128  bf  = _mm_castsi128_ps(b);  
+      v  = _mm_add_ps(af,bf);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {}
+    __forceinline vfloat(NaNTy)    : v(_mm_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); }
+    static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); }
+
+    static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); }
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); }
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); }
+#elif defined(__AVX__)
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); }
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); }
+#else
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); }
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); }
+#endif
+
+#if defined(__AVX__)
+    static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); }
+#else
+    static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); }
+#endif
+
+    static __forceinline vfloat4 load_nt (const float* ptr) {
+#if defined (__SSE4_1__)
+    return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr));
+#else
+    return _mm_load_ps(ptr); 
+#endif
+  }
+
+#if defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const char* ptr) {
+      return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const char* ptr) {
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const unsigned char* ptr) {
+      return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const unsigned char* ptr) {
+      //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const short* ptr) {
+      return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const short* ptr) {
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+
+    static __forceinline vfloat4 load(const unsigned short* ptr) {
+      return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f));
+    }
+    
+    static __forceinline void store_nt(void* ptr, const vfloat4& v)
+    {
+#if defined (__SSE4_1__)
+      _mm_stream_ps((float*)ptr,v);
+#else
+      _mm_store_ps((float*)ptr,v);
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat4 gather(const float* ptr, const vint4& index) {
+#if defined(__AVX2__)
+      return _mm_i32gather_ps(ptr, index, scale);
+#else
+      return vfloat4(
+        *(float*)(((char*)ptr)+scale*index[0]),
+        *(float*)(((char*)ptr)+scale*index[1]),
+        *(float*)(((char*)ptr)+scale*index[2]),
+        *(float*)(((char*)ptr)+scale*index[3]));
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) {
+      vfloat4 r = zero;
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)
+      return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_i32scatter_ps((float*)ptr, index, v, scale);
+#else
+      *(float*)(((char*)ptr)+scale*index[0]) = v[0];
+      *(float*)(((char*)ptr)+scale*index[1]) = v[1];
+      *(float*)(((char*)ptr)+scale*index[2]) = v[2];
+      *(float*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale);
+#else
+      if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0];
+      if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1];
+      if (likely(mask[2])) *(float*)(((char*)ptr)+scale*index[2]) = v[2];
+      if (likely(mask[3])) *(float*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+    static __forceinline void store(const vboolf4& mask, char* ptr, const vint4& ofs, const vfloat4& v) {
+      scatter<1>(mask,ptr,ofs,v);
+    }
+    static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) {
+      scatter<4>(mask,ptr,ofs,v);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; }
+    __forceinline       float& operator [](size_t index)       { assert(index < 4); return f[index]; }
+
+    friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) {
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_ps(m, f, t);
+#elif defined(__SSE4_1__)
+      return _mm_blendv_ps(f, t, m); 
+#else
+      return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); 
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Load/Store
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<> struct mem<vfloat4>
+  {
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return vfloat4::load (mask,ptr); }
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return vfloat4::loadu(mask,ptr); }
+    
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::store (mask,ptr,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::storeu(mask,ptr,v); }
+  };
+    
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 asFloat(const vint4&   a) { return _mm_castsi128_ps(a); }
+  __forceinline vint4   asInt  (const vfloat4& a) { return _mm_castps_si128(a); }
+  __forceinline vuint4  asUInt (const vfloat4& a) { return _mm_castps_si128(a); }
+
+  __forceinline vint4   toInt  (const vfloat4& a) { return vint4(a); }
+  __forceinline vfloat4 toFloat(const vint4&   a) { return vfloat4(a); }
+
+  __forceinline vfloat4 operator +(const vfloat4& a) { return a; }
+  __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+
+  __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); }
+#if defined(__AVX512VL__)
+  __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); }
+#else
+  __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); }
+#endif
+  __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+  
+  __forceinline vfloat4 rcp(const vfloat4& a)
+  {
+#if defined(__AVX512VL__)
+    const vfloat4 r = _mm_rcp14_ps(a);
+#else
+    const vfloat4 r = _mm_rcp_ps(a);
+#endif
+
+#if defined(__AVX2__)
+    return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
+#else
+    return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
+#endif
+  }
+  __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); }
+  __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }
+
+  __forceinline vfloat4 rsqrt(const vfloat4& a)
+  {
+#if defined(__AVX512VL__)
+    vfloat4 r = _mm_rsqrt14_ps(a);
+#else
+    vfloat4 r = _mm_rsqrt_ps(a);
+#endif
+
+#if defined(__ARM_NEON)
+    r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+    r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#elif defined(__AVX2__)
+    r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#else
+    r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif
+    return r;
+  }
+
+  __forceinline vboolf4 isnan(const vfloat4& a) {
+    const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
+#if defined(__AVX512VL__)
+    return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT);
+#else
+    return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000)));
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); }
+  __forceinline vfloat4 operator +(const vfloat4& a, float          b) { return a + vfloat4(b); }
+  __forceinline vfloat4 operator +(float          a, const vfloat4& b) { return vfloat4(a) + b; }
+
+  __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); }
+  __forceinline vfloat4 operator -(const vfloat4& a, float          b) { return a - vfloat4(b); }
+  __forceinline vfloat4 operator -(float          a, const vfloat4& b) { return vfloat4(a) - b; }
+
+  __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); }
+  __forceinline vfloat4 operator *(const vfloat4& a, float          b) { return a * vfloat4(b); }
+  __forceinline vfloat4 operator *(float          a, const vfloat4& b) { return vfloat4(a) * b; }
+
+  __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); }
+  __forceinline vfloat4 operator /(const vfloat4& a, float          b) { return a/vfloat4(b); }
+  __forceinline vfloat4 operator /(float          a, const vfloat4& b) { return vfloat4(a)/b; }
+
+  __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); }
+  __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); }
+  __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); }
+  __forceinline vfloat4 operator ^(const vfloat4& a, const vint4&   b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); }
+
+  __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); }
+  __forceinline vfloat4 min(const vfloat4& a, float          b) { return _mm_min_ps(a,vfloat4(b)); }
+  __forceinline vfloat4 min(float          a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); }
+
+  __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); }
+  __forceinline vfloat4 max(const vfloat4& a, float          b) { return _mm_max_ps(a,vfloat4(b)); }
+  __forceinline vfloat4 max(float          a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); }
+
+#if defined(__SSE4_1__)
+    __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+
+    __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+
+    __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epu32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+
+    __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epu32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#else
+    __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
+      return min(a,b);
+    }
+
+    __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) {
+      return max(a,b);
+    }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__) || defined(__ARM_NEON)
+  __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); }
+  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); }
+  __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); }
+  __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); }
+#else
+  __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; }
+  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
+  __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;}
+  __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; }
+  __forceinline vfloat4& operator +=(vfloat4& a, float          b) { return a = a + b; }
+
+  __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; }
+  __forceinline vfloat4& operator -=(vfloat4& a, float          b) { return a = a - b; }
+
+  __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; }
+  __forceinline vfloat4& operator *=(vfloat4& a, float          b) { return a = a * b; }
+
+  __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; }
+  __forceinline vfloat4& operator /=(vfloat4& a, float          b) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); }
+  __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); }
+  __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); }
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); }
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); }
+  __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); }
+#endif
+
+  __forceinline vboolf4 operator ==(const vfloat4& a, float          b) { return a == vfloat4(b); }
+  __forceinline vboolf4 operator ==(float          a, const vfloat4& b) { return vfloat4(a) == b; }
+
+  __forceinline vboolf4 operator !=(const vfloat4& a, float          b) { return a != vfloat4(b); }
+  __forceinline vboolf4 operator !=(float          a, const vfloat4& b) { return vfloat4(a) != b; }
+
+  __forceinline vboolf4 operator < (const vfloat4& a, float          b) { return a <  vfloat4(b); }
+  __forceinline vboolf4 operator < (float          a, const vfloat4& b) { return vfloat4(a) <  b; }
+  
+  __forceinline vboolf4 operator >=(const vfloat4& a, float          b) { return a >= vfloat4(b); }
+  __forceinline vboolf4 operator >=(float          a, const vfloat4& b) { return vfloat4(a) >= b; }
+
+  __forceinline vboolf4 operator > (const vfloat4& a, float          b) { return a >  vfloat4(b); }
+  __forceinline vboolf4 operator > (float          a, const vfloat4& b) { return vfloat4(a) >  b; }
+
+  __forceinline vboolf4 operator <=(const vfloat4& a, float          b) { return a <= vfloat4(b); }
+  __forceinline vboolf4 operator <=(float          a, const vfloat4& b) { return vfloat4(a) <= b; }
+
+  __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; }
+  __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; }
+  __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a <  b; }
+  __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; }
+  __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a >  b; }
+  __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <  b); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >  b); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); }
+#endif
+
+  template<int mask>
+    __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f)
+  {
+#if defined(__SSE4_1__) 
+    return _mm_blend_ps(f, t, mask);
+#else
+    return select(vboolf4(mask), t, f);
+#endif
+  }
+  
+  __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) {
+    return madd(t,b-a,a);
+  }
+  
+  __forceinline bool isvalid(const vfloat4& v) {
+    return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite(const vfloat4& a) {
+    return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX)));
+  }
+
+  __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) {
+    return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX)));
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+  __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); }
+  __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); }
+  __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); }
+  __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); }
+#elif defined (__SSE4_1__)
+  __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+  __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO       ); }
+  __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+#else
+  __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); }
+  __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); }
+  __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); }
+  __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); }
+#endif
+  __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); }
+
+  __forceinline vint4 floori(const vfloat4& a) {
+#if defined(__SSE4_1__)
+    return vint4(floor(a));
+#else
+    return vint4(a-vfloat4(0.5f));
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); }
+  __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat4 shuffle(const vfloat4& v) {
+    return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
+    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+#if defined(__SSE3__)
+  template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); }
+  template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); }
+  template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); }
+#endif
+
+  template<int i>
+  __forceinline vfloat4 shuffle(const vfloat4& v) {
+    return shuffle<i,i,i,i>(v);
+  }
+
+  template<int i> __forceinline float extract   (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); }
+  template<>      __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
+
+#if defined (__SSE4_1__)
+  template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+  template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); }
+  template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); }
+#else
+  template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; }
+  template<int dst>  __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; }
+#endif
+
+  __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); }
+
+  __forceinline vfloat4 shift_right_1(const vfloat4& x) {
+    return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); 
+  }
+
+#if defined (__AVX2__)
+  __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) {
+    return _mm_permutevar_ps(a,index);
+  }
+
+  __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); }
+
+#endif
+
+#if defined(__AVX512VL__)
+  template<int i>
+  __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) {
+    return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i));
+  }  
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting Network
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 sort_ascending(const vfloat4& v)
+  {
+    const vfloat4 a0 = v;
+    const vfloat4 b0 = shuffle<1,0,3,2>(a0);
+    const vfloat4 c0 = min(a0,b0);
+    const vfloat4 d0 = max(a0,b0);
+    const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+    const vfloat4 b1 = shuffle<2,3,0,1>(a1);
+    const vfloat4 c1 = min(a1,b1);
+    const vfloat4 d1 = max(a1,b1);
+    const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vfloat4 b2 = shuffle<0,2,1,3>(a2);
+    const vfloat4 c2 = min(a2,b2);
+    const vfloat4 d2 = max(a2,b2);
+    const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3;
+  }
+
+  __forceinline vfloat4 sort_descending(const vfloat4& v)
+  {
+    const vfloat4 a0 = v;
+    const vfloat4 b0 = shuffle<1,0,3,2>(a0);
+    const vfloat4 c0 = max(a0,b0);
+    const vfloat4 d0 = min(a0,b0);
+    const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+    const vfloat4 b1 = shuffle<2,3,0,1>(a1);
+    const vfloat4 c1 = max(a1,b1);
+    const vfloat4 d1 = min(a1,b1);
+    const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vfloat4 b2 = shuffle<0,2,1,3>(a2);
+    const vfloat4 c2 = max(a2,b2);
+    const vfloat4 d2 = min(a2,b2);
+    const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3)
+  {
+    vfloat4 l02 = unpacklo(r0,r2);
+    vfloat4 h02 = unpackhi(r0,r2);
+    vfloat4 l13 = unpacklo(r1,r3);
+    vfloat4 h13 = unpackhi(r1,r3);
+    c0 = unpacklo(l02,l13);
+    c1 = unpackhi(l02,l13);
+    c2 = unpacklo(h02,h13);
+    c3 = unpackhi(h02,h13);
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2)
+  {
+    vfloat4 l02 = unpacklo(r0,r2);
+    vfloat4 h02 = unpackhi(r0,r2);
+    vfloat4 l13 = unpacklo(r1,r3);
+    vfloat4 h13 = unpackhi(r1,r3);
+    c0 = unpacklo(l02,l13);
+    c1 = unpackhi(l02,l13);
+    c2 = unpacklo(h02,h13);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+
+  __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
+  __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
+  __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
+
+  __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) 
+  { 
+    const vfloat4 a = select(valid,v,vfloat4(pos_inf)); 
+    const vbool4 valid_min = valid & (a == vreduce_min(a));
+    return bsf(movemask(any(valid_min) ? valid_min : valid)); 
+  }
+  __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v) 
+  { 
+    const vfloat4 a = select(valid,v,vfloat4(neg_inf)); 
+    const vbool4 valid_max = valid & (a == vreduce_max(a));
+    return bsf(movemask(any(valid_max) ? valid_max : valid)); 
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidian Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float dot(const vfloat4& a, const vfloat4& b) {
+    return reduce_add(a*b);
+  }
+
+  __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b)
+  {
+    const vfloat4 a0 = a;
+    const vfloat4 b0 = shuffle<1,2,0,3>(b);
+    const vfloat4 a1 = shuffle<1,2,0,3>(a);
+    const vfloat4 b1 = b;
+    return shuffle<1,2,0,3>(msub(a0,b0,a1*b1));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vfloat8_avx.h b/thirdparty/embree/common/simd/vfloat8_avx.h
new file mode 100644
index 0000000000..13446454e8
--- /dev/null
+++ b/thirdparty/embree/common/simd/vfloat8_avx.h
@@ -0,0 +1,758 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX float type */
+  template<>
+  struct vfloat<8>
+  {
+    ALIGNED_STRUCT_(32);
+   
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };                        // number of SIMD elements
+    union { __m256 v; float f[8]; int i[8]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat() {}
+    __forceinline vfloat(const vfloat8& other) { v = other.v; }
+    __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; }
+
+    __forceinline vfloat(__m256 a) : v(a) {}
+    __forceinline operator const __m256&() const { return v; }
+    __forceinline operator       __m256&()       { return v; }
+
+    __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
+    __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+
+    __forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {}
+    __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {}
+    __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {}
+    __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {}
+    __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {}
+
+    __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm256_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm256_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {}
+    __forceinline vfloat(NaNTy)    : v(_mm256_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vfloat8 broadcast(const void* a) {
+      return _mm256_broadcast_ss((float*)a); 
+    }
+
+    static __forceinline vfloat8 load(const char* ptr) {
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+    }
+
+    static __forceinline vfloat8 load(const unsigned char* ptr) {
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+    }
+
+    static __forceinline vfloat8 load(const short* ptr) {
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4));
+#endif
+    }
+      
+    static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); }
+    static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); }
+
+    static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); }
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); }
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
+#else
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
+#endif
+    
+#if defined(__AVX2__)
+    static __forceinline vfloat8 load_nt(void* ptr) {
+      return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr));
+    }
+#endif
+    
+    static __forceinline void store_nt(void* ptr, const vfloat8& v) {
+      _mm256_stream_ps((float*)ptr,v);
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat8 gather(const float* ptr, const vint8& index) {
+#if defined(__AVX2__)
+      return _mm256_i32gather_ps(ptr, index ,scale);
+#else
+      return vfloat8(
+          *(float*)(((char*)ptr)+scale*index[0]),
+          *(float*)(((char*)ptr)+scale*index[1]),
+          *(float*)(((char*)ptr)+scale*index[2]),
+          *(float*)(((char*)ptr)+scale*index[3]),
+          *(float*)(((char*)ptr)+scale*index[4]),
+          *(float*)(((char*)ptr)+scale*index[5]),
+          *(float*)(((char*)ptr)+scale*index[6]),
+          *(float*)(((char*)ptr)+scale*index[7]));
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) {
+      vfloat8 r = zero;
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)
+      return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(float*)(((char*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(float*)(((char*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(float*)(((char*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(float*)(((char*)ptr)+scale*index[7]);
+      return r;
+    #endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_ps((float*)ptr, ofs, v, scale);
+#else
+      *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; }
+    __forceinline       float& operator [](size_t index)       { assert(index < 8); return f[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 asFloat(const vint8&   a) { return _mm256_castsi256_ps(a); }
+  __forceinline vint8   asInt  (const vfloat8& a) { return _mm256_castps_si256(a); }
+
+  __forceinline vint8   toInt  (const vfloat8& a) { return vint8(a); }
+  __forceinline vfloat8 toFloat(const vint8&   a) { return vfloat8(a); }
+
+  __forceinline vfloat8 operator +(const vfloat8& a) { return a; }
+  __forceinline vfloat8 operator -(const vfloat8& a) {
+    const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); 
+    return _mm256_xor_ps(a, mask);
+  }
+  __forceinline vfloat8 abs(const vfloat8& a) {
+    const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
+    return _mm256_and_ps(a, mask);
+  }
+  __forceinline vfloat8 sign   (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
+  __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }
+
+
+  static __forceinline vfloat8 rcp(const vfloat8& a)
+  {
+#if defined(__AVX512VL__)
+    const vfloat8 r = _mm256_rcp14_ps(a);
+#else
+    const vfloat8 r = _mm256_rcp_ps(a);
+#endif
+
+#if defined(__AVX2__)
+    return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f)));
+#else
+    return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a)));
+#endif
+  }
+  __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); }
+  __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); }
+
+  static __forceinline vfloat8 rsqrt(const vfloat8& a)
+  {
+#if defined(__AVX512VL__)
+    const vfloat8 r = _mm256_rsqrt14_ps(a);
+#else
+    const vfloat8 r = _mm256_rsqrt_ps(a);
+#endif
+
+#if defined(__AVX2__)
+    return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r,
+                           _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); 
+#else
+    return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r),
+                         _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r)));
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); }
+  __forceinline vfloat8 operator +(const vfloat8& a, float          b) { return a + vfloat8(b); }
+  __forceinline vfloat8 operator +(float          a, const vfloat8& b) { return vfloat8(a) + b; }
+
+  __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); }
+  __forceinline vfloat8 operator -(const vfloat8& a, float          b) { return a - vfloat8(b); }
+  __forceinline vfloat8 operator -(float          a, const vfloat8& b) { return vfloat8(a) - b; }
+
+  __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); }
+  __forceinline vfloat8 operator *(const vfloat8& a, float          b) { return a * vfloat8(b); }
+  __forceinline vfloat8 operator *(float          a, const vfloat8& b) { return vfloat8(a) * b; }
+
+  __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); }
+  __forceinline vfloat8 operator /(const vfloat8& a, float          b) { return a / vfloat8(b); }
+  __forceinline vfloat8 operator /(float          a, const vfloat8& b) { return vfloat8(a) / b; }
+
+  __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); }
+  __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); }
+  __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); }
+  __forceinline vfloat8 operator ^(const vfloat8& a, const vint8&   b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); }
+
+  __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); }
+  __forceinline vfloat8 min(const vfloat8& a, float          b) { return _mm256_min_ps(a, vfloat8(b)); }
+  __forceinline vfloat8 min(float          a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); }
+
+  __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); }
+  __forceinline vfloat8 max(const vfloat8& a, float          b) { return _mm256_max_ps(a, vfloat8(b)); }
+  __forceinline vfloat8 max(float          a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); }
+
+  /* need "static __forceinline for MSVC, otherwise we'll link the wrong version in debug mode */
+#if defined(__AVX2__)
+
+  static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
+    const vint8 ai = _mm256_castps_si256(a);
+    const vint8 bi = _mm256_castps_si256(b);
+    const vint8 ci = _mm256_min_epi32(ai,bi);
+    return _mm256_castsi256_ps(ci);
+  }
+
+  static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
+    const vint8 ai = _mm256_castps_si256(a);
+    const vint8 bi = _mm256_castps_si256(b);
+    const vint8 ci = _mm256_max_epi32(ai,bi);
+    return _mm256_castsi256_ps(ci);
+  }
+
+  static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) {
+    const vint8 ai = _mm256_castps_si256(a);
+    const vint8 bi = _mm256_castps_si256(b);
+    const vint8 ci = _mm256_min_epu32(ai,bi);
+    return _mm256_castsi256_ps(ci);
+  }
+
+  static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) {
+    const vint8 ai = _mm256_castps_si256(a);
+    const vint8 bi = _mm256_castps_si256(b);
+    const vint8 ci = _mm256_max_epu32(ai,bi);
+    return _mm256_castsi256_ps(ci);
+  }
+
+#else
+
+  static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
+    return asFloat(min(asInt(a),asInt(b)));
+  }
+
+  static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
+    return asFloat(max(asInt(a),asInt(b)));
+  }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  static __forceinline vfloat8 madd  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); }
+  static __forceinline vfloat8 msub  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); }
+  static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); }
+  static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); }
+#else
+  static __forceinline vfloat8 madd  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; }
+  static __forceinline vfloat8 msub  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; }
+  static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;}
+  static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; }
+  __forceinline vfloat8& operator +=(vfloat8& a, float          b) { return a = a + b; }
+
+  __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; }
+  __forceinline vfloat8& operator -=(vfloat8& a, float          b) { return a = a - b; }
+
+  __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; }
+  __forceinline vfloat8& operator *=(vfloat8& a, float          b) { return a = a * b; }
+
+  __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; }
+  __forceinline vfloat8& operator /=(vfloat8& a, float          b) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); }
+
+  static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+    return _mm256_mask_blend_ps(m, f, t);
+  }
+#else
+  static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);  }
+  static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
+  static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS);  }
+  static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
+  static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
+  static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS);  }
+
+  static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+    return _mm256_blendv_ps(f, t, m); 
+  }
+#endif
+
+  template<int mask>
+    __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) {
+    return _mm256_blend_ps(f, t, mask);
+  }
+
+  __forceinline vboolf8 operator ==(const vfloat8& a, const float&   b) { return a == vfloat8(b); }
+  __forceinline vboolf8 operator ==(const float&   a, const vfloat8& b) { return vfloat8(a) == b; }
+
+  __forceinline vboolf8 operator !=(const vfloat8& a, const float&   b) { return a != vfloat8(b); }
+  __forceinline vboolf8 operator !=(const float&   a, const vfloat8& b) { return vfloat8(a) != b; }
+
+  __forceinline vboolf8 operator < (const vfloat8& a, const float&   b) { return a <  vfloat8(b); }
+  __forceinline vboolf8 operator < (const float&   a, const vfloat8& b) { return vfloat8(a) <  b; }
+
+  __forceinline vboolf8 operator >=(const vfloat8& a, const float&   b) { return a >= vfloat8(b); }
+  __forceinline vboolf8 operator >=(const float&   a, const vfloat8& b) { return vfloat8(a) >= b; }
+
+  __forceinline vboolf8 operator > (const vfloat8& a, const float&   b) { return a >  vfloat8(b); }
+  __forceinline vboolf8 operator > (const float&   a, const vfloat8& b) { return vfloat8(a) >  b; }
+
+  __forceinline vboolf8 operator <=(const vfloat8& a, const float&   b) { return a <= vfloat8(b); }
+  __forceinline vboolf8 operator <=(const float&   a, const vfloat8& b) { return vfloat8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; }
+  __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; }
+  __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a <  b; }
+  __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; }
+  __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a >  b; }
+  __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <  b); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >  b); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); }
+#endif
+
+  __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) {
+    return madd(t,b-a,a);
+  }
+
+  __forceinline bool isvalid (const vfloat8& v) {
+    return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite (const vfloat8& a) {
+    return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX)));
+  }
+
+  __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) {
+    return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+  __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO       ); }
+  __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+  __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); }
+  __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); }
+
+  template<int i>
+  __forceinline vfloat8 shuffle(const vfloat8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1>
+  __forceinline vfloat8 shuffle4(const vfloat8& v) {
+    return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) {
+    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat8 shuffle(const vfloat8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) {
+    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); }
+  template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); }
+  template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
+
+  __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); }
+  template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); }
+  template<size_t i> __forceinline vfloat4 extract4   (const vfloat8& a) { return _mm256_extractf128_ps(a, i); }
+  template<>         __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a);   }
+
+  __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); }
+
+#if defined (__AVX2__)
+  static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
+    return _mm256_permutevar8x32_ps(a, index);
+  }
+#endif
+
+#if defined(__AVX512VL__)
+  template<int i>
+  static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) {
+    return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i));
+  }  
+#endif
+
+#if defined (__AVX_I__)
+  template<const int mode>
+  static __forceinline vint4 convert_to_hf16(const vfloat8& a) {
+    return _mm256_cvtps_ph(a, mode);
+  }
+
+  static __forceinline vfloat8 convert_from_hf16(const vint4& a) {
+    return _mm256_cvtph_ps(a);
+  }
+#endif
+
+#if defined(__AVX512VL__)
+  static __forceinline vfloat8 shift_right_1(const vfloat8& x) {
+    return align_shift_right<1>(zero,x);
+  }
+#else
+  static __forceinline vfloat8 shift_right_1(const vfloat8& x) {
+    const vfloat8 t0 = shuffle<1,2,3,0>(x);
+    const vfloat8 t1 = shuffle4<1,0>(t0);
+    return _mm256_blend_ps(t0,t1,0x88);
+  }
+#endif
+
+  __forceinline vint8 floori(const vfloat8& a) {
+    return vint8(floor(a));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+  {
+    vfloat8 l02 = unpacklo(r0,r2);
+    vfloat8 h02 = unpackhi(r0,r2);
+    vfloat8 l13 = unpacklo(r1,r3);
+    vfloat8 h13 = unpackhi(r1,r3);
+    c0 = unpacklo(l02,l13);
+    c1 = unpackhi(l02,l13);
+    c2 = unpacklo(h02,h13);
+    c3 = unpackhi(h02,h13);
+  }
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2)
+  {
+    vfloat8 l02 = unpacklo(r0,r2);
+    vfloat8 h02 = unpackhi(r0,r2);
+    vfloat8 l13 = unpacklo(r1,r3);
+    vfloat8 h13 = unpackhi(r1,r3);
+    c0 = unpacklo(l02,l13);
+    c1 = unpackhi(l02,l13);
+    c2 = unpacklo(h02,h13);
+  }
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7)
+  {
+    vfloat8 h0,h1,h2,h3; transpose(r0,r1,r2,r3,h0,h1,h2,h3);
+    vfloat8 h4,h5,h6,h7; transpose(r4,r5,r6,r7,h4,h5,h6,h7);
+    c0 = shuffle4<0,2>(h0,h4);
+    c1 = shuffle4<0,2>(h1,h5);
+    c2 = shuffle4<0,2>(h2,h6);
+    c3 = shuffle4<0,2>(h3,h7);
+    c4 = shuffle4<1,3>(h0,h4);
+    c5 = shuffle4<1,3>(h1,h5);
+    c6 = shuffle4<1,3>(h2,h6);
+    c7 = shuffle4<1,3>(h3,h7);
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+  {
+    transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3);
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2)
+  {
+    transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); }
+  __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+  __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+  __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); }
+  __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); }
+
+  __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) 
+  { 
+    const vfloat8 a = select(valid,v,vfloat8(pos_inf)); 
+    const vbool8 valid_min = valid & (a == vreduce_min(a));
+    return bsf(movemask(any(valid_min) ? valid_min : valid)); 
+  }
+
+  __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v) 
+  { 
+    const vfloat8 a = select(valid,v,vfloat8(neg_inf)); 
+    const vbool8 valid_max = valid & (a == vreduce_max(a));
+    return bsf(movemask(any(valid_max) ? valid_max : valid)); 
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidian Space Operators (pairs of Vec3fa's)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) {
+  //  return vreduce_add4(a*b);
+  //}
+
+  __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) {
+    return _mm256_dp_ps(a,b,0x7F);
+  }
+
+  __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b)
+  {
+    const vfloat8 a0 = a;
+    const vfloat8 b0 = shuffle<1,2,0,3>(b);
+    const vfloat8 a1 = shuffle<1,2,0,3>(a);
+    const vfloat8 b1 = b;
+    return shuffle<1,2,0,3>(msub(a0,b0,a1*b1));
+  }
+
+  //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); }
+  //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); }
+  //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); }
+  //__forceinline float length     (const vfloat<8>& a) { return sqrt(dot(a,a)); }
+  __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); }
+  //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); }
+  //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  //__forceinline float area    (const vfloat<8>& d) { return 2.0f*halfArea(d); }
+  //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; }
+
+  //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) {
+  //  const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+  //}
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// In Register Sorting
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 sort_ascending(const vfloat8& v)
+  {
+    const vfloat8 a0 = v;
+    const vfloat8 b0 = shuffle<1,0,3,2>(a0);
+    const vfloat8 c0 = min(a0,b0);
+    const vfloat8 d0 = max(a0,b0);
+    const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0);
+    const vfloat8 b1 = shuffle<2,3,0,1>(a1);
+    const vfloat8 c1 = min(a1,b1);
+    const vfloat8 d1 = max(a1,b1);
+    const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vfloat8 b2 = shuffle<1,0,3,2>(a2);
+    const vfloat8 c2 = min(a2,b2);
+    const vfloat8 d2 = max(a2,b2);
+    const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vfloat8 b3 = shuffle4<1,0>(a3);
+    const vfloat8 c3 = min(a3,b3);
+    const vfloat8 d3 = max(a3,b3);
+    const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+    const vfloat8 b4 = shuffle<2,3,0,1>(a4);
+    const vfloat8 c4 = min(a4,b4);
+    const vfloat8 d4 = max(a4,b4);
+    const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vfloat8 b5 = shuffle<1,0,3,2>(a5);
+    const vfloat8 c5 = min(a5,b5);
+    const vfloat8 d5 = max(a5,b5);
+    const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+    return a6;
+  }
+
+   __forceinline vfloat8 sort_descending(const vfloat8& v)
+  {
+    const vfloat8 a0 = v;
+    const vfloat8 b0 = shuffle<1,0,3,2>(a0);
+    const vfloat8 c0 = max(a0,b0);
+    const vfloat8 d0 = min(a0,b0);
+    const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0);
+    const vfloat8 b1 = shuffle<2,3,0,1>(a1);
+    const vfloat8 c1 = max(a1,b1);
+    const vfloat8 d1 = min(a1,b1);
+    const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vfloat8 b2 = shuffle<1,0,3,2>(a2);
+    const vfloat8 c2 = max(a2,b2);
+    const vfloat8 d2 = min(a2,b2);
+    const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vfloat8 b3 = shuffle4<1,0>(a3);
+    const vfloat8 c3 = max(a3,b3);
+    const vfloat8 d3 = min(a3,b3);
+    const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+    const vfloat8 b4 = shuffle<2,3,0,1>(a4);
+    const vfloat8 c4 = max(a4,b4);
+    const vfloat8 d4 = min(a4,b4);
+    const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vfloat8 b5 = shuffle<1,0,3,2>(a5);
+    const vfloat8 c5 = max(a5,b5);
+    const vfloat8 d5 = min(a5,b5);
+    const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+    return a6;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vint16_avx512.h b/thirdparty/embree/common/simd/vint16_avx512.h
new file mode 100644
index 0000000000..3720c3c9d6
--- /dev/null
+++ b/thirdparty/embree/common/simd/vint16_avx512.h
@@ -0,0 +1,472 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 16-wide AVX-512 integer type */
+  template<>
+  struct vint<16>
+  {
+    ALIGNED_STRUCT_(64);
+    
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512i v; 
+      int i[16]; 
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vint() {}
+    __forceinline vint(const vint16& t) { v = t.v; }
+    __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; }
+
+    __forceinline vint(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+
+    __forceinline vint(int i) {
+      v = _mm512_set1_epi32(i);
+    }
+    
+    __forceinline vint(int a, int b, int c, int d) {
+      v = _mm512_set4_epi32(d,c,b,a);      
+    }
+
+    __forceinline vint(int a0 , int a1 , int a2 , int a3,
+                       int a4 , int a5 , int a6 , int a7,
+                       int a8 , int a9 , int a10, int a11,
+                       int a12, int a13, int a14, int a15)
+    {
+      v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0);
+    }
+
+    __forceinline vint(const vint4& i) {
+      v = _mm512_broadcast_i32x4(i);
+    }
+
+    __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) {
+      v = _mm512_castsi128_si512(a);
+      v = _mm512_inserti32x4(v, b, 1);
+      v = _mm512_inserti32x4(v, c, 2);
+      v = _mm512_inserti32x4(v, d, 3);
+    }
+
+    __forceinline vint(const vint8& i) {
+      v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+    }
+
+    __forceinline vint(const vint8& a, const vint8& b) {
+      v = _mm512_castsi256_si512(a);
+      v = _mm512_inserti64x4(v, b, 1);
+    }
+   
+    __forceinline explicit vint(const __m512& f) {
+      v = _mm512_cvtps_epi32(f);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vint(ZeroTy)   : v(_mm512_setzero_epi32()) {}
+    __forceinline vint(OneTy)    : v(_mm512_set1_epi32(1)) {}
+    __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {}
+    __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {}
+    __forceinline vint(StepTy)   : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+    __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); }
+
+    static __forceinline vint16 load(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); }
+    static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); }
+
+    static __forceinline vint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+    static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
+
+    static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); }
+
+    static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); }
+    static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); }
+
+    static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); }
+    static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); }
+ 
+    static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); }
+    static __forceinline void storeu(const vboolf16& mask, void* ptr,  const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); }
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); }
+
+    static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) {
+      return _mm512_mask_compress_epi32(v,mask,v);
+    }
+
+    static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) {
+      return _mm512_mask_compress_epi32(a,mask,b);
+    }
+
+    static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) {
+      return _mm512_mask_expand_epi32(b,mask,a);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const int* ptr, const vint16& index) {
+      return _mm512_i32gather_epi32(index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) {
+      return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) {
+      return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) {
+      _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) {
+      _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       int& operator [](size_t index)       { assert(index < 16); return i[index]; }
+    __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+
+    __forceinline unsigned int uint    (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; }
+    __forceinline size_t&      uint64_t(size_t index) const { assert(index < 8);  return ((size_t*)i)[index]; }
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); }
+
+  __forceinline vint16 operator +(const vint16& a) { return a; }
+  __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); }
+  __forceinline vint16 operator +(const vint16& a, int           b) { return a + vint16(b); }
+  __forceinline vint16 operator +(int           a, const vint16& b) { return vint16(a) + b; }
+
+  __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); }
+  __forceinline vint16 operator -(const vint16& a, int           b) { return a - vint16(b); }
+  __forceinline vint16 operator -(int           a, const vint16& b) { return vint16(a) - b; }
+
+  __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); }
+  __forceinline vint16 operator *(const vint16& a, int           b) { return a * vint16(b); }
+  __forceinline vint16 operator *(int           a, const vint16& b) { return vint16(a) * b; }
+
+  __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); }
+  __forceinline vint16 operator &(const vint16& a, int           b) { return a & vint16(b); }
+  __forceinline vint16 operator &(int           a, const vint16& b) { return vint16(a) & b; }
+
+  __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); }
+  __forceinline vint16 operator |(const vint16& a, int           b) { return a | vint16(b); }
+  __forceinline vint16 operator |(int           a, const vint16& b) { return vint16(a) | b; }
+
+  __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); }
+  __forceinline vint16 operator ^(const vint16& a, int           b) { return a ^ vint16(b); }
+  __forceinline vint16 operator ^(int           a, const vint16& b) { return vint16(a) ^ b; }
+
+  __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); }
+  __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); }
+
+  __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); }
+  __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); }
+
+  __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); }
+  __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); }
+  __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); }
+  
+  __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); }
+  __forceinline vint16 min(const vint16& a, int           b) { return min(a,vint16(b)); }
+  __forceinline vint16 min(int           a, const vint16& b) { return min(vint16(a),b); }
+
+  __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); }
+  __forceinline vint16 max(const vint16& a, int           b) { return max(a,vint16(b)); }
+  __forceinline vint16 max(int           a, const vint16& b) { return max(vint16(a),b); }
+  
+  __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); }
+  __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); }
+
+  __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); }
+  __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); }
+
+  __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); }
+  __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; }
+  __forceinline vint16& operator +=(vint16& a, int           b) { return a = a + b; }
+  
+  __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; }
+  __forceinline vint16& operator -=(vint16& a, int           b) { return a = a - b; }
+
+  __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; }
+  __forceinline vint16& operator *=(vint16& a, int           b) { return a = a * b; }
+  
+  __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; }
+  __forceinline vint16& operator &=(vint16& a, int           b) { return a = a & b; }
+  
+  __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; }
+  __forceinline vint16& operator |=(vint16& a, int           b) { return a = a | b; }
+  
+  __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; }
+  __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 operator ==(const vint16& a, int           b) { return a == vint16(b); }
+  __forceinline vboolf16 operator ==(int           a, const vint16& b) { return vint16(a) == b; }
+  
+  __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vint16& a, int           b) { return a != vint16(b); }
+  __forceinline vboolf16 operator !=(int           a, const vint16& b) { return vint16(a) != b; }
+  
+  __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (const vint16& a, int           b) { return a <  vint16(b); }
+  __forceinline vboolf16 operator < (int           a, const vint16& b) { return vint16(a) <  b; }
+  
+  __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vint16& a, int           b) { return a >= vint16(b); }
+  __forceinline vboolf16 operator >=(int           a, const vint16& b) { return vint16(a) >= b; }
+
+  __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vint16& a, int           b) { return a >  vint16(b); }
+  __forceinline vboolf16 operator > (int           a, const vint16& b) { return vint16(a) >  b; }
+
+  __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vint16& a, int           b) { return a <= vint16(b); }
+  __forceinline vboolf16 operator <=(int           a, const vint16& b) { return vint16(a) <= b; }
+
+  __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+
+  __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); }
+    
+ 
+  __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) {
+    return _mm512_mask_or_epi32(f,m,t,t); 
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); }
+  __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); }
+
+  template<int i>
+    __forceinline vint16 shuffle(const vint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint16 shuffle(const vint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i>
+  __forceinline vint16 shuffle4(const vint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint16 shuffle4(const vint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i>
+  __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) {
+    return _mm512_alignr_epi32(a, b, i);
+  };
+
+  __forceinline int toScalar(const vint16& v) {
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
+  }
+
+  template<int i> __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); }
+
+  template<int N, int i>
+  vint<N> extractN(const vint16& v);
+
+  template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v);       }
+  template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); }
+  template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); }
+  template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); }
+
+  template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v);       }
+  template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); }
+
+  template<int i> __forceinline vint4 extract4   (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); }
+  template<>      __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v);       }
+
+  template<int i> __forceinline vint8 extract8   (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); }
+  template<>      __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v);       }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 vreduce_min2(vint16 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vint16 vreduce_max2(vint16 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vint16 vreduce_and2(vint16 x) {                      return x & shuffle<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+  __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); }
+
+  __forceinline vint16 vreduce_or2(vint16 x) {                     return x | shuffle<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+  __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); }
+
+  __forceinline vint16 vreduce_add2(vint16 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }
+  __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }
+  
+  __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); }
+  __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); }
+  __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); }
+  __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 conflict(const vint16& index)
+  {
+    return _mm512_conflict_epi32(index);
+  }
+
+  __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index)
+  {
+    return _mm512_mask_conflict_epi32(dest,mask,index);
+  }    
+
+  __forceinline vint16 convert_uint32_t(const __m512& f) {
+    return _mm512_cvtps_epu32(f);
+  }
+
+  __forceinline vint16 permute(vint16 v, vint16 index) {
+    return _mm512_permutexvar_epi32(index,v);  
+  }
+
+  __forceinline vint16 reverse(const vint16 &a) {
+    return permute(a,vint16(reverse_step));
+  }
+
+  __forceinline vint16 prefix_sum(const vint16& a) 
+  {
+    const vint16 z(zero);
+    vint16 v = a;
+    v = v + align_shift_right<16-1>(v,z);
+    v = v + align_shift_right<16-2>(v,z);
+    v = v + align_shift_right<16-4>(v,z);
+    v = v + align_shift_right<16-8>(v,z);
+    return v;  
+  }
+
+  __forceinline vint16 reverse_prefix_sum(const vint16& a) 
+  {
+    const vint16 z(zero);
+    vint16 v = a;
+    v = v + align_shift_right<1>(z,v);
+    v = v + align_shift_right<2>(z,v);
+    v = v + align_shift_right<4>(z,v);
+    v = v + align_shift_right<8>(z,v);
+    return v;  
+  }
+
+  /* this should use a vbool8 and a vint8_64...*/
+  template<int scale = 1, int hint = _MM_HINT_T0>
+    __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset)
+  {
+#if defined(__AVX512PF__)
+    _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint);
+#endif
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v)
+  {
+    cout << "<" << v[0];
+    for (int i=1; i<16; i++) cout << ", " << v[i];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vint4_sse2.h b/thirdparty/embree/common/simd/vint4_sse2.h
new file mode 100644
index 0000000000..9814d5c71c
--- /dev/null
+++ b/thirdparty/embree/common/simd/vint4_sse2.h
@@ -0,0 +1,598 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide SSE integer type */
+  template<>
+  struct vint<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };             // number of SIMD elements
+    union { __m128i v; int i[4]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vint() {}
+    __forceinline vint(const vint4& a) { v = a.v; }
+    __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m128i a) : v(a) {}
+    __forceinline operator const __m128i&() const { return v; }
+    __forceinline operator       __m128i&()       { return v; }
+
+    __forceinline vint(int a) : v(_mm_set1_epi32(a)) {}
+    __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {}
+
+    __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {}
+#if defined(__AVX512VL__)
+    __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {}
+#else
+    __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {}
+#endif
+
+    __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm_setzero_si128()) {}
+    __forceinline vint(OneTy)         : v(_mm_set_epi32(1, 1, 1, 1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm_set_epi32(3, 2, 1, 0)) {}
+    __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {}
+
+    __forceinline vint(TrueTy)   { v = _mm_cmpeq_epi32(v,v); }
+    __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {}
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); }
+    static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); }
+
+    static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); }
+    
+#if defined(__AVX512VL__)
+
+    static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) {
+      return _mm_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) {
+      return _mm_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
+    static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__)
+    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else
+    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); }
+    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
+#endif
+
+
+#if defined(__SSE4_1__)
+    static __forceinline vint4 load(const unsigned char* ptr) {
+      return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+    }
+
+    static __forceinline vint4 loadu(const unsigned char* ptr) {
+      return  _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+    }
+#else
+
+    static __forceinline vint4 load(const unsigned char* ptr) {
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    } 
+
+    static __forceinline vint4 loadu(const unsigned char* ptr) {
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+
+#endif
+
+    static __forceinline vint4 load(const unsigned short* ptr) {
+#if defined (__SSE4_1__)
+      return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
+#else
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+#endif
+    } 
+
+    static __forceinline void store(unsigned char* ptr, const vint4& v) {
+#if defined(__SSE4_1__)
+      __m128i x = v;
+      x = _mm_packus_epi32(x, x);
+      x = _mm_packus_epi16(x, x);
+      *(int*)ptr = _mm_cvtsi128_si32(x);
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (unsigned char)v[i];
+#endif
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint4& v) {
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    static __forceinline vint4 load_nt(void* ptr) {
+#if defined(__SSE4_1__)
+      return _mm_stream_load_si128((__m128i*)ptr); 
+#else
+      return _mm_load_si128((__m128i*)ptr); 
+#endif
+    }
+    
+    static __forceinline void store_nt(void* ptr, const vint4& v) {
+#if defined(__SSE4_1__)
+      _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
+#else
+      _mm_store_si128((__m128i*)ptr,v);
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vint4 gather(const int* ptr, const vint4& index) {
+#if defined(__AVX2__)
+      return _mm_i32gather_epi32(ptr, index, scale);
+#else
+      return vint4(
+          *(int*)(((char*)ptr)+scale*index[0]),
+          *(int*)(((char*)ptr)+scale*index[1]),
+          *(int*)(((char*)ptr)+scale*index[2]),
+          *(int*)(((char*)ptr)+scale*index[3]));
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) {
+      vint4 r = zero;
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)
+      return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_i32scatter_epi32((int*)ptr, index, v, scale);
+#else
+      *(int*)(((char*)ptr)+scale*index[0]) = v[0];
+      *(int*)(((char*)ptr)+scale*index[1]) = v[1];
+      *(int*)(((char*)ptr)+scale*index[2]) = v[2];
+      *(int*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale);
+#else
+      if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0];
+      if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+#if defined(__x86_64__)
+    static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); }
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; }
+    __forceinline       int& operator [](size_t index)       { assert(index < 4); return i[index]; }
+
+    friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) {
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__SSE4_1__)
+      return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); 
+#else
+      return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); 
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); }
+#else
+  __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); }
+#endif
+
+  __forceinline vint4 operator +(const vint4& a) { return a; }
+  __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
+#if defined(__SSSE3__)
+  __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); }
+  __forceinline vint4 operator +(const vint4& a, int          b) { return a + vint4(b); }
+  __forceinline vint4 operator +(int          a, const vint4& b) { return vint4(a) + b; }
+
+  __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); }
+  __forceinline vint4 operator -(const vint4& a, int          b) { return a - vint4(b); }
+  __forceinline vint4 operator -(int          a, const vint4& b) { return vint4(a) - b; }
+
+#if defined(__SSE4_1__)
+  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); }
+#else
+  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
+#endif
+  __forceinline vint4 operator *(const vint4& a, int          b) { return a * vint4(b); }
+  __forceinline vint4 operator *(int          a, const vint4& b) { return vint4(a) * b; }
+
+  __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); }
+  __forceinline vint4 operator &(const vint4& a, int          b) { return a & vint4(b); }
+  __forceinline vint4 operator &(int          a, const vint4& b) { return vint4(a) & b; }
+
+  __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); }
+  __forceinline vint4 operator |(const vint4& a, int          b) { return a | vint4(b); }
+  __forceinline vint4 operator |(int          a, const vint4& b) { return vint4(a) | b; }
+
+  __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); }
+  __forceinline vint4 operator ^(const vint4& a, int          b) { return a ^ vint4(b); }
+  __forceinline vint4 operator ^(int          a, const vint4& b) { return vint4(a) ^ b; }
+
+  __forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); }
+  __forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); }
+
+  __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); }
+  __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); }
+  __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; }
+  __forceinline vint4& operator +=(vint4& a, int          b) { return a = a + b; }
+  
+  __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
+  __forceinline vint4& operator -=(vint4& a, int          b) { return a = a - b; }
+
+#if defined(__SSE4_1__)
+  __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; }
+  __forceinline vint4& operator *=(vint4& a, int          b) { return a = a * b; }
+#endif
+  
+  __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; }
+  __forceinline vint4& operator &=(vint4& a, int          b) { return a = a & b; }
+  
+  __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; }
+  __forceinline vint4& operator |=(vint4& a, int          b) { return a = a | b; }
+  
+  __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; }
+  __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+#else
+  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); }
+  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); }
+  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a <  b); }
+  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); }
+  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a >  b); }
+#endif
+
+  __forceinline vboolf4 operator ==(const vint4& a, int          b) { return a == vint4(b); }
+  __forceinline vboolf4 operator ==(int          a, const vint4& b) { return vint4(a) == b; }
+
+  __forceinline vboolf4 operator !=(const vint4& a, int          b) { return a != vint4(b); }
+  __forceinline vboolf4 operator !=(int          a, const vint4& b) { return vint4(a) != b; }
+
+  __forceinline vboolf4 operator < (const vint4& a, int          b) { return a <  vint4(b); }
+  __forceinline vboolf4 operator < (int          a, const vint4& b) { return vint4(a) <  b; }
+
+  __forceinline vboolf4 operator >=(const vint4& a, int          b) { return a >= vint4(b); }
+  __forceinline vboolf4 operator >=(int          a, const vint4& b) { return vint4(a) >= b; }
+
+  __forceinline vboolf4 operator > (const vint4& a, int          b) { return a >  vint4(b); }
+  __forceinline vboolf4 operator > (int          a, const vint4& b) { return vint4(a) >  b; }
+
+  __forceinline vboolf4 operator <=(const vint4& a, int          b) { return a <= vint4(b); }
+  __forceinline vboolf4 operator <=(int          a, const vint4& b) { return vint4(a) <= b; }
+
+  __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; }
+  __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; }
+  __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a <  b; }
+  __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; }
+  __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a >  b; }
+  __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <  b); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >  b); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); }
+#endif
+
+  template<int mask>
+  __forceinline vint4 select(const vint4& t, const vint4& f) {
+#if defined(__SSE4_1__) 
+    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
+#else
+    return select(vboolf4(mask), t, f);
+#endif    
+  }
+
+#if defined(__SSE4_1__)
+  __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); }
+  __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }
+
+  __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); }
+  __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); }
+
+#else
+  __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); }
+  __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); }
+#endif
+
+  __forceinline vint4 min(const vint4& a, int          b) { return min(a,vint4(b)); }
+  __forceinline vint4 min(int          a, const vint4& b) { return min(vint4(a),b); }
+  __forceinline vint4 max(const vint4& a, int          b) { return max(a,vint4(b)); }
+  __forceinline vint4 max(int          a, const vint4& b) { return max(vint4(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+  __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint4 shuffle(const vint4& v) {
+    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
+    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+#if defined(__SSE3__)
+  template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+#endif
+
+  template<int i>
+  __forceinline vint4 shuffle(const vint4& v) {
+    return shuffle<i,i,i,i>(v);
+  }
+
+#if defined(__SSE4_1__)
+  template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
+  template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
+#else
+  template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; }
+  template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
+#endif
+
+
+  template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); }
+
+  __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); }
+
+  __forceinline size_t toSizeT(const vint4& v) { 
+#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
+    return toScalar(v);
+#else
+    return _mm_cvtsi128_si64(v); 
+#endif
+  }
+
+#if defined(__AVX512VL__)
+
+  __forceinline vint4 permute(const vint4 &a, const vint4 &index) {
+    return  _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index));
+  }
+
+  template<int i>
+  __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) {
+    return _mm_alignr_epi32(a, b, i);    
+  }  
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+
+  __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); }
+  __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }
+
+  __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); }
+  __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+#else
+
+  __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); }
+  __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); }
+  __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+
+  __forceinline vint4 usort_ascending(const vint4& v)
+  {
+    const vint4 a0 = v;
+    const vint4 b0 = shuffle<1,0,3,2>(a0);
+    const vint4 c0 = umin(a0,b0);
+    const vint4 d0 = umax(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+    const vint4 b1 = shuffle<2,3,0,1>(a1);
+    const vint4 c1 = umin(a1,b1);
+    const vint4 d1 = umax(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vint4 b2 = shuffle<0,2,1,3>(a2);
+    const vint4 c2 = umin(a2,b2);
+    const vint4 d2 = umax(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3;
+  }
+
+  __forceinline vint4 usort_descending(const vint4& v)
+  {
+    const vint4 a0 = v;
+    const vint4 b0 = shuffle<1,0,3,2>(a0);
+    const vint4 c0 = umax(a0,b0);
+    const vint4 d0 = umin(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+    const vint4 b1 = shuffle<2,3,0,1>(a1);
+    const vint4 c1 = umax(a1,b1);
+    const vint4 d1 = umin(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vint4 b2 = shuffle<0,2,1,3>(a2);
+    const vint4 c2 = umax(a2,b2);
+    const vint4 d2 = umin(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3;
+  }
+
+#else
+
+  __forceinline vint4 usort_ascending(const vint4& v)
+  {
+    const vint4 a0 = v-vint4(0x80000000);
+    const vint4 b0 = shuffle<1,0,3,2>(a0);
+    const vint4 c0 = min(a0,b0);
+    const vint4 d0 = max(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+    const vint4 b1 = shuffle<2,3,0,1>(a1);
+    const vint4 c1 = min(a1,b1);
+    const vint4 d1 = max(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vint4 b2 = shuffle<0,2,1,3>(a2);
+    const vint4 c2 = min(a2,b2);
+    const vint4 d2 = max(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3+vint4(0x80000000);
+  }
+
+  __forceinline vint4 usort_descending(const vint4& v)
+  {
+    const vint4 a0 = v-vint4(0x80000000);
+    const vint4 b0 = shuffle<1,0,3,2>(a0);
+    const vint4 c0 = max(a0,b0);
+    const vint4 d0 = min(a0,b0);
+    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
+    const vint4 b1 = shuffle<2,3,0,1>(a1);
+    const vint4 c1 = max(a1,b1);
+    const vint4 d1 = min(a1,b1);
+    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
+    const vint4 b2 = shuffle<0,2,1,3>(a2);
+    const vint4 c2 = max(a2,b2);
+    const vint4 d2 = min(a2,b2);
+    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
+    return a3+vint4(0x80000000);
+  }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vint8_avx.h b/thirdparty/embree/common/simd/vint8_avx.h
new file mode 100644
index 0000000000..f43e9a8c22
--- /dev/null
+++ b/thirdparty/embree/common/simd/vint8_avx.h
@@ -0,0 +1,470 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX integer type */
+  template<>
+  struct vint<8>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };        // number of SIMD elements
+    union {                    // data
+      __m256i v;
+      struct { __m128i vl,vh; };
+      int i[8];
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint() {}
+    __forceinline vint(const vint8& a) { v = a.v; }
+    __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {}
+ 
+    __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+    __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {}
+    __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+    __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+    __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {}
+
+    __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm256_setzero_si256()) {}
+    __forceinline vint(OneTy)         : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+    __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {}
+    __forceinline vint(UndefinedTy)   : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); }
+    static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); }
+
+    static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+
+    static __forceinline void store_nt(void* ptr, const vint8& v) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline vint8 load(const unsigned char* ptr) {
+      vint4 il = vint4::load(ptr+0);
+      vint4 ih = vint4::load(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 loadu(const unsigned char* ptr) {
+      vint4 il = vint4::loadu(ptr+0);
+      vint4 ih = vint4::loadu(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 load(const unsigned short* ptr) {
+      vint4 il = vint4::load(ptr+0);
+      vint4 ih = vint4::load(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 loadu(const unsigned short* ptr) {
+      vint4 il = vint4::loadu(ptr+0);
+      vint4 ih = vint4::loadu(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline void store(unsigned char* ptr, const vint8& i) {
+      vint4 il(i.vl);
+      vint4 ih(i.vh);
+      vint4::store(ptr + 0,il);
+      vint4::store(ptr + 4,ih);
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint8& v) {
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4>
+    static __forceinline vint8 gather(const int* ptr, const vint8& index) {
+      return vint8(
+          *(int*)(((char*)ptr)+scale*index[0]),
+          *(int*)(((char*)ptr)+scale*index[1]),
+          *(int*)(((char*)ptr)+scale*index[2]),
+          *(int*)(((char*)ptr)+scale*index[3]),
+          *(int*)(((char*)ptr)+scale*index[4]),
+          *(int*)(((char*)ptr)+scale*index[5]),
+          *(int*)(((char*)ptr)+scale*index[6]),
+          *(int*)(((char*)ptr)+scale*index[7]));
+    }
+
+    template<int scale = 4>
+    static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) {
+      vint8 r = zero;
+      if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(int*)(((char*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(int*)(((char*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(int*)(((char*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(int*)(((char*)ptr)+scale*index[7]);
+      return r;
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+    {
+      *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+    {
+      if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+
+    static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+    __forceinline       int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); }
+
+  __forceinline vint8 operator +(const vint8& a) { return a; }
+  __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); }
+  __forceinline vint8 abs       (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); }
+  __forceinline vint8 operator +(const vint8& a, int          b) { return a + vint8(b); }
+  __forceinline vint8 operator +(int          a, const vint8& b) { return vint8(a) + b; }
+
+  __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); }
+  __forceinline vint8 operator -(const vint8& a, int          b) { return a - vint8(b); }
+  __forceinline vint8 operator -(int          a, const vint8& b) { return vint8(a) - b; }
+
+  __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); }
+  __forceinline vint8 operator *(const vint8& a, int          b) { return a * vint8(b); }
+  __forceinline vint8 operator *(int          a, const vint8& b) { return vint8(a) * b; }
+
+  __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vint8 operator &(const vint8& a, int          b) { return a & vint8(b); }
+  __forceinline vint8 operator &(int          a, const vint8& b) { return vint8(a) & b; }
+
+  __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vint8 operator |(const vint8& a, int          b) { return a | vint8(b); }
+  __forceinline vint8 operator |(int          a, const vint8& b) { return vint8(a) | b; }
+
+  __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vint8 operator ^(const vint8& a, int          b) { return a ^ vint8(b); }
+  __forceinline vint8 operator ^(int          a, const vint8& b) { return vint8(a) ^ b; }
+
+  __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); }
+  __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); }
+
+  __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); }
+  __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); }
+  __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); }
+  
+  __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); }
+  __forceinline vint8 min(const vint8& a, int          b) { return min(a,vint8(b)); }
+  __forceinline vint8 min(int          a, const vint8& b) { return min(vint8(a),b); }
+
+  __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); }
+  __forceinline vint8 max(const vint8& a, int          b) { return max(a,vint8(b)); }
+  __forceinline vint8 max(int          a, const vint8& b) { return max(vint8(a),b); }
+
+  __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); }
+  __forceinline vint8 umin(const vint8& a, int          b) { return umin(a,vint8(b)); }
+  __forceinline vint8 umin(int          a, const vint8& b) { return umin(vint8(a),b); }
+
+  __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); }
+  __forceinline vint8 umax(const vint8& a, int          b) { return umax(a,vint8(b)); }
+  __forceinline vint8 umax(int          a, const vint8& b) { return umax(vint8(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; }
+  __forceinline vint8& operator +=(vint8& a, int          b) { return a = a + b; }
+  
+  __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; }
+  __forceinline vint8& operator -=(vint8& a, int          b) { return a = a - b; }
+  
+  __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; }
+  __forceinline vint8& operator *=(vint8& a, int          b) { return a = a * b; }
+  
+  __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; }
+  __forceinline vint8& operator &=(vint8& a, int          b) { return a = a & b; }
+  
+  __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; }
+  __forceinline vint8& operator |=(vint8& a, int          b) { return a = a | b; }
+  
+  __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; }
+  __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)),
+                                                                                     _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); }
+  __forceinline vboolf8 operator ==(const vint8& a, int          b) { return a == vint8(b); }
+  __forceinline vboolf8 operator ==(int          a, const vint8& b) { return vint8(a) == b; }
+  
+  __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); }
+  __forceinline vboolf8 operator !=(const vint8& a, int          b) { return a != vint8(b); }
+  __forceinline vboolf8 operator !=(int          a, const vint8& b) { return vint8(a) != b; }
+  
+  __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)),
+                                                                                     _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); }
+  __forceinline vboolf8 operator < (const vint8& a, int          b) { return a <  vint8(b); }
+  __forceinline vboolf8 operator < (int          a, const vint8& b) { return vint8(a) <  b; }
+  
+  __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a <  b); }
+  __forceinline vboolf8 operator >=(const vint8& a, int          b) { return a >= vint8(b); }
+  __forceinline vboolf8 operator >=(int          a, const vint8& b) { return vint8(a) >= b; }
+
+  __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)),
+                                                                                     _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); }
+  __forceinline vboolf8 operator > (const vint8& a, int          b) { return a >  vint8(b); }
+  __forceinline vboolf8 operator > (int          a, const vint8& b) { return vint8(a) >  b; }
+
+  __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a >  b); }
+  __forceinline vboolf8 operator <=(const vint8& a, int          b) { return a <= vint8(b); }
+  __forceinline vboolf8 operator <=(int          a, const vint8& b) { return vint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; }
+  __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; }
+  __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a <  b; }
+  __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; }
+  __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a >  b; }
+  __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; }
+
+  __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); }
+  __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <  b); }
+  __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); }
+  __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >  b); }
+  __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); }
+
+  __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); 
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+
+  template<int i>
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& a, const vint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& a, const vint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); }
+  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); }
+  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); }
+  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); }
+
+  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); }
+  __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+  __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+  __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); }
+  __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); }
+
+  __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 usort_ascending(const vint8& v)
+  {
+    const vint8 a0 = v;
+    const vint8 b0 = shuffle<1,0,3,2>(a0);
+    const vint8 c0 = umin(a0,b0);
+    const vint8 d0 = umax(a0,b0);
+    const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0);
+    const vint8 b1 = shuffle<2,3,0,1>(a1);
+    const vint8 c1 = umin(a1,b1);
+    const vint8 d1 = umax(a1,b1);
+    const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1);
+    const vint8 b2 = shuffle<1,0,3,2>(a2);
+    const vint8 c2 = umin(a2,b2);
+    const vint8 d2 = umax(a2,b2);
+    const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2);
+    const vint8 b3 = shuffle4<1,0>(a3);
+    const vint8 c3 = umin(a3,b3);
+    const vint8 d3 = umax(a3,b3);
+    const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3);
+    const vint8 b4 = shuffle<2,3,0,1>(a4);
+    const vint8 c4 = umin(a4,b4);
+    const vint8 d4 = umax(a4,b4);
+    const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4);
+    const vint8 b5 = shuffle<1,0,3,2>(a5);
+    const vint8 c5 = umin(a5,b5);
+    const vint8 d5 = umax(a5,b5);
+    const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5);
+    return a6;
+  }
+
+  __forceinline vint8 usort_descending(const vint8& v)
+  {
+    const vint8 a0 = v;
+    const vint8 b0 = shuffle<1,0,3,2>(a0);
+    const vint8 c0 = umax(a0,b0);
+    const vint8 d0 = umin(a0,b0);
+    const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0);
+    const vint8 b1 = shuffle<2,3,0,1>(a1);
+    const vint8 c1 = umax(a1,b1);
+    const vint8 d1 = umin(a1,b1);
+    const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1);
+    const vint8 b2 = shuffle<1,0,3,2>(a2);
+    const vint8 c2 = umax(a2,b2);
+    const vint8 d2 = umin(a2,b2);
+    const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2);
+    const vint8 b3 = shuffle4<1,0>(a3);
+    const vint8 c3 = umax(a3,b3);
+    const vint8 d3 = umin(a3,b3);
+    const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3);
+    const vint8 b4 = shuffle<2,3,0,1>(a4);
+    const vint8 c4 = umax(a4,b4);
+    const vint8 d4 = umin(a4,b4);
+    const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4);
+    const vint8 b5 = shuffle<1,0,3,2>(a5);
+    const vint8 c5 = umax(a5,b5);
+    const vint8 d5 = umin(a5,b5);
+    const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5);
+    return a6;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vint8_avx2.h b/thirdparty/embree/common/simd/vint8_avx2.h
new file mode 100644
index 0000000000..e04737ffbe
--- /dev/null
+++ b/thirdparty/embree/common/simd/vint8_avx2.h
@@ -0,0 +1,518 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX integer type */
+  template<>
+  struct vint<8>
+  {
+    ALIGNED_STRUCT_(32);
+
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m256i v;
+      int i[8];
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint() {}
+    __forceinline vint(const vint8& a) { v = a.v; }
+    __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+ 
+    __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+    __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {}
+    __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+    __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+    __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {}
+
+    __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {}
+#else
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {}
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm256_setzero_si256()) {}
+    __forceinline vint(OneTy)         : v(_mm256_set1_epi32(1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm256_set1_epi32(pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm256_set1_epi32(neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+    __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {}
+    __forceinline vint(UndefinedTy)   : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint8 load(const unsigned char* ptr)  { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vint8 load(const unsigned short* ptr)  { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
+    static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+
+    static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); }
+    static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); }
+
+    static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); }
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) {
+      return _mm256_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) {
+      return _mm256_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); }
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+#else
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+#endif
+    
+    static __forceinline vint8 load_nt(void* ptr) {
+      return _mm256_stream_load_si256((__m256i*)ptr);
+    }
+
+    static __forceinline void store_nt(void* ptr, const vint8& v) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline void store(unsigned char* ptr, const vint8& i)
+    {
+      for (size_t j=0; j<8; j++)
+        ptr[j] = i[j];
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint8& v) {
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4>
+    static __forceinline vint8 gather(const int *const ptr, const vint8& index) {
+      return _mm256_i32gather_epi32(ptr, index, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) {
+      vint8 r = zero;
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#else
+      return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+      *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+    __forceinline       int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); }
+#else
+  static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); }
+#endif
+
+  __forceinline vint8 operator +(const vint8& a) { return a; }
+  __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); }
+  __forceinline vint8 abs       (const vint8& a) { return _mm256_abs_epi32(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); }
+  __forceinline vint8 operator +(const vint8& a, int          b) { return a + vint8(b); }
+  __forceinline vint8 operator +(int          a, const vint8& b) { return vint8(a) + b; }
+
+  __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); }
+  __forceinline vint8 operator -(const vint8& a, int          b) { return a - vint8(b); }
+  __forceinline vint8 operator -(int          a, const vint8& b) { return vint8(a) - b; }
+
+  __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); }
+  __forceinline vint8 operator *(const vint8& a, int          b) { return a * vint8(b); }
+  __forceinline vint8 operator *(int          a, const vint8& b) { return vint8(a) * b; }
+
+  __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); }
+  __forceinline vint8 operator &(const vint8& a, int          b) { return a & vint8(b); }
+  __forceinline vint8 operator &(int          a, const vint8& b) { return vint8(a) & b; }
+
+  __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); }
+  __forceinline vint8 operator |(const vint8& a, int          b) { return a | vint8(b); }
+  __forceinline vint8 operator |(int          a, const vint8& b) { return vint8(a) | b; }
+
+  __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); }
+  __forceinline vint8 operator ^(const vint8& a, int          b) { return a ^ vint8(b); }
+  __forceinline vint8 operator ^(int          a, const vint8& b) { return vint8(a) ^ b; }
+
+  __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); }
+  __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); }
+
+  __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); }
+  __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); }
+
+  __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); }
+  __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); }
+  __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); }
+
+  __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); }
+  __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); }
+  __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); }
+  
+  __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); }
+  __forceinline vint8 min(const vint8& a, int          b) { return min(a,vint8(b)); }
+  __forceinline vint8 min(int          a, const vint8& b) { return min(vint8(a),b); }
+
+  __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); }
+  __forceinline vint8 max(const vint8& a, int          b) { return max(a,vint8(b)); }
+  __forceinline vint8 max(int          a, const vint8& b) { return max(vint8(a),b); }
+
+  __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); }
+  __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; }
+  __forceinline vint8& operator +=(vint8& a, int          b) { return a = a + b; }
+  
+  __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; }
+  __forceinline vint8& operator -=(vint8& a, int          b) { return a = a - b; }
+  
+  __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; }
+  __forceinline vint8& operator *=(vint8& a, int          b) { return a = a * b; }
+  
+  __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; }
+  __forceinline vint8& operator &=(vint8& a, int          b) { return a = a & b; }
+  
+  __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; }
+  __forceinline vint8& operator |=(vint8& a, int          b) { return a = a | b; }
+  
+  __forceinline vint8& operator <<=(vint8& a, const int b) { return a = a << b; }
+  __forceinline vint8& operator >>=(vint8& a, const int b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+  static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+
+  static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
+    return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+  }
+#else
+  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+  static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); }
+  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); }
+  static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a <  b); }
+  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); }
+  static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a >  b); }
+
+  static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+  }
+#endif
+
+  template<int mask>
+  __forceinline vint8 select(const vint8& t, const vint8& f) {
+    return _mm256_blend_epi32(f, t, mask);
+  }
+
+  __forceinline vboolf8 operator ==(const vint8& a, int          b) { return a == vint8(b); }
+  __forceinline vboolf8 operator ==(int          a, const vint8& b) { return vint8(a) == b; }
+
+  __forceinline vboolf8 operator !=(const vint8& a, int          b) { return a != vint8(b); }
+  __forceinline vboolf8 operator !=(int          a, const vint8& b) { return vint8(a) != b; }
+
+  __forceinline vboolf8 operator < (const vint8& a, int          b) { return a <  vint8(b); }
+  __forceinline vboolf8 operator < (int          a, const vint8& b) { return vint8(a) <  b; }
+
+  __forceinline vboolf8 operator >=(const vint8& a, int          b) { return a >= vint8(b); }
+  __forceinline vboolf8 operator >=(int          a, const vint8& b) { return vint8(a) >= b; }
+
+  __forceinline vboolf8 operator > (const vint8& a, int          b) { return a >  vint8(b); }
+  __forceinline vboolf8 operator > (int          a, const vint8& b) { return vint8(a) >  b; }
+
+  __forceinline vboolf8 operator <=(const vint8& a, int          b) { return a <= vint8(b); }
+  __forceinline vboolf8 operator <=(int          a, const vint8& b) { return vint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; }
+  __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; }
+  __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a <  b; }
+  __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; }
+  __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a >  b; }
+  __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <  b); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >  b); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); }
+  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); }
+
+  template<int i>
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vint8 shuffle4(const vint8& a, const vint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint8 shuffle(const vint8& a, const vint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); }
+
+  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); }
+  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); }
+  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); }
+
+  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+  __forceinline vint8 permute(const vint8& v, const __m256i& index) {
+    return _mm256_permutevar8x32_epi32(v, index);
+  }
+
+  __forceinline vint8 shuffle(const vint8& v, const __m256i& index) {
+    return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+  }
+
+  template<int i>
+  static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_alignr_epi32(a, b, i);    
+#else
+    return _mm256_alignr_epi8(a, b, 4*i);
+#endif
+  }  
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); }
+  __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+  __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+  __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); }
+  __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); }
+
+  __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 usort_ascending(const vint8& v)
+  {
+    const vint8 a0 = v;
+    const vint8 b0 = shuffle<1,0,3,2>(a0);
+    const vint8 c0 = umin(a0,b0);
+    const vint8 d0 = umax(a0,b0);
+    const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0);
+    const vint8 b1 = shuffle<2,3,0,1>(a1);
+    const vint8 c1 = umin(a1,b1);
+    const vint8 d1 = umax(a1,b1);
+    const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vint8 b2 = shuffle<1,0,3,2>(a2);
+    const vint8 c2 = umin(a2,b2);
+    const vint8 d2 = umax(a2,b2);
+    const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vint8 b3 = shuffle4<1,0>(a3);
+    const vint8 c3 = umin(a3,b3);
+    const vint8 d3 = umax(a3,b3);
+    const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+    const vint8 b4 = shuffle<2,3,0,1>(a4);
+    const vint8 c4 = umin(a4,b4);
+    const vint8 d4 = umax(a4,b4);
+    const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vint8 b5 = shuffle<1,0,3,2>(a5);
+    const vint8 c5 = umin(a5,b5);
+    const vint8 d5 = umax(a5,b5);
+    const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+    return a6;
+  }
+
+  __forceinline vint8 usort_descending(const vint8& v)
+  {
+    const vint8 a0 = v;
+    const vint8 b0 = shuffle<1,0,3,2>(a0);
+    const vint8 c0 = umax(a0,b0);
+    const vint8 d0 = umin(a0,b0);
+    const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0);
+    const vint8 b1 = shuffle<2,3,0,1>(a1);
+    const vint8 c1 = umax(a1,b1);
+    const vint8 d1 = umin(a1,b1);
+    const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vint8 b2 = shuffle<1,0,3,2>(a2);
+    const vint8 c2 = umax(a2,b2);
+    const vint8 d2 = umin(a2,b2);
+    const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vint8 b3 = shuffle4<1,0>(a3);
+    const vint8 c3 = umax(a3,b3);
+    const vint8 d3 = umin(a3,b3);
+    const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3);
+    const vint8 b4 = shuffle<2,3,0,1>(a4);
+    const vint8 c4 = umax(a4,b4);
+    const vint8 d4 = umin(a4,b4);
+    const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vint8 b5 = shuffle<1,0,3,2>(a5);
+    const vint8 c5 = umax(a5,b5);
+    const vint8 d5 = umin(a5,b5);
+    const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5);
+    return a6;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vllong4_avx2.h b/thirdparty/embree/common/simd/vllong4_avx2.h
new file mode 100644
index 0000000000..6c86845877
--- /dev/null
+++ b/thirdparty/embree/common/simd/vllong4_avx2.h
@@ -0,0 +1,352 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 4-wide AVX2 64-bit long long type */
+  template<>
+  struct vllong<4>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboold4 Bool;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union {             // data
+      __m256i v; 
+      long long i[4];
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vllong() {}
+    __forceinline vllong(const vllong4& t) { v = t.v; }
+    __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; }
+
+    __forceinline vllong(const __m256i& t) { v = t; }
+    __forceinline operator __m256i() const { return v; }
+    __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); }
+
+
+    __forceinline vllong(long long i) {
+      v = _mm256_set1_epi64x(i);
+    }
+    
+    __forceinline vllong(long long a, long long b, long long c, long long d) {
+      v = _mm256_set_epi64x(d,c,b,a);      
+    }
+   
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {}
+    __forceinline vllong(OneTy)  : v(_mm256_set1_epi64x(1)) {}
+    __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {}
+    __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a));
+    }
+
+    static __forceinline vllong4 loadu(const void* addr)
+    {
+      return _mm256_loadu_si256((__m256i*)addr);
+    }
+
+    static __forceinline vllong4 load(const vllong4* addr) {
+      return _mm256_load_si256((__m256i*)addr);
+    }
+
+    static __forceinline vllong4 load(const long long* addr) {
+      return _mm256_load_si256((__m256i*)addr);
+    }
+
+    static __forceinline void store(void* ptr, const vllong4& v) {
+      _mm256_store_si256((__m256i*)ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vllong4& v) {
+      _mm256_storeu_si256((__m256i*)ptr,v);
+    }
+
+    static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) {
+#if defined(__AVX512VL__)
+      _mm256_mask_storeu_epi64(ptr,mask,f);
+#else
+      _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f));
+#endif
+    }
+
+    static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) {
+#if defined(__AVX512VL__)
+      _mm256_mask_store_epi64(ptr,mask,f);
+#else
+      _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f));
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       long long& operator [](size_t index)       { assert(index < 4); return i[index]; }
+    __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; }
+
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) {
+  #if defined(__AVX512VL__)
+    return _mm256_mask_blend_epi64(m, f, t);
+  #else
+    return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m));
+  #endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); }
+#else
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); }
+#endif
+
+  __forceinline vllong4 operator +(const vllong4& a) { return a; }
+  __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); }
+  __forceinline vllong4 operator +(const vllong4& a, long long      b) { return a + vllong4(b); }
+  __forceinline vllong4 operator +(long long      a, const vllong4& b) { return vllong4(a) + b; }
+
+  __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); }
+  __forceinline vllong4 operator -(const vllong4& a, long long      b) { return a - vllong4(b); }
+  __forceinline vllong4 operator -(long long      a, const vllong4& b) { return vllong4(a) - b; }
+
+  /* only low 32bit part */
+  __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); }
+  __forceinline vllong4 operator *(const vllong4& a, long long      b) { return a * vllong4(b); }
+  __forceinline vllong4 operator *(long long      a, const vllong4& b) { return vllong4(a) * b; }
+
+  __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); }
+  __forceinline vllong4 operator &(const vllong4& a, long long      b) { return a & vllong4(b); }
+  __forceinline vllong4 operator &(long long      a, const vllong4& b) { return vllong4(a) & b; }
+
+  __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); }
+  __forceinline vllong4 operator |(const vllong4& a, long long      b) { return a | vllong4(b); }
+  __forceinline vllong4 operator |(long long      a, const vllong4& b) { return vllong4(a) | b; }
+
+  __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); }
+  __forceinline vllong4 operator ^(const vllong4& a, long long      b) { return a ^ vllong4(b); }
+  __forceinline vllong4 operator ^(long long      a, const vllong4& b) { return vllong4(a) ^ b; }
+
+  __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); }
+  //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); }
+
+  __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); }
+  //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); }
+  //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); }
+
+  __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); }
+  
+  //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); }
+  //__forceinline vllong4 min(const vllong4& a, long long      b) { return min(a,vllong4(b)); }
+  //__forceinline vllong4 min(long long      a, const vllong4& b) { return min(vllong4(a),b); }
+
+  //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); }
+  //__forceinline vllong4 max(const vllong4& a, long long      b) { return max(a,vllong4(b)); }
+  //__forceinline vllong4 max(long long      a, const vllong4& b) { return max(vllong4(a),b); }
+
+#if defined(__AVX512VL__)
+  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); }
+  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); }
+#else
+  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); }
+  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); }
+#endif
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; }
+  __forceinline vllong4& operator +=(vllong4& a, long long      b) { return a = a + b; }
+  
+  __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; }
+  __forceinline vllong4& operator -=(vllong4& a, long long      b) { return a = a - b; }
+
+  __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; }
+  __forceinline vllong4& operator *=(vllong4& a, long long      b) { return a = a * b; }
+  
+  __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; }
+  __forceinline vllong4& operator &=(vllong4& a, long long      b) { return a = a & b; }
+  
+  __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; }
+  __forceinline vllong4& operator |=(vllong4& a, long long      b) { return a = a | b; }
+  
+  __forceinline vllong4& operator <<=(vllong4& a, long long      b) { return a = a << b; }
+  //__forceinline vllong4& operator >>=(vllong4& a, long long      b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+#else
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); }
+  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); }
+  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); }
+  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); }
+#endif
+
+  __forceinline vboold4 operator ==(const vllong4& a, long long      b) { return a == vllong4(b); }
+  __forceinline vboold4 operator ==(long long      a, const vllong4& b) { return vllong4(a) == b; }
+
+  __forceinline vboold4 operator !=(const vllong4& a, long long      b) { return a != vllong4(b); }
+  __forceinline vboold4 operator !=(long long      a, const vllong4& b) { return vllong4(a) != b; }
+
+  __forceinline vboold4 operator > (const vllong4& a, long long      b) { return a >  vllong4(b); }
+  __forceinline vboold4 operator > (long long      a, const vllong4& b) { return vllong4(a) >  b; }
+
+  __forceinline vboold4 operator < (const vllong4& a, long long      b) { return a <  vllong4(b); }
+  __forceinline vboold4 operator < (long long      a, const vllong4& b) { return vllong4(a) <  b; }
+
+  __forceinline vboold4 operator >=(const vllong4& a, long long      b) { return a >= vllong4(b); }
+  __forceinline vboold4 operator >=(long long      a, const vllong4& b) { return vllong4(a) >= b; }
+
+  __forceinline vboold4 operator <=(const vllong4& a, long long      b) { return a <= vllong4(b); }
+  __forceinline vboold4 operator <=(long long      a, const vllong4& b) { return vllong4(a) <= b; }
+
+  __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; }
+  __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; }
+  __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a <  b; }
+  __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; }
+  __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a >  b; }
+  __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); }
+  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); }
+  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <  b); }
+  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); }
+  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >  b); }
+  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1>
+  __forceinline vllong4 shuffle(const vllong4& v) {
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
+  }
+
+  template<int i>
+  __forceinline vllong4 shuffle(const vllong4& v) {
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1>
+  __forceinline vllong4 shuffle2(const vllong4& v) {
+    return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0));
+  }
+
+  __forceinline long long toScalar(const vllong4& v) {
+    return _mm_cvtsi128_si64(_mm256_castsi256_si128(v));
+  }
+
+#if defined(__AVX512VL__)
+  __forceinline vllong4 permute(const vllong4& a, const __m256i& index) {
+    // workaround for GCC 7.x
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    return _mm256_permutex2var_epi64(a,index,a);
+#else
+    return _mm256_permutexvar_epi64(index,a);
+#endif
+  }
+
+  __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) {
+    return _mm256_permutex2var_epi64(a,index,b);
+  }
+
+#endif
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+
+  __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); }
+  __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); }
+
+  __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); }
+  __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); }
+
+  __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); }
+  __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); }
+
+  __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); }
+  __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); }
+  __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v)
+  {
+    cout << "<" << v[0];
+    for (size_t i=1; i<4; i++) cout << ", " << v[i];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vllong8_avx512.h b/thirdparty/embree/common/simd/vllong8_avx512.h
new file mode 100644
index 0000000000..ee69411637
--- /dev/null
+++ b/thirdparty/embree/common/simd/vllong8_avx512.h
@@ -0,0 +1,358 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 8-wide AVX-512 64-bit long long type */
+  template<>
+  struct vllong<8>
+  {
+    ALIGNED_STRUCT_(64);
+        
+    typedef vboold8 Bool;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m512i v; 
+      long long i[8];
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vllong() {}
+    __forceinline vllong(const vllong8& t) { v = t.v; }
+    __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; }
+
+    __forceinline vllong(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+
+    __forceinline vllong(long long i) {
+      v = _mm512_set1_epi64(i);
+    }
+    
+    __forceinline vllong(long long a, long long b, long long c, long long d) {
+      v = _mm512_set4_epi64(d,c,b,a);      
+    }
+
+    __forceinline vllong(long long a0, long long a1, long long a2, long long a3,
+                         long long a4, long long a5, long long a6, long long a7)
+    {
+      v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0);
+    }
+   
+    __forceinline vllong(const vllong<4>& i) {
+      v = _mm512_broadcast_i64x4(i);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {}
+    __forceinline vllong(OneTy)  : v(_mm512_set1_epi64(1)) {}
+    __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {}
+    __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) {
+      _mm512_stream_si512((__m512i*)ptr,a);
+    }
+
+    static __forceinline vllong8 loadu(const void* addr) {
+      return _mm512_loadu_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const vllong8* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const long long* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const unsigned char* ptr) {
+      return _mm512_cvtepu8_epi64(*(__m128i*)ptr); 
+    }
+
+    static __forceinline void store(void* ptr, const vllong8& v) {
+      _mm512_store_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vllong8& v) {
+      _mm512_storeu_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) {
+      _mm512_mask_storeu_epi64(ptr,mask,f);
+    }
+
+    static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) {
+      _mm512_mask_store_epi64(addr,mask,v2);
+    }
+
+    static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) {
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+
+    static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) {
+      return _mm512_mask_compress_epi64(a,mask,b);
+    }
+
+    static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) {
+      return _mm512_mask_expand_epi64(b,mask,a);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       long long& operator [](size_t index)       { assert(index < 8); return i[index]; }
+    __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; }
+
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); }
+
+  __forceinline vllong8 operator +(const vllong8& a) { return a; }
+  __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); }
+  __forceinline vllong8 operator +(const vllong8& a, long long      b) { return a + vllong8(b); }
+  __forceinline vllong8 operator +(long long      a, const vllong8& b) { return vllong8(a) + b; }
+
+  __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); }
+  __forceinline vllong8 operator -(const vllong8& a, long long      b) { return a - vllong8(b); }
+  __forceinline vllong8 operator -(long long      a, const vllong8& b) { return vllong8(a) - b; }
+
+  __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); }
+  __forceinline vllong8 operator *(const vllong8& a, long long      b) { return a * vllong8(b); }
+  __forceinline vllong8 operator *(long long      a, const vllong8& b) { return vllong8(a) * b; }
+
+  __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); }
+  __forceinline vllong8 operator &(const vllong8& a, long long      b) { return a & vllong8(b); }
+  __forceinline vllong8 operator &(long long      a, const vllong8& b) { return vllong8(a) & b; }
+
+  __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); }
+  __forceinline vllong8 operator |(const vllong8& a, long long      b) { return a | vllong8(b); }
+  __forceinline vllong8 operator |(long long      a, const vllong8& b) { return vllong8(a) | b; }
+
+  __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); }
+  __forceinline vllong8 operator ^(const vllong8& a, long long      b) { return a ^ vllong8(b); }
+  __forceinline vllong8 operator ^(long long      a, const vllong8& b) { return vllong8(a) ^ b; }
+
+  __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); }
+  __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); }
+
+  __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); }
+  __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); }
+
+  __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); }
+  __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); }
+  __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); }
+
+  __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); }
+  __forceinline vllong8 min(const vllong8& a, long long      b) { return min(a,vllong8(b)); }
+  __forceinline vllong8 min(long long      a, const vllong8& b) { return min(vllong8(a),b); }
+
+  __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); }
+  __forceinline vllong8 max(const vllong8& a, long long      b) { return max(a,vllong8(b)); }
+  __forceinline vllong8 max(long long      a, const vllong8& b) { return max(vllong8(a),b); }
+  
+  __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); }
+  __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); }
+
+  __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); }
+  __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8& operator +=(vllong8& a, const vllong8&  b) { return a = a + b; }
+  __forceinline vllong8& operator +=(vllong8& a, long long       b) { return a = a + b; }
+  
+  __forceinline vllong8& operator -=(vllong8& a, const vllong8&  b) { return a = a - b; }
+  __forceinline vllong8& operator -=(vllong8& a, long long       b) { return a = a - b; }
+
+  __forceinline vllong8& operator *=(vllong8& a, const vllong8&  b) { return a = a * b; }
+  __forceinline vllong8& operator *=(vllong8& a, long long       b) { return a = a * b; }
+  
+  __forceinline vllong8& operator &=(vllong8& a, const vllong8&  b) { return a = a & b; }
+  __forceinline vllong8& operator &=(vllong8& a, long long       b) { return a = a & b; }
+  
+  __forceinline vllong8& operator |=(vllong8& a, const vllong8&  b) { return a = a | b; }
+  __forceinline vllong8& operator |=(vllong8& a, long long       b) { return a = a | b; }
+  
+  __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; }
+  __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(const vllong8& a, long long      b) { return a == vllong8(b); }
+  __forceinline vboold8 operator ==(long long      a, const vllong8& b) { return vllong8(a) == b; }
+  
+  __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vllong8& a, long long      b) { return a != vllong8(b); }
+  __forceinline vboold8 operator !=(long long      a, const vllong8& b) { return vllong8(a) != b; }
+  
+  __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vllong8& a, long long      b) { return a <  vllong8(b); }
+  __forceinline vboold8 operator < (long long      a, const vllong8& b) { return vllong8(a) <  b; }
+  
+  __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vllong8& a, long long      b) { return a >= vllong8(b); }
+  __forceinline vboold8 operator >=(long long      a, const vllong8& b) { return vllong8(a) >= b; }
+
+  __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vllong8& a, long long      b) { return a >  vllong8(b); }
+  __forceinline vboold8 operator > (long long      a, const vllong8& b) { return vllong8(a) >  b; }
+
+  __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vllong8& a, long long      b) { return a <= vllong8(b); }
+  __forceinline vboold8 operator <=(long long      a, const vllong8& b) { return vllong8(a) <= b; }
+
+  __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+    
+  __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); }
+
+  __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) {
+    return _mm512_mask_or_epi64(f,m,t,t); 
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1>
+  __forceinline vllong8 shuffle(const vllong8& v) {
+    return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
+  }
+
+  template<int i>
+  __forceinline vllong8 shuffle(const vllong8& v) {
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vllong8 shuffle(const vllong8& v) {
+    return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vllong8 shuffle4(const vllong8& v) {
+    return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+  }
+
+  template<int i>
+  __forceinline vllong8 shuffle4(const vllong8& v) {
+    return shuffle4<i, i>(v);
+  }
+
+  template<int i>
+  __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) {
+    return _mm512_alignr_epi64(a, b, i);
+  };
+
+  __forceinline long long toScalar(const vllong8& v) {
+    return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 vreduce_min2(vllong8 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }
+
+  __forceinline vllong8 vreduce_max2(vllong8 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }
+
+  __forceinline vllong8 vreduce_and2(vllong8 x) {                      return x & shuffle<1,0,3,2>(x); }
+  __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+  __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); }
+
+  __forceinline vllong8 vreduce_or2(vllong8 x) {                     return x | shuffle<1,0,3,2>(x); }
+  __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+  __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); }
+
+  __forceinline vllong8 vreduce_add2(vllong8 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }
+
+  __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); }
+  __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); }
+  __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); }
+  __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 permute(const vllong8& v, const vllong8& index) {
+    return _mm512_permutexvar_epi64(index,v);  
+  }
+
+  __forceinline vllong8 reverse(const vllong8& a) {
+    return permute(a,vllong8(reverse_step));
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v)
+  {
+    cout << "<" << v[0];
+    for (size_t i=1; i<8; i++) cout << ", " << v[i];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vuint16_avx512.h b/thirdparty/embree/common/simd/vuint16_avx512.h
new file mode 100644
index 0000000000..c9eb6682ff
--- /dev/null
+++ b/thirdparty/embree/common/simd/vuint16_avx512.h
@@ -0,0 +1,424 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{ 
+  /* 16-wide AVX-512 unsigned integer type */
+  template<>
+  struct vuint<16>
+  {
+    ALIGNED_STRUCT_(64);   
+
+    typedef vboolf16 Bool;
+    typedef vuint16  UInt;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512i v; 
+      unsigned int i[16]; 
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vuint() {}
+    __forceinline vuint(const vuint16& t) { v = t.v; }
+    __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; }
+
+    __forceinline vuint(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+
+    __forceinline vuint(unsigned int i) {
+      v = _mm512_set1_epi32(i);
+    }
+
+    __forceinline vuint(const vuint4& i) {
+      v = _mm512_broadcast_i32x4(i);
+    }
+
+    __forceinline vuint(const vuint8& i) {
+      v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+    }
+    
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) {
+      v = _mm512_set4_epi32(d,c,b,a);      
+    }
+
+    __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3,
+                        unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7,
+                        unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11,
+                        unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15)
+    {
+      v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0);
+    }
+   
+    __forceinline explicit vuint(const __m512& f) {
+      v = _mm512_cvtps_epu32(f);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {}
+    __forceinline vuint(OneTy)  : v(_mm512_set1_epi32(1)) {}
+    __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+    __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) {
+      _mm512_stream_si512((__m512i*)ptr,a);
+    }
+
+    static __forceinline vuint16 loadu(const void* addr)
+    {
+      return _mm512_loadu_si512(addr);
+    }
+
+    static __forceinline vuint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+    static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
+
+    static __forceinline vuint16 load(const vuint16* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vuint16 load(const unsigned int* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); }
+
+
+    static __forceinline void store(void* ptr, const vuint16& v) {
+      _mm512_store_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vuint16& v) {
+      _mm512_storeu_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) {
+      _mm512_mask_storeu_epi32(ptr,mask,f);
+    }
+
+    static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) {
+      _mm512_mask_store_epi32(addr,mask,v2);
+    }
+
+    static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) {
+      return _mm512_mask_compress_epi32(v,mask,v);
+    }
+
+    static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) {
+      return _mm512_mask_compress_epi32(a,mask,b);
+    }
+
+    static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) {
+      return _mm512_mask_expand_epi32(b,mask,a);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) {
+      return _mm512_i32gather_epi32(index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) {
+      return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) {
+      return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) {
+      _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) {
+      _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 16); return i[index]; }
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+
+    __forceinline unsigned int uint    (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; }
+    __forceinline size_t&      uint64_t(size_t index) const { assert(index < 8);  return ((size_t*)i)[index]; }
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); }
+
+  __forceinline vuint16 operator +(const vuint16& a) { return a; }
+  __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); }
+  __forceinline vuint16 operator +(const vuint16& a, unsigned int   b) { return a + vuint16(b); }
+  __forceinline vuint16 operator +(unsigned int   a, const vuint16& b) { return vuint16(a) + b; }
+
+  __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); }
+  __forceinline vuint16 operator -(const vuint16& a, unsigned int   b) { return a - vuint16(b); }
+  __forceinline vuint16 operator -(unsigned int   a, const vuint16& b) { return vuint16(a) - b; }
+
+  __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); }
+  __forceinline vuint16 operator *(const vuint16& a, unsigned int   b) { return a * vuint16(b); }
+  __forceinline vuint16 operator *(unsigned int   a, const vuint16& b) { return vuint16(a) * b; }
+
+  __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); }
+  __forceinline vuint16 operator &(const vuint16& a, unsigned int   b) { return a & vuint16(b); }
+  __forceinline vuint16 operator &(unsigned int   a, const vuint16& b) { return vuint16(a) & b; }
+
+  __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); }
+  __forceinline vuint16 operator |(const vuint16& a, unsigned int   b) { return a | vuint16(b); }
+  __forceinline vuint16 operator |(unsigned int   a, const vuint16& b) { return vuint16(a) | b; }
+
+  __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); }
+  __forceinline vuint16 operator ^(const vuint16& a, unsigned int   b) { return a ^ vuint16(b); }
+  __forceinline vuint16 operator ^(unsigned int   a, const vuint16& b) { return vuint16(a) ^ b; }
+
+  __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); }
+  __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); }
+
+  __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); }
+  __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); }
+
+  __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); }
+  __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); }
+  __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); }
+  
+  __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); }
+  __forceinline vuint16 min(const vuint16& a, unsigned int   b) { return min(a,vuint16(b)); }
+  __forceinline vuint16 min(unsigned int   a, const vuint16& b) { return min(vuint16(a),b); }
+
+  __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); }
+  __forceinline vuint16 max(const vuint16& a, unsigned int   b) { return max(a,vuint16(b)); }
+  __forceinline vuint16 max(unsigned int   a, const vuint16& b) { return max(vuint16(a),b); }
+  
+  __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); }
+  __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); }
+
+  __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); }
+  __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; }
+  __forceinline vuint16& operator +=(vuint16& a, unsigned int   b) { return a = a + b; }
+  
+  __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; }
+  __forceinline vuint16& operator -=(vuint16& a, unsigned int   b) { return a = a - b; }
+
+  __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; }
+  __forceinline vuint16& operator *=(vuint16& a, unsigned int   b) { return a = a * b; }
+  
+  __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; }
+  __forceinline vuint16& operator &=(vuint16& a, unsigned int   b) { return a = a & b; }
+  
+  __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; }
+  __forceinline vuint16& operator |=(vuint16& a, unsigned int   b) { return a = a | b; }
+  
+  __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; }
+  __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { return a = a >> b; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 operator ==(const vuint16& a, unsigned int   b) { return a == vuint16(b); }
+  __forceinline vboolf16 operator ==(unsigned int   a, const vuint16& b) { return vuint16(a) == b; }
+  
+  __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vuint16& a, unsigned int   b) { return a != vuint16(b); }
+  __forceinline vboolf16 operator !=(unsigned int   a, const vuint16& b) { return vuint16(a) != b; }
+  
+  __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (const vuint16& a, unsigned int   b) { return a <  vuint16(b); }
+  __forceinline vboolf16 operator < (unsigned int   a, const vuint16& b) { return vuint16(a) <  b; }
+  
+  __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vuint16& a, unsigned int   b) { return a >= vuint16(b); }
+  __forceinline vboolf16 operator >=(unsigned int   a, const vuint16& b) { return vuint16(a) >= b; }
+
+  __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vuint16& a, unsigned int   b) { return a >  vuint16(b); }
+  __forceinline vboolf16 operator > (unsigned int   a, const vuint16& b) { return vuint16(a) >  b; }
+
+  __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vuint16& a, unsigned int   b) { return a <= vuint16(b); }
+  __forceinline vboolf16 operator <=(unsigned int   a, const vuint16& b) { return vuint16(a) <= b; }
+
+  __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+
+  __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); }
+    
+ 
+  __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) {
+    return _mm512_mask_or_epi32(f,m,t,t); 
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i>
+  __forceinline vuint16 shuffle(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint16 shuffle(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i>
+  __forceinline vuint16 shuffle4(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint16 shuffle4(const vuint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i>
+  __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) {
+    return _mm512_alignr_epi32(a, b, i);
+  };
+
+  __forceinline unsigned int toScalar(const vuint16& v) {
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16 vreduce_min2(vuint16 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vuint16 vreduce_min4(vuint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vuint16 vreduce_max2(vuint16 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vuint16 vreduce_and2(vuint16 x) {                      return x & shuffle<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
+  __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); }
+
+  __forceinline vuint16 vreduce_or2(vuint16 x) {                     return x | shuffle<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
+  __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); }
+
+  __forceinline vuint16 vreduce_add2(vuint16 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }
+  __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }
+
+  __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); }
+  __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); }
+  __forceinline unsigned int reduce_and(const vuint16& v) { return toScalar(vreduce_and(v)); }
+  __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); }
+  __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vuint16 permute(vuint16 v, vuint16 index) {
+    return _mm512_permutexvar_epi32(index,v);  
+  }
+
+  __forceinline vuint16 reverse(const vuint16& a) {
+    return permute(a,vuint16(reverse_step));
+  }
+
+  __forceinline vuint16 prefix_sum(const vuint16& a) 
+  {
+    const vuint16 z(zero);
+    vuint16 v = a;
+    v = v + align_shift_right<16-1>(v,z);
+    v = v + align_shift_right<16-2>(v,z);
+    v = v + align_shift_right<16-4>(v,z);
+    v = v + align_shift_right<16-8>(v,z);
+    return v;  
+  }
+
+  __forceinline vuint16 reverse_prefix_sum(const vuint16& a) 
+  {
+    const vuint16 z(zero);
+    vuint16 v = a;
+    v = v + align_shift_right<1>(z,v);
+    v = v + align_shift_right<2>(z,v);
+    v = v + align_shift_right<4>(z,v);
+    v = v + align_shift_right<8>(z,v);
+    return v;  
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v)
+  {
+    cout << "<" << v[0];
+    for (int i=1; i<16; i++) cout << ", " << v[i];
+    cout << ">";
+    return cout;
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vuint4_sse2.h b/thirdparty/embree/common/simd/vuint4_sse2.h
new file mode 100644
index 0000000000..0601b9ab80
--- /dev/null
+++ b/thirdparty/embree/common/simd/vuint4_sse2.h
@@ -0,0 +1,426 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 4-wide SSE integer type */
+  template<>
+  struct vuint<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vuint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union { __m128i v; unsigned int i[4]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vuint() {}
+    __forceinline vuint(const vuint4& a) { v = a.v; }
+    __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; }
+
+    __forceinline vuint(const __m128i a) : v(a) {}
+    __forceinline operator const __m128i&() const { return v; }
+    __forceinline operator       __m128i&()       { return v; }
+
+
+    __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {}
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {}
+#endif
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {}
+#else
+    __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {}
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm_setzero_si128()) {}
+    __forceinline vuint(OneTy)    : v(_mm_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {}
+    __forceinline vuint(StepTy)   : v(_mm_set_epi32(3, 2, 1, 0)) {}
+    __forceinline vuint(TrueTy)   { v = _mm_cmpeq_epi32(v,v); }
+    __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); }
+    static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); }
+
+    static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); }
+    
+#if defined(__AVX512VL__)
+    static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
+    static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__)
+    static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+    static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else
+    static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); }
+    static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
+#endif
+
+#if defined(__SSE4_1__)
+    static __forceinline vuint4 load(const unsigned char* ptr) {
+      return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+    }
+
+    static __forceinline vuint4 loadu(const unsigned char* ptr) {
+      return  _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
+    }
+
+#endif
+
+    static __forceinline vuint4 load(const unsigned short* ptr) {
+#if defined (__SSE4_1__)
+      return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
+#else
+      return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+#endif
+    } 
+
+    static __forceinline vuint4 load_nt(void* ptr) {
+#if defined(__SSE4_1__)
+      return _mm_stream_load_si128((__m128i*)ptr); 
+#else
+      return _mm_load_si128((__m128i*)ptr); 
+#endif
+    }
+    
+    static __forceinline void store_nt(void* ptr, const vuint4& v) {
+#if defined(__SSE4_1__)
+      _mm_stream_ps((float*)ptr,_mm_castsi128_ps(v)); 
+#else
+      _mm_store_si128((__m128i*)ptr,v);
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) {
+#if defined(__AVX2__)
+      return _mm_i32gather_epi32((const int*)ptr, index, scale);
+#else
+      return vuint4(
+          *(unsigned int*)(((char*)ptr)+scale*index[0]),
+          *(unsigned int*)(((char*)ptr)+scale*index[1]),
+          *(unsigned int*)(((char*)ptr)+scale*index[2]),
+          *(unsigned int*)(((char*)ptr)+scale*index[3]));
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) {
+      vuint4 r = zero;
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)
+      return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 4); return i[index]; }
+
+    friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) {
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__SSE4_1__)
+      return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); 
+#else
+      return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); 
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); }
+#else
+  __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); }
+#endif
+
+  __forceinline vuint4 operator +(const vuint4& a) { return a; }
+  __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); }
+  __forceinline vuint4 operator +(const vuint4& a, unsigned int  b) { return a + vuint4(b); }
+  __forceinline vuint4 operator +(unsigned int  a, const vuint4& b) { return vuint4(a) + b; }
+
+  __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); }
+  __forceinline vuint4 operator -(const vuint4& a, unsigned int  b) { return a - vuint4(b); }
+  __forceinline vuint4 operator -(unsigned int  a, const vuint4& b) { return vuint4(a) - b; }
+
+//#if defined(__SSE4_1__)
+//  __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); }
+//#else
+//  __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
+//#endif
+//  __forceinline vuint4 operator *(const vuint4& a, unsigned int  b) { return a * vuint4(b); }
+//  __forceinline vuint4 operator *(unsigned int  a, const vuint4& b) { return vuint4(a) * b; }
+
+  __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); }
+  __forceinline vuint4 operator &(const vuint4& a, unsigned int  b) { return a & vuint4(b); }
+  __forceinline vuint4 operator &(unsigned int  a, const vuint4& b) { return vuint4(a) & b; }
+
+  __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); }
+  __forceinline vuint4 operator |(const vuint4& a, unsigned int  b) { return a | vuint4(b); }
+  __forceinline vuint4 operator |(unsigned int  a, const vuint4& b) { return vuint4(a) | b; }
+
+  __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); }
+  __forceinline vuint4 operator ^(const vuint4& a, unsigned int  b) { return a ^ vuint4(b); }
+  __forceinline vuint4 operator ^(unsigned int  a, const vuint4& b) { return vuint4(a) ^ b; }
+
+  __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); }
+  __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); }
+
+  __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); }
+  __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); }
+  __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; }
+  __forceinline vuint4& operator +=(vuint4& a, unsigned int  b) { return a = a + b; }
+  
+  __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; }
+  __forceinline vuint4& operator -=(vuint4& a, unsigned int  b) { return a = a - b; }
+
+//#if defined(__SSE4_1__)
+//  __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; }
+//  __forceinline vuint4& operator *=(vuint4& a, unsigned int  b) { return a = a * b; }
+//#endif
+  
+  __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; }
+  __forceinline vuint4& operator &=(vuint4& a, unsigned int  b) { return a = a & b; }
+  
+  __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; }
+  __forceinline vuint4& operator |=(vuint4& a, unsigned int  b) { return a = a | b; }
+  
+  __forceinline vuint4& operator <<=(vuint4& a, unsigned int  b) { return a = a << b; }
+  __forceinline vuint4& operator >>=(vuint4& a, unsigned int  b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+  //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+#else
+  __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+  __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); }
+  //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); }
+  //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a <  b); }
+  //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); }
+  //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a >  b); }
+#endif
+
+  __forceinline vboolf4 operator ==(const vuint4& a, unsigned int  b) { return a == vuint4(b); }
+  __forceinline vboolf4 operator ==(unsigned int  a, const vuint4& b) { return vuint4(a) == b; }
+
+  __forceinline vboolf4 operator !=(const vuint4& a, unsigned int  b) { return a != vuint4(b); }
+  __forceinline vboolf4 operator !=(unsigned int  a, const vuint4& b) { return vuint4(a) != b; }
+
+  //__forceinline vboolf4 operator < (const vuint4& a, unsigned int  b) { return a <  vuint4(b); }
+  //__forceinline vboolf4 operator < (unsigned int  a, const vuint4& b) { return vuint4(a) <  b; }
+
+  //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int  b) { return a >= vuint4(b); }
+  //__forceinline vboolf4 operator >=(unsigned int  a, const vuint4& b) { return vuint4(a) >= b; }
+
+  //__forceinline vboolf4 operator > (const vuint4& a, unsigned int  b) { return a >  vuint4(b); }
+  //__forceinline vboolf4 operator > (unsigned int  a, const vuint4& b) { return vuint4(a) >  b; }
+
+  //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int  b) { return a <= vuint4(b); }
+  //__forceinline vboolf4 operator <=(unsigned int  a, const vuint4& b) { return vuint4(a) <= b; }
+
+  __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; }
+  __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; }
+  //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a <  b; }
+  //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; }
+  //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a >  b; }
+  //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); }
+  //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+  //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+  //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+  //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); }
+  //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <  b); }
+  //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); }
+  //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >  b); }
+  //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); }
+#endif
+
+  template<int mask>
+  __forceinline vuint4 select(const vuint4& t, const vuint4& f) {
+#if defined(__SSE4_1__) 
+    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
+#else
+    return select(vboolf4(mask), t, f);
+#endif    
+  }
+
+/*#if defined(__SSE4_1__)
+  __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); }
+  __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); }
+
+#else
+  __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); }
+  __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); }
+#endif
+
+  __forceinline vuint4 min(const vuint4& a, unsigned int  b) { return min(a,vuint4(b)); }
+  __forceinline vuint4 min(unsigned int  a, const vuint4& b) { return min(vuint4(a),b); }
+  __forceinline vuint4 max(const vuint4& a, unsigned int  b) { return max(a,vuint4(b)); }
+  __forceinline vuint4 max(unsigned int  a, const vuint4& b) { return max(vuint4(a),b); }*/
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+  __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& v) {
+    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
+    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+#if defined(__SSE3__)
+  template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+#endif
+
+  template<int i>
+  __forceinline vuint4 shuffle(const vuint4& v) {
+    return shuffle<i,i,i,i>(v);
+  }
+
+#if defined(__SSE4_1__)
+  template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); }
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); }
+#else
+  template<int src> __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; }
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; }
+#endif
+
+
+  template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); }
+
+  __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if 0
+#if defined(__SSE4_1__)
+
+  __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+
+  __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); }
+  __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); }
+  __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); }
+
+  __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); }
+  __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+#else
+
+  __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); }
+  __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); }
+  __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vuint8_avx.h b/thirdparty/embree/common/simd/vuint8_avx.h
new file mode 100644
index 0000000000..589cd9d731
--- /dev/null
+++ b/thirdparty/embree/common/simd/vuint8_avx.h
@@ -0,0 +1,386 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX integer type */
+  template<>
+  struct vuint<8>
+  {
+    ALIGNED_STRUCT_(32);   
+
+    typedef vboolf8 Bool;
+    typedef vuint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };        // number of SIMD elements
+    union {                    // data
+      __m256i v;
+      struct { __m128i vl,vh; };
+      unsigned int i[8];
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint() {}
+    __forceinline vuint(const vuint8& a) { v = a.v; }
+    __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+    __forceinline vuint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+    __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {}
+ 
+    __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+    __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {}
+
+    __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm256_setzero_si256()) {}
+    __forceinline vuint(OneTy)    : v(_mm256_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {}
+    __forceinline vuint(StepTy)   : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+    __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); }
+    static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); }
+
+    static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+
+    static __forceinline void store_nt(void* ptr, const vuint8& v) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline vuint8 load(const unsigned char* ptr) {
+      vuint4 il = vuint4::load(ptr+0);
+      vuint4 ih = vuint4::load(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 loadu(const unsigned char* ptr) {
+      vuint4 il = vuint4::loadu(ptr+0);
+      vuint4 ih = vuint4::loadu(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 load(const unsigned short* ptr) {
+      vuint4 il = vuint4::load(ptr+0);
+      vuint4 ih = vuint4::load(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 loadu(const unsigned short* ptr) {
+      vuint4 il = vuint4::loadu(ptr+0);
+      vuint4 ih = vuint4::loadu(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline void store(unsigned char* ptr, const vuint8& i) {
+      vuint4 il(i.vl);
+      vuint4 ih(i.vh);
+      vuint4::store(ptr + 0,il);
+      vuint4::store(ptr + 4,ih);
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vuint8& v) {
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) {
+      return vuint8(
+          *(unsigned int*)(((char*)ptr)+scale*index[0]),
+          *(unsigned int*)(((char*)ptr)+scale*index[1]),
+          *(unsigned int*)(((char*)ptr)+scale*index[2]),
+          *(unsigned int*)(((char*)ptr)+scale*index[3]),
+          *(unsigned int*)(((char*)ptr)+scale*index[4]),
+          *(unsigned int*)(((char*)ptr)+scale*index[5]),
+          *(unsigned int*)(((char*)ptr)+scale*index[6]),
+          *(unsigned int*)(((char*)ptr)+scale*index[7]));
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) {
+      vuint8 r = zero;
+      if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(unsigned int*)(((char*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(unsigned int*)(((char*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(unsigned int*)(((char*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(unsigned int*)(((char*)ptr)+scale*index[7]);
+      return r;
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
+    {
+      *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
+    {
+      if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+
+    static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); }
+
+  __forceinline vuint8 operator +(const vuint8& a) { return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); }
+  __forceinline vuint8 operator +(const vuint8& a, unsigned int          b) { return a + vuint8(b); }
+  __forceinline vuint8 operator +(unsigned int          a, const vuint8& b) { return vuint8(a) + b; }
+
+  __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); }
+  __forceinline vuint8 operator -(const vuint8& a, unsigned int          b) { return a - vuint8(b); }
+  __forceinline vuint8 operator -(unsigned int          a, const vuint8& b) { return vuint8(a) - b; }
+
+  //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); }
+  //__forceinline vuint8 operator *(const vuint8& a, unsigned int          b) { return a * vuint8(b); }
+  //__forceinline vuint8 operator *(unsigned int          a, const vuint8& b) { return vuint8(a) * b; }
+
+  __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vuint8 operator &(const vuint8& a, unsigned int          b) { return a & vuint8(b); }
+  __forceinline vuint8 operator &(unsigned int          a, const vuint8& b) { return vuint8(a) & b; }
+
+  __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vuint8 operator |(const vuint8& a, unsigned int          b) { return a | vuint8(b); }
+  __forceinline vuint8 operator |(unsigned int          a, const vuint8& b) { return vuint8(a) | b; }
+
+  __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vuint8 operator ^(const vuint8& a, unsigned int          b) { return a ^ vuint8(b); }
+  __forceinline vuint8 operator ^(unsigned int          a, const vuint8& b) { return vuint8(a) ^ b; }
+
+  __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); }
+  __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srai_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); }
+
+  __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); }
+  __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); }
+  __forceinline vuint8 srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); }
+  
+  __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); }
+  __forceinline vuint8 min(const vuint8& a, unsigned int          b) { return min(a,vuint8(b)); }
+  __forceinline vuint8 min(unsigned int          a, const vuint8& b) { return min(vuint8(a),b); }
+
+  __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); }
+  __forceinline vuint8 max(const vuint8& a, unsigned int          b) { return max(a,vuint8(b)); }
+  __forceinline vuint8 max(unsigned int          a, const vuint8& b) { return max(vuint8(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; }
+  __forceinline vuint8& operator +=(vuint8& a, unsigned int          b) { return a = a + b; }
+  
+  __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; }
+  __forceinline vuint8& operator -=(vuint8& a, unsigned int          b) { return a = a - b; }
+  
+  //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+  //__forceinline vuint8& operator *=(vuint8& a, unsigned int          b) { return a = a * b; }
+  
+  __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; }
+  __forceinline vuint8& operator &=(vuint8& a, unsigned int          b) { return a = a & b; }
+  
+  __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; }
+  __forceinline vuint8& operator |=(vuint8& a, unsigned int          b) { return a = a | b; }
+  
+  __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; }
+  __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)),
+                                                                                       _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); }
+  __forceinline vboolf8 operator ==(const vuint8& a, unsigned int          b) { return a == vuint8(b); }
+  __forceinline vboolf8 operator ==(unsigned int          a, const vuint8& b) { return vuint8(a) == b; }
+  
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); }
+  __forceinline vboolf8 operator !=(const vuint8& a, unsigned int          b) { return a != vuint8(b); }
+  __forceinline vboolf8 operator !=(unsigned int          a, const vuint8& b) { return vuint8(a) != b; }
+  
+  //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)),
+  //                                                                                     _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); }
+  //__forceinline vboolf8 operator < (const vuint8& a, unsigned int          b) { return a <  vuint8(b); }
+  //__forceinline vboolf8 operator < (unsigned int          a, const vuint8& b) { return vuint8(a) <  b; }
+  
+  //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a <  b); }
+  //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int          b) { return a >= vuint8(b); }
+  //__forceinline vboolf8 operator >=(unsigned int          a, const vuint8& b) { return vuint8(a) >= b; }
+
+  //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)),
+  //                                                                                     _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); }
+  //__forceinline vboolf8 operator > (const vuint8& a, unsigned int          b) { return a >  vuint8(b); }
+  //__forceinline vboolf8 operator > (unsigned int          a, const vuint8& b) { return vuint8(a) >  b; }
+
+  //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a >  b); }
+  //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int          b) { return a <= vuint8(b); }
+  //__forceinline vboolf8 operator <=(unsigned int          a, const vuint8& b) { return vuint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; }
+  __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; }
+
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }
+
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); 
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+
+  template<int i>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); }
+  template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); }
+  template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); }
+
+  __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); }
+  __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+  __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+  //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+  //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); }
+
+  //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/simd/vuint8_avx2.h b/thirdparty/embree/common/simd/vuint8_avx2.h
new file mode 100644
index 0000000000..17b994522f
--- /dev/null
+++ b/thirdparty/embree/common/simd/vuint8_avx2.h
@@ -0,0 +1,446 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define vboolf vboolf_impl
+#define vboold vboold_impl
+#define vint vint_impl
+#define vuint vuint_impl
+#define vllong vllong_impl
+#define vfloat vfloat_impl
+#define vdouble vdouble_impl
+
+namespace embree
+{
+  /* 8-wide AVX integer type */
+  template<>
+  struct vuint<8>
+  {
+    ALIGNED_STRUCT_(32);
+        
+    typedef vboolf8 Bool;
+    typedef vuint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m256i v;
+      unsigned int i[8];
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint() {}
+    __forceinline vuint(const vuint8& a) { v = a.v; }
+    __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+    __forceinline vuint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+    __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+ 
+    __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+    __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {}
+
+    __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {}
+#else
+    __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {}
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm256_setzero_si256()) {}
+    __forceinline vuint(OneTy)    : v(_mm256_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {}
+    __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {}
+    __forceinline vuint(StepTy)   : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+    __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint8 load(const unsigned char* ptr)  { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vuint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vuint8 load(const unsigned short* ptr)  { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
+    static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+
+    static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); }
+    static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); }
+
+    static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); }
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) {
+      return _mm256_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) {
+      return _mm256_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); }
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+#else
+    static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+#endif
+    
+    static __forceinline vuint8 load_nt(void* ptr) {
+      return _mm256_stream_load_si256((__m256i*)ptr);
+    }
+
+    static __forceinline void store_nt(void* ptr, const vuint8& v) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline void store(unsigned char* ptr, const vuint8& i)
+    {
+      for (size_t j=0; j<8; j++)
+        ptr[j] = i[j];
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vuint8& v) {
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) {
+      return _mm256_i32gather_epi32((const int*) ptr, index, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) {
+      vuint8 r = zero;
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale);
+#else
+      return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale);
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+      *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); }
+#else
+  __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); }
+#endif
+
+  __forceinline vuint8 operator +(const vuint8& a) { return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); }
+  __forceinline vuint8 operator +(const vuint8& a, unsigned int          b) { return a + vuint8(b); }
+  __forceinline vuint8 operator +(unsigned int          a, const vuint8& b) { return vuint8(a) + b; }
+
+  __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); }
+  __forceinline vuint8 operator -(const vuint8& a, unsigned int          b) { return a - vuint8(b); }
+  __forceinline vuint8 operator -(unsigned int          a, const vuint8& b) { return vuint8(a) - b; }
+
+  //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); }
+  //__forceinline vuint8 operator *(const vuint8& a, unsigned int          b) { return a * vuint8(b); }
+  //__forceinline vuint8 operator *(unsigned int          a, const vuint8& b) { return vuint8(a) * b; }
+
+  __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); }
+  __forceinline vuint8 operator &(const vuint8& a, unsigned int          b) { return a & vuint8(b); }
+  __forceinline vuint8 operator &(unsigned int          a, const vuint8& b) { return vuint8(a) & b; }
+
+  __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); }
+  __forceinline vuint8 operator |(const vuint8& a, unsigned int          b) { return a | vuint8(b); }
+  __forceinline vuint8 operator |(unsigned int          a, const vuint8& b) { return vuint8(a) | b; }
+
+  __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); }
+  __forceinline vuint8 operator ^(const vuint8& a, unsigned int          b) { return a ^ vuint8(b); }
+  __forceinline vuint8 operator ^(unsigned int          a, const vuint8& b) { return vuint8(a) ^ b; }
+
+  __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); }
+  __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); }
+
+  __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); }
+  __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); }
+
+  __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); }
+  __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); }
+  __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); }
+
+  __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); }
+  __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); }
+  __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); }
+  
+  __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); }
+  __forceinline vuint8 min(const vuint8& a, unsigned int          b) { return min(a,vuint8(b)); }
+  __forceinline vuint8 min(unsigned int          a, const vuint8& b) { return min(vuint8(a),b); }
+
+  __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); }
+  __forceinline vuint8 max(const vuint8& a, unsigned int          b) { return max(a,vuint8(b)); }
+  __forceinline vuint8 max(unsigned int          a, const vuint8& b) { return max(vuint8(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; }
+  __forceinline vuint8& operator +=(vuint8& a, unsigned int          b) { return a = a + b; }
+  
+  __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; }
+  __forceinline vuint8& operator -=(vuint8& a, unsigned int          b) { return a = a - b; }
+  
+  //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+  //__forceinline vuint8& operator *=(vuint8& a, unsigned int          b) { return a = a * b; }
+  
+  __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; }
+  __forceinline vuint8& operator &=(vuint8& a, unsigned int          b) { return a = a & b; }
+  
+  __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; }
+  __forceinline vuint8& operator |=(vuint8& a, unsigned int          b) { return a = a | b; }
+  
+  __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; }
+  __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+    return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+  }
+#else
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); }
+  //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); }
+  //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a <  b); }
+  //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); }
+  //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a >  b); }
+
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+  }
+#endif
+
+  template<int mask>
+  __forceinline vuint8 select(const vuint8& t, const vuint8& f) {
+    return _mm256_blend_epi32(f, t, mask);
+  }
+
+  __forceinline vboolf8 operator ==(const vuint8& a, unsigned int          b) { return a == vuint8(b); }
+  __forceinline vboolf8 operator ==(unsigned int          a, const vuint8& b) { return vuint8(a) == b; }
+
+  __forceinline vboolf8 operator !=(const vuint8& a, unsigned int          b) { return a != vuint8(b); }
+  __forceinline vboolf8 operator !=(unsigned int          a, const vuint8& b) { return vuint8(a) != b; }
+
+  //__forceinline vboolf8 operator < (const vuint8& a, unsigned int          b) { return a <  vuint8(b); }
+  //__forceinline vboolf8 operator < (unsigned int          a, const vuint8& b) { return vuint8(a) <  b; }
+
+  //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int          b) { return a >= vuint8(b); }
+  //__forceinline vboolf8 operator >=(unsigned int          a, const vuint8& b) { return vuint8(a) >= b; }
+
+  //__forceinline vboolf8 operator > (const vuint8& a, unsigned int          b) { return a >  vuint8(b); }
+  //__forceinline vboolf8 operator > (unsigned int          a, const vuint8& b) { return vuint8(a) >  b; }
+
+  //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int          b) { return a <= vuint8(b); }
+  //__forceinline vboolf8 operator <=(unsigned int          a, const vuint8& b) { return vuint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; }
+  __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; }
+  //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a <  b; }
+  //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; }
+  //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a >  b; }
+  //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }
+  //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <  b); }
+  //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); }
+  //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >  b); }
+  //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); }
+  __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); }
+
+  template<int i>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); }
+  template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); }
+  template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); }
+
+  __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+
+  __forceinline vuint8 permute(const vuint8& v, const __m256i& index) {
+    return _mm256_permutevar8x32_epi32(v, index);
+  }
+
+  __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) {
+    return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+  }
+
+  template<int i>
+  __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_alignr_epi32(a, b, i);    
+#else
+    return _mm256_alignr_epi8(a, b, 4*i);
+#endif
+  }  
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); }
+  __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+  __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+
+  //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+  //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); }
+
+  //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
+
+#undef vboolf
+#undef vboold
+#undef vint
+#undef vuint
+#undef vllong
+#undef vfloat
+#undef vdouble
diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp
new file mode 100644
index 0000000000..abdd269069
--- /dev/null
+++ b/thirdparty/embree/common/sys/alloc.cpp
@@ -0,0 +1,327 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "mutex.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+  
+namespace embree
+{
+  void* alignedMalloc(size_t size, size_t align) 
+  {
+    if (size == 0)
+      return nullptr;
+    
+    assert((align & (align-1)) == 0);
+    void* ptr = _mm_malloc(size,align);
+
+    if (size != 0 && ptr == nullptr)
+      // -- GODOT start --
+      // throw std::bad_alloc();
+      abort();
+      // -- GODOT end --
+    
+    return ptr;
+  }
+  
+  void alignedFree(void* ptr)
+  {
+    if (ptr)
+      _mm_free(ptr);
+  }
+
+  static bool huge_pages_enabled = false;
+  static MutexSys os_init_mutex;
+
+  __forceinline bool isHugePageCandidate(const size_t bytes) 
+  {
+    if (!huge_pages_enabled)
+      return false;
+
+    /* use huge pages only when memory overhead is low */
+    const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1);
+    return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <malloc.h>
+
+namespace embree
+{
+  bool win_enable_selockmemoryprivilege (bool verbose)
+  {
+    HANDLE hToken;
+    if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) {
+      if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl;
+      return false;
+    }
+
+    TOKEN_PRIVILEGES tp;
+    tp.PrivilegeCount = 1;
+    tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+    if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) {
+      if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl;
+      return false;
+    }
+    
+    SetLastError(ERROR_SUCCESS);
+    if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) {
+      if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl;
+      return false;
+    }
+    
+    if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) {
+      if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl;
+      return false;
+    } 
+
+    return true;
+  }
+
+  bool os_init(bool hugepages, bool verbose) 
+  {
+    Lock<MutexSys> lock(os_init_mutex);
+
+    if (!hugepages) {
+      huge_pages_enabled = false;
+      return true;
+    }
+
+    if (GetLargePageMinimum() != PAGE_SIZE_2M) {
+      huge_pages_enabled = false;
+      return false;
+    }
+
+    huge_pages_enabled = true;
+    return true;
+  }
+
+  void* os_malloc(size_t bytes, bool& hugepages)
+  {
+    if (bytes == 0) {
+      hugepages = false;
+      return nullptr;
+    }
+
+    /* try direct huge page allocation first */
+    if (isHugePageCandidate(bytes)) 
+    {
+      int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES;
+      char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
+      if (ptr != nullptr) {
+        hugepages = true;
+        return ptr;
+      }
+    } 
+
+    /* fall back to 4k pages */
+    int flags = MEM_COMMIT | MEM_RESERVE;
+    char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
+    // -- GODOT start --
+    // if (ptr == nullptr) throw std::bad_alloc();
+    if (ptr == nullptr) abort();
+    // -- GODOT end --
+    hugepages = false;
+    return ptr;
+  }
+
+  size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) 
+  {
+    if (hugepages) // decommitting huge pages seems not to work under Windows
+      return bytesOld;
+
+    const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+    bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1);
+    bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1);
+    if (bytesNew >= bytesOld)
+      return bytesOld;
+
+    if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
+      // -- GODOT start --
+      // throw std::bad_alloc();
+      abort();
+      // -- GODOT end --
+
+    return bytesNew;
+  }
+
+  void os_free(void* ptr, size_t bytes, bool hugepages) 
+  {
+    if (bytes == 0) 
+      return;
+
+    if (!VirtualFree(ptr,0,MEM_RELEASE))
+      // -- GODOT start --
+      // throw std::bad_alloc();
+      abort();
+      // -- GODOT end --
+  }
+
+  void os_advise(void *ptr, size_t bytes)
+  {
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sstream>
+
+#if defined(__MACOSX__)
+#include <mach/vm_statistics.h>
+#endif
+
+namespace embree
+{
+  bool os_init(bool hugepages, bool verbose) 
+  {
+    Lock<MutexSys> lock(os_init_mutex);
+
+    if (!hugepages) {
+      huge_pages_enabled = false;
+      return true;
+    }
+
+#if defined(__LINUX__)
+
+    int hugepagesize = 0;
+
+    std::ifstream file; 
+    file.open("/proc/meminfo",std::ios::in);
+    if (!file.is_open()) {
+      if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" << std::endl;
+      huge_pages_enabled = false;
+      return false;
+    }
+    
+    std::string line;
+    while (getline(file,line))
+    {
+      std::stringstream sline(line);
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string tag; getline(sline,tag,' ');
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string val; getline(sline,val,' ');
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string unit; getline(sline,unit,' ');
+      if (tag == "Hugepagesize:" && unit == "kB") {
+	hugepagesize = std::stoi(val)*1024;
+	break;
+      }
+    }
+    
+    if (hugepagesize != PAGE_SIZE_2M) 
+    {
+      if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl;
+      huge_pages_enabled = false;
+      return false;
+    }
+#endif
+
+    huge_pages_enabled = true;
+    return true;
+  }
+
+  void* os_malloc(size_t bytes, bool& hugepages)
+  { 
+    if (bytes == 0) {
+      hugepages = false;
+      return nullptr;
+    }
+
+    /* try direct huge page allocation first */
+    if (isHugePageCandidate(bytes)) 
+    {
+#if defined(__MACOSX__)
+      void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
+      if (ptr != MAP_FAILED) {
+        hugepages = true;
+        return ptr;
+      }
+#elif defined(MAP_HUGETLB)
+      void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0);
+      if (ptr != MAP_FAILED) {
+        hugepages = true;
+        return ptr;
+      }
+#endif
+    } 
+
+    /* fallback to 4k pages */
+    void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    // -- GODOT start --
+    // if (ptr == MAP_FAILED) throw std::bad_alloc();
+    if (ptr == MAP_FAILED) abort();
+    // -- GODOT end --
+    hugepages = false;
+
+    /* advise huge page hint for THP */
+    os_advise(ptr,bytes);
+    return ptr;
+  }
+
+  size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) 
+  {
+    const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+    bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1);
+    bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1);
+    if (bytesNew >= bytesOld)
+      return bytesOld;
+
+    if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
+      // -- GODOT start --
+      // throw std::bad_alloc();
+      abort();
+      // -- GODOT end --
+
+    return bytesNew;
+  }
+
+  void os_free(void* ptr, size_t bytes, bool hugepages) 
+  {
+    if (bytes == 0)
+      return;
+
+    /* for hugepages we need to also align the size */
+    const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+    bytes = (bytes+pageSize-1) & ~(pageSize-1);
+    if (munmap(ptr,bytes) == -1)
+      // -- GODOT start --
+      // throw std::bad_alloc();
+      abort();
+      // -- GODOT end --
+  }
+
+  /* hint for transparent huge pages (THP) */
+  void os_advise(void* pptr, size_t bytes)
+  {
+#if defined(MADV_HUGEPAGE)
+    madvise(pptr,bytes,MADV_HUGEPAGE); 
+#endif
+  }
+}
+
+#endif
diff --git a/thirdparty/embree/common/sys/alloc.h b/thirdparty/embree/common/sys/alloc.h
new file mode 100644
index 0000000000..4fa474ec1d
--- /dev/null
+++ b/thirdparty/embree/common/sys/alloc.h
@@ -0,0 +1,164 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include <vector>
+#include <set>
+
+namespace embree
+{
+#define ALIGNED_STRUCT_(align)                                           \
+  void* operator new(size_t size) { return alignedMalloc(size,align); } \
+  void operator delete(void* ptr) { alignedFree(ptr); }                 \
+  void* operator new[](size_t size) { return alignedMalloc(size,align); } \
+  void operator delete[](void* ptr) { alignedFree(ptr); }
+
+#define ALIGNED_CLASS_(align)                                           \
+ public:                                                               \
+    ALIGNED_STRUCT_(align)                                              \
+ private:
+  
+  /*! aligned allocation */
+  void* alignedMalloc(size_t size, size_t align);
+  void alignedFree(void* ptr);
+  
+  /*! allocator that performs aligned allocations */
+  template<typename T, size_t alignment>
+    struct aligned_allocator
+    {
+      typedef T value_type;
+      typedef T* pointer;
+      typedef const T* const_pointer;
+      typedef T& reference;
+      typedef const T& const_reference;
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+
+      __forceinline pointer allocate( size_type n ) {
+        return (pointer) alignedMalloc(n*sizeof(value_type),alignment);
+      }
+
+      __forceinline void deallocate( pointer p, size_type n ) {
+        return alignedFree(p);
+      }
+
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+    };
+
+  /*! allocates pages directly from OS */
+  bool win_enable_selockmemoryprivilege(bool verbose);
+  bool os_init(bool hugepages, bool verbose);
+  void* os_malloc (size_t bytes, bool& hugepages);
+  size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages);
+  void  os_free   (void* ptr, size_t bytes, bool hugepages);
+  void  os_advise (void* ptr, size_t bytes);
+
+  /*! allocator that performs OS allocations */
+  template<typename T>
+    struct os_allocator
+    {
+      typedef T value_type;
+      typedef T* pointer;
+      typedef const T* const_pointer;
+      typedef T& reference;
+      typedef const T& const_reference;
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+
+      __forceinline os_allocator () 
+        : hugepages(false) {}
+
+      __forceinline pointer allocate( size_type n ) {
+        return (pointer) os_malloc(n*sizeof(value_type),hugepages);
+      }
+
+      __forceinline void deallocate( pointer p, size_type n ) {
+        return os_free(p,n*sizeof(value_type),hugepages);
+      }
+
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+
+      bool hugepages;
+    };
+
+  /*! allocator for IDs */
+  template<typename T, size_t max_id>
+    struct IDPool
+    {
+      typedef T value_type;
+
+      IDPool ()
+      : nextID(0) {}
+
+      T allocate() 
+      {
+        /* return ID from list */
+        if (!IDs.empty()) 
+        {
+          T id = *IDs.begin();
+          IDs.erase(IDs.begin());
+          return id;
+        } 
+
+        /* allocate new ID */
+        else
+        {
+          if (size_t(nextID)+1 > max_id)
+            return -1;
+          
+          return nextID++;
+        }
+      }
+
+      /* adds an ID provided by the user */
+      bool add(T id)
+      {
+        if (id > max_id)
+          return false;
+        
+        /* check if ID should be in IDs set */
+        if (id < nextID) {
+          auto p = IDs.find(id);
+          if (p == IDs.end()) return false;
+          IDs.erase(p);
+          return true;
+        }
+
+        /* otherwise increase ID set */
+        else
+        {
+          for (T i=nextID; i<id; i++) {
+            IDs.insert(i);
+          }
+          nextID = id+1;
+          return true;
+        }
+      }
+
+      void deallocate( T id ) 
+      {
+        assert(id < nextID);
+        MAYBE_UNUSED auto done = IDs.insert(id).second;
+        assert(done);
+      }
+
+    private:
+      std::set<T> IDs;   //!< stores deallocated IDs to be reused
+      T nextID;          //!< next ID to use when IDs vector is empty
+    };
+}
+
diff --git a/thirdparty/embree/common/sys/array.h b/thirdparty/embree/common/sys/array.h
new file mode 100644
index 0000000000..dd9190c52a
--- /dev/null
+++ b/thirdparty/embree/common/sys/array.h
@@ -0,0 +1,222 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "alloc.h"
+
+namespace embree
+{
+  /*! static array with static size */
+  template<typename T, size_t N>
+    class array_t
+    {
+    public:
+
+      /********************** Iterators  ****************************/
+
+      __forceinline T* begin() const { return items; };
+      __forceinline T* end  () const { return items+N; };
+
+
+      /********************** Capacity ****************************/
+
+      __forceinline bool   empty    () const { return N == 0; }
+      __forceinline size_t size     () const { return N; }
+      __forceinline size_t max_size () const { return N; }
+            
+
+      /******************** Element access **************************/
+
+      __forceinline       T& operator[](size_t i)       { assert(i < N); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; }
+
+      __forceinline       T& at(size_t i)       { assert(i < N); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; }
+
+      __forceinline T& front() const { assert(N > 0); return items[0]; };
+      __forceinline T& back () const { assert(N > 0); return items[N-1]; };
+
+      __forceinline       T* data()       { return items; };
+      __forceinline const T* data() const { return items; };
+
+    private:
+      T items[N];
+    };
+
+  /*! static array with dynamic size */
+  template<typename T, size_t N>
+    class darray_t
+    {
+    public:
+
+      __forceinline darray_t () : M(0) {}
+
+      __forceinline darray_t (const T& v) : M(0) {
+        for (size_t i=0; i<N; i++) items[i] = v;
+      }
+
+      /********************** Iterators  ****************************/
+
+      __forceinline T* begin() const { return items; };
+      __forceinline T* end  () const { return items+M; };
+
+
+      /********************** Capacity ****************************/
+
+      __forceinline bool   empty    () const { return M == 0; }
+      __forceinline size_t size     () const { return M; }
+      __forceinline size_t capacity () const { return N; }
+      __forceinline size_t max_size () const { return N; }
+      
+      void resize(size_t new_size) {
+        assert(new_size < max_size());
+        M = new_size;
+      }
+
+      /******************** Modifiers **************************/
+
+      __forceinline void push_back(const T& v) 
+      {
+        assert(M+1 < max_size());
+        items[M++] = v;
+      }
+
+      __forceinline void pop_back() 
+      {
+        assert(!empty());
+        M--;
+      }
+
+      __forceinline void clear() {
+        M = 0;
+      }
+
+      /******************** Element access **************************/
+
+      __forceinline       T& operator[](size_t i)       { assert(i < M); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; }
+
+      __forceinline       T& at(size_t i)       { assert(i < M); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; }
+
+      __forceinline T& front() const { assert(M > 0); return items[0]; };
+      __forceinline T& back () const { assert(M > 0); return items[M-1]; };
+
+      __forceinline       T* data()       { return items; };
+      __forceinline const T* data() const { return items; };
+
+    private:
+      size_t M;
+      T items[N];
+    };
+
+  /*! dynamic sized array that is allocated on the stack */
+#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N)
+  template<typename Ty, size_t max_stack_bytes>
+    struct __aligned(64) StackArray
+  {
+    __forceinline StackArray (const size_t N)
+      : N(N)
+    {
+      if (N*sizeof(Ty) <= max_stack_bytes) 
+        data = &arr[0];
+      else
+        data = (Ty*) alignedMalloc(N*sizeof(Ty),64); 
+    }
+
+    __forceinline ~StackArray () {
+      if (data != &arr[0]) alignedFree(data);
+    }
+
+    __forceinline operator       Ty* ()       { return data; }
+    __forceinline operator const Ty* () const { return data; }
+
+    __forceinline       Ty& operator[](const int i)       { assert(i>=0 && i<N); return data[i]; }
+    __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; }
+
+    __forceinline       Ty& operator[](const unsigned i)       { assert(i<N); return data[i]; }
+    __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; }
+
+#if defined(__64BIT__)
+    __forceinline       Ty& operator[](const size_t i)       { assert(i<N); return data[i]; }
+    __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; }
+#endif
+
+  private:
+    Ty arr[max_stack_bytes/sizeof(Ty)];
+    Ty* data;
+    size_t N;
+
+  private:
+    StackArray (const StackArray& other) DELETED; // do not implement
+    StackArray& operator= (const StackArray& other) DELETED; // do not implement
+
+  };
+
+  /*! dynamic sized array that is allocated on the stack */
+  template<typename Ty, size_t max_stack_elements, size_t max_total_elements>
+    struct __aligned(64) DynamicStackArray
+  {
+    __forceinline DynamicStackArray ()
+      : data(&arr[0]) {}
+
+    __forceinline ~DynamicStackArray ()
+    {
+      if (!isStackAllocated())
+        delete[] data;
+    }
+
+    __forceinline bool isStackAllocated() const {
+      return data == &arr[0];
+    }
+
+    __forceinline size_t size() const
+    {
+      if (isStackAllocated()) return max_stack_elements;
+      else return max_total_elements;
+    }
+
+    __forceinline void resize(size_t M)
+    {
+      assert(M <= max_total_elements);
+      if (likely(M <= max_stack_elements)) return;
+      if (likely(!isStackAllocated())) return;
+
+      data = new Ty[max_total_elements];
+      
+      for (size_t i=0; i<max_stack_elements; i++)
+        data[i] = arr[i];
+    }
+
+    __forceinline operator       Ty* ()       { return data; }
+    __forceinline operator const Ty* () const { return data; }
+
+    __forceinline       Ty& operator[](const int i)      { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; }
+    __forceinline       Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+
+#if defined(__64BIT__)
+    __forceinline       Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+#endif
+
+    __forceinline DynamicStackArray (const DynamicStackArray& other)
+      : data(&arr[0]) 
+    {
+      for (size_t i=0; i<other.size(); i++)
+        this->operator[] (i) = other[i];
+    }
+     
+    DynamicStackArray& operator= (const DynamicStackArray& other)
+    {
+      for (size_t i=0; i<other.size(); i++)
+        this->operator[] (i) = other[i];
+
+      return *this;
+    }
+
+  private:
+    Ty arr[max_stack_elements];
+    Ty* data;
+  };
+}
diff --git a/thirdparty/embree/common/sys/atomic.h b/thirdparty/embree/common/sys/atomic.h
new file mode 100644
index 0000000000..67af254f36
--- /dev/null
+++ b/thirdparty/embree/common/sys/atomic.h
@@ -0,0 +1,59 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <atomic>
+#include "intrinsics.h"
+
+namespace embree
+{
+/* compiler memory barriers */
+#if defined(__INTEL_COMPILER)
+//#define __memory_barrier() __memory_barrier()
+#elif defined(__GNUC__) || defined(__clang__)
+#  define __memory_barrier() asm volatile("" ::: "memory")
+#elif  defined(_MSC_VER)
+#  define __memory_barrier() _ReadWriteBarrier()
+#endif
+
+  template <typename T>
+    struct atomic : public std::atomic<T>
+  {
+    atomic () {}
+      
+    atomic (const T& a)
+      : std::atomic<T>(a) {}
+
+    atomic (const atomic<T>& a) {
+      this->store(a.load());
+    }
+
+    atomic& operator=(const atomic<T>& other) {
+      this->store(other.load());
+      return *this;
+    }
+  };
+
+  template<typename T>
+    __forceinline void atomic_min(std::atomic<T>& aref, const T& bref)
+  {
+    const T b = bref.load();
+    while (true) {
+      T a = aref.load();
+      if (a <= b) break;
+      if (aref.compare_exchange_strong(a,b)) break;
+    }
+  }
+
+  template<typename T>
+    __forceinline void atomic_max(std::atomic<T>& aref, const T& bref)
+  {
+    const T b = bref.load();
+    while (true) {
+      T a = aref.load();
+      if (a >= b) break;
+      if (aref.compare_exchange_strong(a,b)) break;
+    }
+  }
+}
diff --git a/thirdparty/embree/common/sys/barrier.cpp b/thirdparty/embree/common/sys/barrier.cpp
new file mode 100644
index 0000000000..0c0e39d92d
--- /dev/null
+++ b/thirdparty/embree/common/sys/barrier.cpp
@@ -0,0 +1,289 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "barrier.h"
+#include "condition.h"
+#include "regression.h"
+#include "thread.h"
+
+#if defined (__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  struct BarrierSysImplementation
+  {
+    __forceinline BarrierSysImplementation (size_t N) 
+      : i(0), enterCount(0), exitCount(0), barrierSize(0) 
+    {
+      events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+      events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+      init(N);
+    }
+    
+    __forceinline ~BarrierSysImplementation ()
+    {
+      CloseHandle(events[0]);
+      CloseHandle(events[1]);
+    }
+    
+    __forceinline void init(size_t N) 
+    {
+      barrierSize = N;
+      enterCount.store(N);
+      exitCount.store(N);
+    }
+
+    __forceinline void wait()
+    {
+      /* every thread entering the barrier decrements this count */
+      size_t i0 = i;
+      size_t cnt0 = enterCount--;
+
+      /* all threads except the last one are wait in the barrier */
+      if (cnt0 > 1) 
+      {
+        if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0)
+          THROW_RUNTIME_ERROR("WaitForSingleObjects failed");
+      }
+      
+      /* the last thread starts all threads waiting at the barrier */
+      else 
+      {
+        i = 1-i;
+        enterCount.store(barrierSize);
+        if (SetEvent(events[i0]) == 0)
+          THROW_RUNTIME_ERROR("SetEvent failed");
+      }
+
+      /* every thread leaving the barrier decrements this count */
+      size_t cnt1 = exitCount--;
+
+      /* the last thread that left the barrier resets the event again */
+      if (cnt1 == 1) 
+      {
+        exitCount.store(barrierSize);
+        if (ResetEvent(events[i0]) == 0)
+          THROW_RUNTIME_ERROR("ResetEvent failed");
+      }
+    }
+
+  public:
+    HANDLE events[2];
+    atomic<size_t> i;
+    atomic<size_t> enterCount;
+    atomic<size_t> exitCount;
+    size_t barrierSize;
+  };
+}
+
+#else
+
+namespace embree
+{
+  struct BarrierSysImplementation
+  {
+    __forceinline BarrierSysImplementation (size_t N) 
+      : count(0), barrierSize(0) 
+    {
+      init(N);
+    }
+    
+    __forceinline void init(size_t N) 
+    {
+      assert(count == 0);
+      count = 0;
+      barrierSize = N;
+    }
+
+    __forceinline void wait()
+    {
+      mutex.lock();
+      count++;
+      
+      if (count == barrierSize) {
+        count = 0;
+        cond.notify_all();
+        mutex.unlock();
+        return;
+      }
+      
+      cond.wait(mutex);
+      mutex.unlock();
+      return;
+    }
+
+  public:
+    MutexSys mutex;
+    ConditionSys cond;
+    volatile size_t count;
+    volatile size_t barrierSize;
+  };
+}
+
+#endif
+
+namespace embree
+{
+  BarrierSys::BarrierSys (size_t N) {
+    opaque = new BarrierSysImplementation(N);
+  }
+
+  BarrierSys::~BarrierSys () {
+    delete (BarrierSysImplementation*) opaque;
+  }
+
+  void BarrierSys::init(size_t count) {
+    ((BarrierSysImplementation*) opaque)->init(count);
+  }
+
+  void BarrierSys::wait() {
+    ((BarrierSysImplementation*) opaque)->wait();
+  }
+
+  LinearBarrierActive::LinearBarrierActive (size_t N) 
+    : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0)
+  { 
+    if (N == 0) N = getNumberOfLogicalThreads();
+    init(N);
+  }
+
+  LinearBarrierActive::~LinearBarrierActive() 
+  {
+    delete[] count0;
+    delete[] count1;
+  }
+
+  void LinearBarrierActive::init(size_t N) 
+  {
+    if (threadCount != N) {
+      threadCount = N;
+      if (count0) delete[] count0; count0 = new unsigned char[N];
+      if (count1) delete[] count1; count1 = new unsigned char[N];
+    }
+    mode      = 0;
+    flag0     = 0;
+    flag1     = 0;
+    for (size_t i=0; i<N; i++) count0[i] = 0;
+    for (size_t i=0; i<N; i++) count1[i] = 0;
+  }
+
+  void LinearBarrierActive::wait (const size_t threadIndex)
+  {
+    if (mode == 0)
+    {			
+      if (threadIndex == 0)
+      {	
+        for (size_t i=0; i<threadCount; i++)
+          count1[i] = 0;
+        
+        for (size_t i=1; i<threadCount; i++)
+        {
+          while (likely(count0[i] == 0)) 
+            pause_cpu();
+        }
+        mode  = 1;
+        flag1 = 0;
+        __memory_barrier();
+        flag0 = 1;
+      }			
+      else
+      {					
+        count0[threadIndex] = 1;
+        {
+          while (likely(flag0 == 0))
+            pause_cpu();
+        }
+        
+      }		
+    }					
+    else						
+    {
+      if (threadIndex == 0)
+      {	
+        for (size_t i=0; i<threadCount; i++)
+          count0[i] = 0;
+        
+        for (size_t i=1; i<threadCount; i++)
+        {		
+          while (likely(count1[i] == 0))
+            pause_cpu();
+        }
+        
+        mode  = 0;
+        flag0 = 0;
+        __memory_barrier();
+        flag1 = 1;
+      }			
+      else
+      {					
+        count1[threadIndex] = 1;
+        {
+          while (likely(flag1 == 0))
+            pause_cpu();
+        }
+      }		
+    }					
+  }
+
+  struct barrier_sys_regression_test : public RegressionTest
+  {
+    BarrierSys barrier;
+    std::atomic<size_t> threadID;
+    std::atomic<size_t> numFailed;
+    std::vector<size_t> threadResults;
+
+    barrier_sys_regression_test() 
+      : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0)
+    {
+      registerRegressionTest(this);
+    }
+
+    static void thread_alloc(barrier_sys_regression_test* This)
+    {
+      size_t tid = This->threadID++;
+      for (size_t j=0; j<1000; j++)
+      {
+        This->barrier.wait();
+        This->threadResults[tid] = tid;
+        This->barrier.wait();
+      }
+    }
+    
+    bool run ()
+    {
+      threadID.store(0);
+      numFailed.store(0);
+
+      size_t numThreads = getNumberOfLogicalThreads();
+      threadResults.resize(numThreads);
+      barrier.init(numThreads+1);
+
+      /* create threads */
+      std::vector<thread_t> threads;
+      for (size_t i=0; i<numThreads; i++)
+        threads.push_back(createThread((thread_func)thread_alloc,this));
+
+      /* run test */ 
+      for (size_t i=0; i<1000; i++)
+      {
+        for (size_t i=0; i<numThreads; i++) threadResults[i] = 0;
+        barrier.wait();
+        barrier.wait();
+        for (size_t i=0; i<numThreads; i++) numFailed += threadResults[i] != i;
+      }
+
+      /* destroy threads */
+      for (size_t i=0; i<numThreads; i++)
+        join(threads[i]);
+
+      return numFailed == 0;
+    }
+  };
+
+  barrier_sys_regression_test barrier_sys_regression_test;
+}
+
+
diff --git a/thirdparty/embree/common/sys/barrier.h b/thirdparty/embree/common/sys/barrier.h
new file mode 100644
index 0000000000..37fc036291
--- /dev/null
+++ b/thirdparty/embree/common/sys/barrier.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "atomic.h"
+
+namespace embree
+{
+  /*! system barrier using operating system */
+  class BarrierSys
+  {
+  public:
+
+    /*! construction / destruction */
+    BarrierSys (size_t N = 0);
+    ~BarrierSys ();
+
+  private:
+    /*! class in non-copyable */
+    BarrierSys (const BarrierSys& other) DELETED; // do not implement
+    BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement
+
+  public:
+    /*! intializes the barrier with some number of threads */
+    void init(size_t count);
+
+    /*! lets calling thread wait in barrier */
+    void wait();
+
+  private:
+    void* opaque;
+  };
+
+  /*! fast active barrier using atomitc counter */
+  struct BarrierActive 
+  {
+  public:
+    BarrierActive () 
+      : cntr(0) {}
+    
+    void reset() {
+      cntr.store(0);
+    }
+
+    void wait (size_t numThreads) 
+    {
+      cntr++;
+      while (cntr.load() != numThreads) 
+        pause_cpu();
+    }
+
+  private:
+    std::atomic<size_t> cntr;
+  };
+
+  /*! fast active barrier that does not require initialization to some number of threads */
+  struct BarrierActiveAutoReset
+  {
+  public:
+    BarrierActiveAutoReset () 
+      : cntr0(0), cntr1(0) {}
+
+    void wait (size_t threadCount) 
+    {
+      cntr0.fetch_add(1);
+      while (cntr0 != threadCount) pause_cpu();
+      cntr1.fetch_add(1);
+      while (cntr1 != threadCount) pause_cpu();
+      cntr0.fetch_add(-1);
+      while (cntr0 != 0) pause_cpu();
+      cntr1.fetch_add(-1);
+      while (cntr1 != 0) pause_cpu();
+    }
+
+  private:
+    std::atomic<size_t> cntr0;
+    std::atomic<size_t> cntr1;
+  };
+
+  class LinearBarrierActive
+  {
+  public:
+
+    /*! construction and destruction */
+    LinearBarrierActive (size_t threadCount = 0);
+    ~LinearBarrierActive();
+    
+  private:
+    /*! class in non-copyable */
+    LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement
+    LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement
+
+  public:
+    /*! intializes the barrier with some number of threads */
+    void init(size_t threadCount);
+    
+    /*! thread with threadIndex waits in the barrier */
+    void wait (const size_t threadIndex);
+    
+  private:
+    volatile unsigned char* count0;
+    volatile unsigned char* count1; 
+    volatile unsigned int mode;
+    volatile unsigned int flag0;
+    volatile unsigned int flag1;
+    volatile size_t threadCount;
+  };
+}
+
diff --git a/thirdparty/embree/common/sys/condition.cpp b/thirdparty/embree/common/sys/condition.cpp
new file mode 100644
index 0000000000..606a1d0b04
--- /dev/null
+++ b/thirdparty/embree/common/sys/condition.cpp
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "condition.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  struct ConditionImplementation
+  {
+    __forceinline ConditionImplementation () {
+      InitializeConditionVariable(&cond);
+    }
+
+    __forceinline ~ConditionImplementation () {
+    }
+
+    __forceinline void wait(MutexSys& mutex_in) {
+      SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE);
+    }
+
+    __forceinline void notify_all() {
+      WakeAllConditionVariable(&cond);
+    }
+
+  public:
+    CONDITION_VARIABLE cond;
+  };
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+  struct ConditionImplementation
+  {
+    __forceinline ConditionImplementation () { 
+      if (pthread_cond_init(&cond,nullptr) != 0)
+        THROW_RUNTIME_ERROR("pthread_cond_init failed");
+    }
+    
+    __forceinline ~ConditionImplementation() { 
+      MAYBE_UNUSED bool ok = pthread_cond_destroy(&cond) == 0;
+      assert(ok);
+    }
+    
+    __forceinline void wait(MutexSys& mutex) { 
+      if (pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex) != 0)
+        THROW_RUNTIME_ERROR("pthread_cond_wait failed");
+    }
+    
+    __forceinline void notify_all() { 
+      if (pthread_cond_broadcast(&cond) != 0)
+        THROW_RUNTIME_ERROR("pthread_cond_broadcast failed");
+    }
+    
+  public:
+    pthread_cond_t cond;
+  };
+}
+#endif
+
+namespace embree 
+{
+  ConditionSys::ConditionSys () { 
+    cond = new ConditionImplementation; 
+  }
+
+  ConditionSys::~ConditionSys() { 
+    delete (ConditionImplementation*) cond;
+  }
+
+  void ConditionSys::wait(MutexSys& mutex) { 
+    ((ConditionImplementation*) cond)->wait(mutex);
+  }
+
+  void ConditionSys::notify_all() { 
+    ((ConditionImplementation*) cond)->notify_all();
+  }
+}
diff --git a/thirdparty/embree/common/sys/condition.h b/thirdparty/embree/common/sys/condition.h
new file mode 100644
index 0000000000..557c6e3482
--- /dev/null
+++ b/thirdparty/embree/common/sys/condition.h
@@ -0,0 +1,31 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "mutex.h"
+
+namespace embree
+{
+  class ConditionSys
+  {
+  public:
+    ConditionSys();
+    ~ConditionSys();
+    void wait( class MutexSys& mutex );
+    void notify_all();
+
+    template<typename Predicate>
+      __forceinline void wait( class MutexSys& mutex, const Predicate& pred )
+    {
+      while (!pred()) wait(mutex);
+    }
+
+  private:
+    ConditionSys (const ConditionSys& other) DELETED; // do not implement
+    ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement
+
+  protected:
+    void* cond;
+  };
+}
diff --git a/thirdparty/embree/common/sys/filename.cpp b/thirdparty/embree/common/sys/filename.cpp
new file mode 100644
index 0000000000..f55b224302
--- /dev/null
+++ b/thirdparty/embree/common/sys/filename.cpp
@@ -0,0 +1,138 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "filename.h"
+#include "sysinfo.h"
+
+namespace embree
+{
+#ifdef __WIN32__
+  const char path_sep = '\\';
+#else
+  const char path_sep = '/';
+#endif
+
+  /*! create an empty filename */
+  FileName::FileName () {}
+
+  /*! create a valid filename from a string */
+  FileName::FileName (const char* in) {
+    filename = in;
+    for (size_t i=0; i<filename.size(); i++)
+      if (filename[i] == '\\' || filename[i] == '/')
+        filename[i] = path_sep;
+    while (!filename.empty() && filename[filename.size()-1] == path_sep)
+      filename.resize(filename.size()-1);
+  }
+
+  /*! create a valid filename from a string */
+  FileName::FileName (const std::string& in) {
+    filename = in;
+    for (size_t i=0; i<filename.size(); i++)
+      if (filename[i] == '\\' || filename[i] == '/')
+        filename[i] = path_sep;
+    while (!filename.empty() && filename[filename.size()-1] == path_sep)
+      filename.resize(filename.size()-1);
+  }
+  
+  /*! returns path to home folder */
+  FileName FileName::homeFolder() 
+  {
+#ifdef __WIN32__
+    const char* home = getenv("UserProfile");
+#else
+    const char* home = getenv("HOME");
+#endif
+    if (home) return home;
+    return "";
+  }
+
+  /*! returns path to executable */
+  FileName FileName::executableFolder() {
+    return FileName(getExecutableFileName()).path();
+  }
+
+  /*! returns the path */
+  FileName FileName::path() const {
+    size_t pos = filename.find_last_of(path_sep);
+    if (pos == std::string::npos) return FileName();
+    return filename.substr(0,pos);
+  }
+
+  /*! returns the basename */
+  std::string FileName::base() const {
+    size_t pos = filename.find_last_of(path_sep);
+    if (pos == std::string::npos) return filename;
+    return filename.substr(pos+1);
+  }
+
+  /*! returns the extension */
+  std::string FileName::ext() const {
+    size_t pos = filename.find_last_of('.');
+    if (pos == std::string::npos) return "";
+    return filename.substr(pos+1);
+  }
+
+  /*! returns the extension */
+  FileName FileName::dropExt() const {
+    size_t pos = filename.find_last_of('.');
+    if (pos == std::string::npos) return filename;
+    return filename.substr(0,pos);
+  }
+
+  /*! returns the basename without extension */
+  std::string FileName::name() const {
+    size_t start = filename.find_last_of(path_sep);
+    if (start == std::string::npos) start = 0; else start++;
+    size_t end = filename.find_last_of('.');
+    if (end == std::string::npos || end < start) end = filename.size();
+    return filename.substr(start, end - start);
+  }
+
+  /*! replaces the extension */
+  FileName FileName::setExt(const std::string& ext) const {
+    size_t start = filename.find_last_of(path_sep);
+    if (start == std::string::npos) start = 0; else start++;
+    size_t end = filename.find_last_of('.');
+    if (end == std::string::npos || end < start) return FileName(filename+ext);
+    return FileName(filename.substr(0,end)+ext);
+  }
+
+  /*! adds the extension */
+  FileName FileName::addExt(const std::string& ext) const {
+    return FileName(filename+ext);
+  }
+
+  /*! concatenates two filenames to this/other */
+  FileName FileName::operator +( const FileName& other ) const {
+    if (filename == "") return FileName(other);
+    else return FileName(filename + path_sep + other.filename);
+  }
+
+  /*! concatenates two filenames to this/other */
+  FileName FileName::operator +( const std::string& other ) const {
+    return operator+(FileName(other));
+  }
+
+  /*! removes the base from a filename (if possible) */
+  FileName FileName::operator -( const FileName& base ) const {
+    size_t pos = filename.find_first_of(base);
+    if (pos == std::string::npos) return *this;
+    return FileName(filename.substr(pos+1));
+  }
+
+  /*! == operator */
+  bool operator== (const FileName& a, const FileName& b) {
+    return a.filename == b.filename;
+  }
+  
+  /*! != operator */
+  bool operator!= (const FileName& a, const FileName& b) {
+    return a.filename != b.filename;
+  }
+
+  /*! output operator */
+  std::ostream& operator<<(std::ostream& cout, const FileName& filename) {
+    return cout << filename.filename;
+  }
+}
diff --git a/thirdparty/embree/common/sys/filename.h b/thirdparty/embree/common/sys/filename.h
new file mode 100644
index 0000000000..d5929cd836
--- /dev/null
+++ b/thirdparty/embree/common/sys/filename.h
@@ -0,0 +1,81 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+  /*! Convenience class for handling file names and paths. */
+  class FileName
+  {
+  public:
+
+    /*! create an empty filename */
+    FileName ();
+
+    /*! create a valid filename from a string */
+    FileName (const char* filename);
+
+    /*! create a valid filename from a string */
+    FileName (const std::string& filename);
+    
+    /*! returns path to home folder */
+    static FileName homeFolder();
+
+    /*! returns path to executable */
+    static FileName executableFolder();
+
+    /*! auto convert into a string */
+    operator std::string() const { return filename; }
+
+    /*! returns a string of the filename */
+    const std::string str() const { return filename; }
+
+    /*! returns a c-string of the filename */
+    const char* c_str() const { return filename.c_str(); }
+
+    /*! returns the path of a filename */
+    FileName path() const;
+
+    /*! returns the file of a filename  */
+    std::string base() const;
+
+    /*! returns the base of a filename without extension */
+    std::string name() const;
+
+    /*! returns the file extension */
+    std::string ext() const;
+
+    /*! drops the file extension */
+    FileName dropExt() const;
+
+    /*! replaces the file extension */
+    FileName setExt(const std::string& ext = "") const;
+
+    /*! adds file extension */
+    FileName addExt(const std::string& ext = "") const;
+
+    /*! concatenates two filenames to this/other */
+    FileName operator +( const FileName& other ) const;
+
+    /*! concatenates two filenames to this/other */
+    FileName operator +( const std::string& other ) const;
+
+    /*! removes the base from a filename (if possible) */
+    FileName operator -( const FileName& base ) const;
+
+    /*! == operator */
+    friend bool operator==(const FileName& a, const FileName& b);
+
+    /*! != operator */
+    friend bool operator!=(const FileName& a, const FileName& b);
+
+    /*! output operator */
+    friend std::ostream& operator<<(std::ostream& cout, const FileName& filename);
+   
+  private:
+    std::string filename;
+  };
+}
diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h
new file mode 100644
index 0000000000..ed8dd7d40a
--- /dev/null
+++ b/thirdparty/embree/common/sys/intrinsics.h
@@ -0,0 +1,525 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+#if defined(__WIN32__)
+#include <intrin.h>
+#endif
+
+#if defined(__ARM_NEON)
+#include "../simd/arm/emulation.h"
+#else
+#include <immintrin.h>
+#endif
+
+#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
+  #if !defined(_tzcnt_u32)
+    #define _tzcnt_u32 __tzcnt_u32
+  #endif
+  #if !defined(_tzcnt_u64)
+    #define _tzcnt_u64 __tzcnt_u64
+  #endif
+#endif
+
+#if defined(__LZCNT__)
+  #if !defined(_lzcnt_u32)
+    #define _lzcnt_u32 __lzcnt32
+  #endif
+  #if !defined(_lzcnt_u64)
+    #define _lzcnt_u64 __lzcnt64
+  #endif
+#endif
+
+#if defined(__WIN32__)
+// -- GODOT start --
+#if !defined(NOMINMAX)
+// -- GODOT end --
+#define NOMINMAX
+// -- GODOT start --
+#endif
+#include "windows.h"
+// -- GODOT end --
+#endif
+
+/* normally defined in pmmintrin.h, but we always need this */
+#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+#endif
+
+namespace embree
+{
+  
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+  
+#if defined(__WIN32__)
+  
+  __forceinline size_t read_tsc()  
+  {
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    return (size_t)li.QuadPart;
+  }
+  
+  __forceinline int bsf(int v) {
+#if defined(__AVX2__) 
+    return _tzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
+  }
+  
+  __forceinline unsigned bsf(unsigned v) {
+#if defined(__AVX2__) 
+    return _tzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
+  }
+  
+#if defined(__X86_64__)
+  __forceinline size_t bsf(size_t v) {
+#if defined(__AVX2__) 
+    return _tzcnt_u64(v);
+#else
+    unsigned long r = 0; _BitScanForward64(&r,v); return r;
+#endif
+  }
+#endif
+  
+  __forceinline int bscf(int& v) 
+  {
+    int i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+  
+  __forceinline unsigned bscf(unsigned& v) 
+  {
+    unsigned i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+  
+#if defined(__X86_64__)
+  __forceinline size_t bscf(size_t& v) 
+  {
+    size_t i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+#endif
+  
+  __forceinline int bsr(int v) {
+#if defined(__AVX2__) 
+    return 31 - _lzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanReverse(&r,v); return r;
+#endif
+  }
+  
+  __forceinline unsigned bsr(unsigned v) {
+#if defined(__AVX2__) 
+    return 31 - _lzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanReverse(&r,v); return r;
+#endif
+  }
+  
+#if defined(__X86_64__)
+  __forceinline size_t bsr(size_t v) {
+#if defined(__AVX2__) 
+    return 63 -_lzcnt_u64(v);
+#else
+    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
+#endif
+  }
+#endif
+  
+  __forceinline int lzcnt(const int x)
+  {
+#if defined(__AVX2__)
+    return _lzcnt_u32(x);
+#else
+    if (unlikely(x == 0)) return 32;
+    return 31 - bsr(x);    
+#endif
+  }
+  
+  __forceinline int btc(int v, int i) {
+    long r = v; _bittestandcomplement(&r,i); return r;
+  }
+  
+  __forceinline int bts(int v, int i) {
+    long r = v; _bittestandset(&r,i); return r;
+  }
+  
+  __forceinline int btr(int v, int i) {
+    long r = v; _bittestandreset(&r,i); return r;
+  }
+  
+#if defined(__X86_64__)
+  
+  __forceinline size_t btc(size_t v, size_t i) {
+    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
+  }
+  
+  __forceinline size_t bts(size_t v, size_t i) {
+    __int64 r = v; _bittestandset64(&r,i); return r;
+  }
+  
+  __forceinline size_t btr(size_t v, size_t i) {
+    __int64 r = v; _bittestandreset64(&r,i); return r;
+  }
+  
+#endif
+  
+  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
+    return _InterlockedCompareExchange((volatile long*)p,v,c);
+  }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+  
+#else
+  
+#if defined(__i386__) && defined(__PIC__)
+  
+  __forceinline void __cpuid(int out[4], int op) 
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t"
+                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 
+                  : "0"(op)); 
+  }
+  
+  __forceinline void __cpuid_count(int out[4], int op1, int op2) 
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t"
+                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
+                  : "0" (op1), "2" (op2)); 
+  }
+  
+#elif defined(__X86_ASM__)
+
+  __forceinline void __cpuid(int out[4], int op) {
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); 
+  }
+  
+  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); 
+  }
+  
+#endif
+  
+  __forceinline uint64_t read_tsc()  {
+#if defined(__X86_ASM__)
+    uint32_t high,low;
+    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
+    return (((uint64_t)high) << 32) + (uint64_t)low;
+#else
+    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
+    return 0;
+#endif
+  }
+  
+  __forceinline int bsf(int v) {
+#if defined(__AVX2__) 
+    return _tzcnt_u32(v);
+#elif defined(__X86_ASM__)
+    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_ctz(v);
+#endif
+  }
+  
+#if defined(__64BIT__)
+  __forceinline unsigned bsf(unsigned v) 
+  {
+#if defined(__AVX2__) 
+    return _tzcnt_u32(v);
+#elif defined(__X86_ASM__)
+    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_ctz(v);
+#endif
+  }
+#endif
+  
+  __forceinline size_t bsf(size_t v) {
+#if defined(__AVX2__)
+#if defined(__X86_64__)
+    return _tzcnt_u64(v);
+#else
+    return _tzcnt_u32(v);
+#endif
+#elif defined(__X86_ASM__)
+    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_ctzl(v);
+#endif
+  }
+
+  __forceinline int bscf(int& v) 
+  {
+    int i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+  
+#if defined(__64BIT__)
+  __forceinline unsigned int bscf(unsigned int& v) 
+  {
+    unsigned int i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+#endif
+  
+  __forceinline size_t bscf(size_t& v) 
+  {
+    size_t i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+  
+  __forceinline int bsr(int v) {
+#if defined(__AVX2__) 
+    return 31 - _lzcnt_u32(v);
+#elif defined(__X86_ASM__)
+    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_clz(v) ^ 31;
+#endif
+  }
+  
+#if defined(__64BIT__)
+  __forceinline unsigned bsr(unsigned v) {
+#if defined(__AVX2__) 
+    return 31 - _lzcnt_u32(v);
+#elif defined(__X86_ASM__)
+    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return __builtin_clz(v) ^ 31;
+#endif
+  }
+#endif
+  
+  __forceinline size_t bsr(size_t v) {
+#if defined(__AVX2__)
+#if defined(__X86_64__)
+    return 63 - _lzcnt_u64(v);
+#else
+    return 31 - _lzcnt_u32(v);
+#endif
+#elif defined(__X86_ASM__)
+    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#else
+    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
+#endif
+  }
+  
+  __forceinline int lzcnt(const int x)
+  {
+#if defined(__AVX2__)
+    return _lzcnt_u32(x);
+#else
+    if (unlikely(x == 0)) return 32;
+    return 31 - bsr(x);    
+#endif
+  }
+
+  __forceinline size_t blsr(size_t v) {
+#if defined(__AVX2__) 
+#if defined(__INTEL_COMPILER)
+    return _blsr_u64(v);
+#else
+#if defined(__X86_64__)
+    return __blsr_u64(v);
+#else
+    return __blsr_u32(v);
+#endif
+#endif
+#else
+    return v & (v-1);
+#endif
+  }
+  
+  __forceinline int btc(int v, int i) {
+#if defined(__X86_ASM__)
+    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+#else
+    return (v ^ (1 << i));
+#endif
+  }
+  
+  __forceinline int bts(int v, int i) {
+#if defined(__X86_ASM__)
+    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+    return (v | (v << i));
+#endif
+  }
+  
+  __forceinline int btr(int v, int i) {
+#if defined(__X86_ASM__)
+    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+    return (v & ~(v << i));
+#endif
+  }
+  
+  __forceinline size_t btc(size_t v, size_t i) {
+#if defined(__X86_ASM__)
+    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+#else
+    return (v ^ (1 << i));
+#endif
+  }
+  
+  __forceinline size_t bts(size_t v, size_t i) {
+#if defined(__X86_ASM__)
+    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+    return (v | (v << i));
+#endif
+  }
+  
+  __forceinline size_t btr(size_t v, size_t i) {
+#if defined(__X86_ASM__)
+    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#else
+    return (v & ~(v << i));
+#endif
+  }
+
+  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
+    return __sync_val_compare_and_swap(value, comparand, input);
+  }
+  
+#endif
+  
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+  
+#if defined(__clang__) || defined(__GNUC__)
+#if !defined(_mm_undefined_ps)
+  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
+#endif
+#if !defined(_mm_undefined_si128)
+  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
+#endif
+#if !defined(_mm256_undefined_ps) && defined(__AVX__)
+  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
+#endif
+#if !defined(_mm256_undefined_si256) && defined(__AVX__)
+  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
+#endif
+#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
+  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
+#endif
+#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
+  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
+#endif
+#endif
+
+#if defined(__SSE4_2__)
+  
+  __forceinline int popcnt(int in) {
+    return _mm_popcnt_u32(in);
+  }
+  
+  __forceinline unsigned popcnt(unsigned in) {
+    return _mm_popcnt_u32(in);
+  }
+  
+#if defined(__64BIT__)
+  __forceinline size_t popcnt(size_t in) {
+    return _mm_popcnt_u64(in);
+  }
+#endif
+  
+#endif
+
+#if defined(__X86_ASM__)
+  __forceinline uint64_t rdtsc()
+  {
+    int dummy[4]; 
+    __cpuid(dummy,0); 
+    uint64_t clock = read_tsc(); 
+    __cpuid(dummy,0); 
+    return clock;
+  }
+#endif
+  
+  __forceinline void pause_cpu(const size_t N = 8)
+  {
+    for (size_t i=0; i<N; i++)
+      _mm_pause();    
+  }
+  
+  /* prefetches */
+  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
+  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
+  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
+  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
+  __forceinline void prefetchEX (const void* ptr) {
+#if defined(__INTEL_COMPILER)
+    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
+#else
+    _mm_prefetch((const char*)ptr,_MM_HINT_T0);    
+#endif
+  }
+
+  __forceinline void prefetchL1EX(const void* ptr) { 
+    prefetchEX(ptr); 
+  }
+  
+  __forceinline void prefetchL2EX(const void* ptr) { 
+    prefetchEX(ptr); 
+  }
+#if defined(__AVX2__)
+   __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
+   __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
+#if defined(__X86_64__)
+   __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
+   __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
+#endif
+#endif
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER)
+   __forceinline float mm512_cvtss_f32(__m512 v) {
+     return _mm512_cvtss_f32(v);
+   }
+   __forceinline int mm512_mask2int(__mmask16 k1) {
+     return _mm512_mask2int(k1);
+   }
+   __forceinline __mmask16 mm512_int2mask(int mask) {
+     return _mm512_int2mask(mask);
+   }
+#else
+   __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
+     return _mm_cvtss_f32(_mm512_castps512_ps128(v));
+   }
+   __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
+     return (int)k1;
+   }
+   __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
+     return (__mmask16)mask;
+   }
+#endif
+#endif
+}
diff --git a/thirdparty/embree/common/sys/library.cpp b/thirdparty/embree/common/sys/library.cpp
new file mode 100644
index 0000000000..fc983dffd5
--- /dev/null
+++ b/thirdparty/embree/common/sys/library.cpp
@@ -0,0 +1,83 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "library.h"
+#include "sysinfo.h"
+#include "filename.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /* opens a shared library */
+  lib_t openLibrary(const std::string& file)
+  {
+    std::string fullName = file+".dll";
+    FileName executable = getExecutableFileName();
+    HANDLE handle = LoadLibrary((executable.path() + fullName).c_str());
+    return lib_t(handle);
+  }
+
+  /* returns address of a symbol from the library */
+  void* getSymbol(lib_t lib, const std::string& sym) {
+    return (void*)GetProcAddress(HMODULE(lib),sym.c_str());
+  }
+
+  /* closes the shared library */
+  void closeLibrary(lib_t lib) {
+    FreeLibrary(HMODULE(lib));
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <dlfcn.h>
+
+namespace embree
+{
+  /* opens a shared library */
+  lib_t openLibrary(const std::string& file)
+  {
+#if defined(__MACOSX__)
+    std::string fullName = "lib"+file+".dylib";
+#else
+    std::string fullName = "lib"+file+".so";
+#endif
+    void* lib = dlopen(fullName.c_str(), RTLD_NOW);
+    if (lib) return lib_t(lib);
+    FileName executable = getExecutableFileName();
+    lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW);
+    if (lib == nullptr) {
+      const char* error = dlerror();
+      if (error) { 
+        THROW_RUNTIME_ERROR(error);
+      } else {
+        THROW_RUNTIME_ERROR("could not load library "+executable.str());
+      }
+    }
+    return lib_t(lib);
+  }
+
+  /* returns address of a symbol from the library */
+  void* getSymbol(lib_t lib, const std::string& sym) {
+    return dlsym(lib,sym.c_str());
+  }
+
+  /* closes the shared library */
+  void closeLibrary(lib_t lib) {
+    dlclose(lib);
+  }
+}
+#endif
diff --git a/thirdparty/embree/common/sys/library.h b/thirdparty/embree/common/sys/library.h
new file mode 100644
index 0000000000..67e14d2420
--- /dev/null
+++ b/thirdparty/embree/common/sys/library.h
@@ -0,0 +1,21 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+  /*! type for shared library */
+  typedef struct opaque_lib_t* lib_t;
+
+  /*! loads a shared library */
+  lib_t openLibrary(const std::string& file);
+
+  /*! returns address of a symbol from the library */
+  void* getSymbol(lib_t lib, const std::string& sym);
+
+  /*! unloads a shared library */
+  void closeLibrary(lib_t lib);
+}
diff --git a/thirdparty/embree/common/sys/mutex.cpp b/thirdparty/embree/common/sys/mutex.cpp
new file mode 100644
index 0000000000..789feaf2d8
--- /dev/null
+++ b/thirdparty/embree/common/sys/mutex.cpp
@@ -0,0 +1,57 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "mutex.h"
+#include "regression.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); }
+  MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; }
+  void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); }
+  bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; }
+  void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); }
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+  /*! system mutex using pthreads */
+  MutexSys::MutexSys() 
+  { 
+    mutex = new pthread_mutex_t; 
+    if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0)
+      THROW_RUNTIME_ERROR("pthread_mutex_init failed");
+  }
+  
+  MutexSys::~MutexSys() 
+  { 
+    MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0;
+    assert(ok);
+    delete (pthread_mutex_t*)mutex; 
+  }
+  
+  void MutexSys::lock() 
+  { 
+    if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) 
+      THROW_RUNTIME_ERROR("pthread_mutex_lock failed");
+  }
+  
+  bool MutexSys::try_lock() { 
+    return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0;
+  }
+  
+  void MutexSys::unlock() 
+  { 
+    if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0)
+      THROW_RUNTIME_ERROR("pthread_mutex_unlock failed");
+  }
+};
+#endif
diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h
new file mode 100644
index 0000000000..4cb3626d92
--- /dev/null
+++ b/thirdparty/embree/common/sys/mutex.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "intrinsics.h"
+#include "atomic.h"
+
+namespace embree
+{
+  /*! system mutex */
+  class MutexSys {
+    friend struct ConditionImplementation;
+  public:
+    MutexSys();
+    ~MutexSys();
+
+  private:
+    MutexSys (const MutexSys& other) DELETED; // do not implement
+    MutexSys& operator= (const MutexSys& other) DELETED; // do not implement
+
+  public:
+    void lock();
+    bool try_lock();
+    void unlock();
+
+  protected:
+    void* mutex;
+  };
+
+  /*! spinning mutex */
+  class SpinLock
+  {
+  public:
+ 
+    SpinLock ()
+      : flag(false) {}
+
+    __forceinline bool isLocked() {
+      return flag.load();
+    }
+
+    __forceinline void lock()
+    {
+      while (true) 
+      {
+        while (flag.load()) 
+        {
+          _mm_pause(); 
+          _mm_pause();
+        }
+        
+        bool expected = false;
+        if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire))
+          break;
+      }
+    }
+    
+    __forceinline bool try_lock()
+    {
+      bool expected = false;
+      if (flag.load() != expected) {
+        return false;
+      }
+      return flag.compare_exchange_strong(expected,true,std::memory_order_acquire);
+    }
+
+    __forceinline void unlock() {
+      flag.store(false,std::memory_order_release);
+    }
+    
+    __forceinline void wait_until_unlocked() 
+    {
+      while(flag.load())
+      {
+        _mm_pause(); 
+        _mm_pause();
+      }
+    }
+
+  public:
+    atomic<bool> flag;
+  };
+
+  /*! safe mutex lock and unlock helper */
+  template<typename Mutex> class Lock {
+  public:
+    Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); }
+    Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {}
+    ~Lock() { if (locked) mutex.unlock(); }
+    __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); }
+    __forceinline bool isLocked() const { return locked; }
+  protected:
+    Mutex& mutex;
+    bool locked;
+  };
+}
diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
new file mode 100644
index 0000000000..697e07bb86
--- /dev/null
+++ b/thirdparty/embree/common/sys/platform.h
@@ -0,0 +1,392 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <cstddef>
+#include <cassert>
+#include <cstdlib>
+#include <cstdio>
+#include <memory>
+#include <stdexcept>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <string>
+#include <cstring>
+#include <stdint.h>
+#include <functional>
+
+////////////////////////////////////////////////////////////////////////////////
+/// detect platform
+////////////////////////////////////////////////////////////////////////////////
+
+/* detect 32 or 64 Intel platform */
+#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __X86_64__
+#define __X86_ASM__
+#elif defined(__i386__) || defined(_M_IX86)
+#define __X86_ASM__
+#endif
+
+/* detect 64 bit platform */
+#if defined(__X86_64__) || defined(__aarch64__)
+#define __64BIT__
+#endif
+
+/* detect Linux platform */
+#if defined(linux) || defined(__linux__) || defined(__LINUX__)
+#  if !defined(__LINUX__)
+#     define __LINUX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect FreeBSD platform */
+#if defined(__FreeBSD__) || defined(__FREEBSD__)
+#  if !defined(__FREEBSD__)
+#     define __FREEBSD__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */
+#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__)
+#  if !defined(__WIN32__)
+#     define __WIN32__
+#  endif
+#endif
+
+/* detect Cygwin platform */
+#if defined(__CYGWIN__)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect MAC OS X platform */
+#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__)
+#  if !defined(__MACOSX__)
+#     define __MACOSX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* try to detect other Unix systems */
+#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Macros
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __WIN32__
+#define dll_export __declspec(dllexport)
+#define dll_import __declspec(dllimport)
+#else
+#define dll_export __attribute__ ((visibility ("default")))
+#define dll_import 
+#endif
+
+// -- GODOT start --
+#if defined(__WIN32__) && !defined(__MINGW32__)
+// -- GODOT end --
+#if !defined(__noinline)
+#define __noinline             __declspec(noinline)
+#endif
+//#define __forceinline        __forceinline
+//#define __restrict           __restrict
+#if defined(__INTEL_COMPILER)
+#define __restrict__           __restrict
+#else
+#define __restrict__           //__restrict // causes issues with MSVC
+#endif
+#if !defined(__thread)
+#define __thread               __declspec(thread)
+#endif
+#if !defined(__aligned)
+#define __aligned(...)           __declspec(align(__VA_ARGS__))
+#endif
+//#define __FUNCTION__           __FUNCTION__
+#define debugbreak()           __debugbreak()
+
+#else
+#if !defined(__noinline)
+#define __noinline             __attribute__((noinline))
+#endif
+#if !defined(__forceinline)
+#define __forceinline          inline __attribute__((always_inline))
+#endif
+//#define __restrict             __restrict
+//#define __thread               __thread
+#if !defined(__aligned)
+#define __aligned(...)           __attribute__((aligned(__VA_ARGS__)))
+#endif
+#if !defined(__FUNCTION__)
+#define __FUNCTION__           __PRETTY_FUNCTION__
+#endif
+#define debugbreak()           asm ("int $3")
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+  #define MAYBE_UNUSED __attribute__((unused))
+#else
+  #define MAYBE_UNUSED
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly
+  #define DELETED
+#else
+  #define DELETED  = delete
+#endif
+
+// -- GODOT start --
+#if !defined(likely)
+// -- GODOT end --
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#define   likely(expr) (expr)
+#define unlikely(expr) (expr)
+#else
+#define   likely(expr) __builtin_expect((bool)(expr),true )
+#define unlikely(expr) __builtin_expect((bool)(expr),false)
+#endif
+// -- GODOT start --
+#endif
+// -- GODOT end --
+
+////////////////////////////////////////////////////////////////////////////////
+/// Error handling and debugging
+////////////////////////////////////////////////////////////////////////////////
+
+/* debug printing macros */
+#define STRING(x) #x
+#define TOSTRING(x) STRING(x)
+#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl
+#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl
+#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl
+#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl
+#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+
+#if defined(DEBUG) // only report file and line in debug mode
+  // -- GODOT start --
+  // #define THROW_RUNTIME_ERROR(str)
+  //   throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+  #define THROW_RUNTIME_ERROR(str) \
+    printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+  // -- GODOT end --
+#else
+  // -- GODOT start --
+  // #define THROW_RUNTIME_ERROR(str)
+  //   throw std::runtime_error(str);
+  #define THROW_RUNTIME_ERROR(str) \
+    abort();
+  // -- GODOT end --
+#endif
+
+#define FATAL(x)   THROW_RUNTIME_ERROR(x)
+#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; }
+
+#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented")
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic types
+////////////////////////////////////////////////////////////////////////////////
+
+/* default floating-point type */
+namespace embree {
+  typedef float real;
+}
+
+/* windows does not have ssize_t */
+#if defined(__WIN32__)
+#if defined(__64BIT__)
+typedef int64_t ssize_t;
+#else
+typedef int32_t ssize_t;
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic utility functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline std::string toString(long long value) {
+  return std::to_string(value);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Disable some compiler warnings
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__INTEL_COMPILER)
+//#pragma warning(disable:265 ) // floating-point operation result is out of range
+//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used
+//#pragma warning(disable:869 ) // parameter was never referenced
+//#pragma warning(disable:981 ) // operands are evaluated in unspecified order
+//#pragma warning(disable:1418) // external function definition with no prior declaration
+//#pragma warning(disable:1419) // external declaration in primary source file
+//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable
+//#pragma warning(disable:94  ) // the size of an array must be greater than zero
+//#pragma warning(disable:1599) // declaration hides parameter
+//#pragma warning(disable:424 ) // extra ";" ignored
+#pragma warning(disable:2196) // routine is both "inline" and "noinline"
+//#pragma warning(disable:177 ) // label was declared but never referenced
+//#pragma warning(disable:114 ) // function was referenced but not defined
+//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function
+#pragma warning(disable:15335)  // was not vectorized: vectorization possible but seems inefficient
+#endif
+
+#if defined(_MSC_VER)
+//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union
+#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning)
+//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data
+#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data
+//#pragma warning(disable:4355) // 'this' : used in base member initializer list
+//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch
+//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch
+//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float'
+//#pragma warning(disable:4068) // unknown pragma
+//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
+//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion)
+//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored
+#pragma warning(disable:4503) // decorated name length exceeded, name was truncated
+#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored
+#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used
+
+#  if _MSC_VER < 1910 // prior to Visual studio 2017 (V141)
+#    pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings
+#    pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0
+#  endif
+
+#endif
+
+#if defined(__clang__) && !defined(__INTEL_COMPILER)
+//#pragma clang diagnostic ignored "-Wunknown-pragmas"
+//#pragma clang diagnostic ignored "-Wunused-variable"
+//#pragma clang diagnostic ignored "-Wreorder"
+//#pragma clang diagnostic ignored "-Wmicrosoft"
+//#pragma clang diagnostic ignored "-Wunused-private-field"
+//#pragma clang diagnostic ignored "-Wunused-local-typedef"
+//#pragma clang diagnostic ignored "-Wunused-function"
+//#pragma clang diagnostic ignored "-Wnarrowing"
+//#pragma clang diagnostic ignored "-Wc++11-narrowing"
+//#pragma clang diagnostic ignored "-Wdeprecated-register"
+//#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wpragmas"
+//#pragma GCC diagnostic ignored "-Wnarrowing"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+//#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+//#pragma GCC diagnostic ignored "-Warray-bounds"
+#pragma GCC diagnostic ignored "-Wattributes"
+#pragma GCC diagnostic ignored "-Wmisleading-indentation"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+#if defined(__clang__) && defined(__WIN32__)
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wmicrosoft-cast"
+#pragma clang diagnostic ignored "-Wmicrosoft-enum-value"
+#pragma clang diagnostic ignored "-Wmicrosoft-include"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunknown-pragmas"
+#endif
+
+/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */
+#if defined(__WIN32__) && defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  __pragma(warning (enable:  1478)) // warning: function was declared deprecated
+#elif defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("warning (enable : 1478)") // warning: function was declared deprecated
+#elif defined(__clang__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(__GNUC__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(_MSC_VER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  __pragma(warning (enable : 4996)) // warning: function was declared deprecated
+#endif
+
+/* embree output stream */
+#define embree_ostream std::ostream&
+#define embree_cout std::cout
+#define embree_cout_uniform std::cout
+#define embree_endl std::endl
+  
+////////////////////////////////////////////////////////////////////////////////
+/// Some macros for static profiling
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__GNUC__) 
+#define IACA_SSC_MARK( MARK_ID )						\
+__asm__ __volatile__ (									\
+					  "\n\t  movl $"#MARK_ID", %%ebx"	\
+					  "\n\t  .byte 0x64, 0x67, 0x90"	\
+					  : : : "memory" );
+
+#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B");
+
+#else
+#define IACA_UD_BYTES {__asm _emit 0x0F \
+	__asm _emit 0x0B}
+
+#define IACA_SSC_MARK(x) {__asm  mov ebx, x\
+	__asm  _emit 0x64 \
+	__asm  _emit 0x67 \
+	__asm  _emit 0x90 }
+
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END   __writegsbyte(222, 222);
+
+#endif
+
+#define IACA_START {IACA_UD_BYTES \
+					IACA_SSC_MARK(111)}
+#define IACA_END {IACA_SSC_MARK(222) \
+					IACA_UD_BYTES}
+
+namespace embree
+{
+  template<typename Closure>
+    struct OnScopeExitHelper
+  {
+    OnScopeExitHelper (const Closure f) : active(true), f(f) {}
+    ~OnScopeExitHelper() { if (active) f(); }
+    void deactivate() { active = false; }
+    bool active;
+    const Closure f;
+  };
+  
+  template <typename Closure>
+    OnScopeExitHelper<Closure> OnScopeExit(const Closure f) {
+    return OnScopeExitHelper<Closure>(f);
+  }
+
+#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2)
+#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
+#define ON_SCOPE_EXIT(code)                                             \
+  auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;})
+
+  template<typename Ty>
+    std::unique_ptr<Ty> make_unique(Ty* ptr) {
+    return std::unique_ptr<Ty>(ptr);
+  }
+
+}
diff --git a/thirdparty/embree/common/sys/ref.h b/thirdparty/embree/common/sys/ref.h
new file mode 100644
index 0000000000..c2b56c1908
--- /dev/null
+++ b/thirdparty/embree/common/sys/ref.h
@@ -0,0 +1,122 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "atomic.h"
+
+namespace embree
+{
+  struct NullTy {
+  };
+
+  extern MAYBE_UNUSED NullTy null;
+  
+  class RefCount
+  {
+  public:
+    RefCount(int val = 0) : refCounter(val) {}
+    virtual ~RefCount() {};
+  
+    virtual RefCount* refInc() { refCounter.fetch_add(1); return this; }
+    virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; }
+  private:
+    std::atomic<size_t> refCounter;
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reference to single object
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename Type>
+  class Ref
+  {
+  public:
+    Type* ptr;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Ref() : ptr(nullptr) {}
+    __forceinline Ref(NullTy) : ptr(nullptr) {}
+    __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); }
+    __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; }
+
+    __forceinline Ref(Type* const input) : ptr(input)
+    {
+      if (ptr)
+        ptr->refInc();
+    }
+
+    __forceinline ~Ref()
+    {
+      if (ptr)
+        ptr->refDec();
+    }
+
+    __forceinline Ref& operator =(const Ref& input)
+    {
+      if (input.ptr)
+        input.ptr->refInc();
+      if (ptr)
+        ptr->refDec();
+      ptr = input.ptr;
+      return *this;
+    }
+
+    __forceinline Ref& operator =(Ref&& input)
+    {
+      if (ptr)
+        ptr->refDec();
+      ptr = input.ptr;
+      input.ptr = nullptr;
+      return *this;
+    }
+
+    __forceinline Ref& operator =(Type* const input)
+    {
+      if (input)
+        input->refInc();
+      if (ptr)
+        ptr->refDec();
+      ptr = input;
+      return *this;
+    }
+
+    __forceinline Ref& operator =(NullTy)
+    {
+      if (ptr)
+        ptr->refDec();
+      ptr = nullptr;
+      return *this;
+    }
+
+    __forceinline operator bool() const { return ptr != nullptr; }
+
+    __forceinline const Type& operator  *() const { return *ptr; }
+    __forceinline       Type& operator  *()       { return *ptr; }
+    __forceinline const Type* operator ->() const { return  ptr; }
+    __forceinline       Type* operator ->()       { return  ptr; }
+
+    template<typename TypeOut>
+    __forceinline       Ref<TypeOut> cast()       { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+    template<typename TypeOut>
+    __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+
+    template<typename TypeOut>
+    __forceinline       Ref<TypeOut> dynamicCast()       { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+    template<typename TypeOut>
+    __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+  };
+
+  template<typename Type> __forceinline bool operator < (const Ref<Type>& a, const Ref<Type>& b) { return a.ptr   <  b.ptr;   }
+
+  template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, NullTy            ) { return a.ptr   == nullptr; }
+  template<typename Type> __forceinline bool operator ==(NullTy            , const Ref<Type>& b) { return nullptr == b.ptr;   }
+  template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr   == b.ptr;   }
+
+  template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, NullTy            ) { return a.ptr   != nullptr; }
+  template<typename Type> __forceinline bool operator !=(NullTy            , const Ref<Type>& b) { return nullptr != b.ptr;   }
+  template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr   != b.ptr;   }
+}
diff --git a/thirdparty/embree/common/sys/regression.cpp b/thirdparty/embree/common/sys/regression.cpp
new file mode 100644
index 0000000000..45315b1105
--- /dev/null
+++ b/thirdparty/embree/common/sys/regression.cpp
@@ -0,0 +1,30 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "regression.h"
+
+namespace embree
+{
+  /* registerRegressionTest is invoked from static initializers, thus
+   * we cannot have the regression_tests variable as global static
+   * variable due to issues with static variable initialization
+   * order. */
+  std::vector<RegressionTest*>& get_regression_tests()
+  {
+    static std::vector<RegressionTest*> regression_tests;
+    return regression_tests;
+  } 
+
+  void registerRegressionTest(RegressionTest* test) 
+  {
+    get_regression_tests().push_back(test);
+  }
+
+  RegressionTest* getRegressionTest(size_t index)
+  {
+    if (index >= get_regression_tests().size())
+      return nullptr;
+    
+    return get_regression_tests()[index];
+  }
+}
diff --git a/thirdparty/embree/common/sys/regression.h b/thirdparty/embree/common/sys/regression.h
new file mode 100644
index 0000000000..bb0bb94006
--- /dev/null
+++ b/thirdparty/embree/common/sys/regression.h
@@ -0,0 +1,25 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+#include <vector>
+
+namespace embree
+{
+  /*! virtual interface for all regression tests */
+  struct RegressionTest 
+  { 
+    RegressionTest (std::string name) : name(name) {}
+    virtual bool run() = 0;
+    std::string name;
+  };
+ 
+  /*! registers a regression test */
+  void registerRegressionTest(RegressionTest* test);
+
+  /*! run all regression tests */
+  RegressionTest* getRegressionTest(size_t index);
+}
diff --git a/thirdparty/embree/common/sys/string.cpp b/thirdparty/embree/common/sys/string.cpp
new file mode 100644
index 0000000000..f42fdc8536
--- /dev/null
+++ b/thirdparty/embree/common/sys/string.cpp
@@ -0,0 +1,42 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "string.h"
+
+#include <algorithm>
+#include <ctype.h>
+
+namespace embree
+{
+  char to_lower(char c) { return char(tolower(int(c))); }
+  char to_upper(char c) { return char(toupper(int(c))); }
+  std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; }
+  std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; }
+
+  Vec2f string_to_Vec2f ( std::string str )
+  {
+    size_t next = 0;
+    const float x = std::stof(str,&next); str = str.substr(next+1);
+    const float y = std::stof(str,&next);
+    return Vec2f(x,y);
+  }
+  
+  Vec3f string_to_Vec3f ( std::string str )
+  {
+    size_t next = 0;
+    const float x = std::stof(str,&next); str = str.substr(next+1);
+    const float y = std::stof(str,&next); str = str.substr(next+1);
+    const float z = std::stof(str,&next); 
+    return Vec3f(x,y,z);
+  }
+  
+  Vec4f string_to_Vec4f ( std::string str )
+  {
+    size_t next = 0;
+    const float x = std::stof(str,&next); str = str.substr(next+1);
+    const float y = std::stof(str,&next); str = str.substr(next+1);
+    const float z = std::stof(str,&next); str = str.substr(next+1);
+    const float w = std::stof(str,&next);
+    return Vec4f(x,y,z,w);
+  }
+}
diff --git a/thirdparty/embree/common/sys/string.h b/thirdparty/embree/common/sys/string.h
new file mode 100644
index 0000000000..820076b21c
--- /dev/null
+++ b/thirdparty/embree/common/sys/string.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/vec4.h"
+
+namespace embree
+{
+  class IOStreamStateRestorer 
+  {
+  public:
+    IOStreamStateRestorer(std::ostream& iostream)
+      : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) {
+    }
+
+    ~IOStreamStateRestorer() {
+      iostream.flags(flags);
+      iostream.precision(precision);
+    }
+    
+  private:
+    std::ostream& iostream;
+    std::ios::fmtflags flags;
+    std::streamsize precision;
+  };
+
+  std::string toLowerCase(const std::string& s);
+  std::string toUpperCase(const std::string& s);
+
+  Vec2f string_to_Vec2f ( std::string str );
+  Vec3f string_to_Vec3f ( std::string str );
+  Vec4f string_to_Vec4f ( std::string str );
+}
diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp
new file mode 100644
index 0000000000..f1a59e511e
--- /dev/null
+++ b/thirdparty/embree/common/sys/sysinfo.cpp
@@ -0,0 +1,656 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sysinfo.h"
+#include "intrinsics.h"
+#include "string.h"
+#include "ref.h"
+#if defined(__FREEBSD__)
+#include <sys/cpuset.h>
+#include <pthread_np.h>
+typedef cpuset_t cpu_set_t;
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+
+namespace embree
+{
+  NullTy null;
+  
+  std::string getPlatformName() 
+  {
+#if defined(__LINUX__) && !defined(__64BIT__)
+    return "Linux (32bit)";
+#elif defined(__LINUX__) && defined(__64BIT__)
+    return "Linux (64bit)";
+#elif defined(__FREEBSD__) && !defined(__64BIT__)
+    return "FreeBSD (32bit)";
+#elif defined(__FREEBSD__) && defined(__64BIT__)
+    return "FreeBSD (64bit)";
+#elif defined(__CYGWIN__) && !defined(__64BIT__)
+    return "Cygwin (32bit)";
+#elif defined(__CYGWIN__) && defined(__64BIT__)
+    return "Cygwin (64bit)";
+#elif defined(__WIN32__) && !defined(__64BIT__)
+    return "Windows (32bit)";
+#elif defined(__WIN32__) && defined(__64BIT__)
+    return "Windows (64bit)";
+#elif defined(__MACOSX__) && !defined(__64BIT__)
+    return "Mac OS X (32bit)";
+#elif defined(__MACOSX__) && defined(__64BIT__)
+    return "Mac OS X (64bit)";
+#elif defined(__UNIX__) && !defined(__64BIT__)
+    return "Unix (32bit)";
+#elif defined(__UNIX__) && defined(__64BIT__)
+    return "Unix (64bit)";
+#else
+    return "Unknown";
+#endif
+  }
+
+  std::string getCompilerName()
+  {
+#if defined(__INTEL_COMPILER)
+    int icc_mayor = __INTEL_COMPILER / 100 % 100;
+    int icc_minor = __INTEL_COMPILER % 100;
+    std::string version = "Intel Compiler ";
+    version += toString(icc_mayor);
+    version += "." + toString(icc_minor);
+#if defined(__INTEL_COMPILER_UPDATE)
+    version += "." + toString(__INTEL_COMPILER_UPDATE);
+#endif
+    return version;
+#elif defined(__clang__)
+    return "CLANG " __clang_version__;
+#elif defined (__GNUC__)
+    return "GCC " __VERSION__;
+#elif defined(_MSC_VER)
+    std::string version = toString(_MSC_FULL_VER);
+    version.insert(4,".");
+    version.insert(9,".");
+    version.insert(2,".");
+    return "Visual C++ Compiler " + version;
+#else
+    return "Unknown Compiler";
+#endif
+  }
+
+  std::string getCPUVendor()
+  {
+#if defined(__X86_ASM__)
+    int cpuinfo[4]; 
+    __cpuid (cpuinfo, 0); 
+    int name[4];
+    name[0] = cpuinfo[1];
+    name[1] = cpuinfo[3];
+    name[2] = cpuinfo[2];
+    name[3] = 0;
+    return (char*)name;
+#elif defined(__ARM_NEON)
+    return "ARM";
+#else
+    return "Unknown";
+#endif
+  }
+
+  CPU getCPUModel() 
+  {
+#if defined(__X86_ASM__)
+    if (getCPUVendor() != "GenuineIntel")
+      return CPU::UNKNOWN;
+    
+    int out[4];
+    __cpuid(out, 0);
+    if (out[0] < 1) return CPU::UNKNOWN;
+    __cpuid(out, 1);
+
+    /* please see CPUID documentation for these formulas */
+    uint32_t family_ID          = (out[0] >>  8) & 0x0F;
+    uint32_t extended_family_ID = (out[0] >> 20) & 0xFF;
+    
+    uint32_t model_ID           = (out[0] >>  4) & 0x0F;
+    uint32_t extended_model_ID  = (out[0] >> 16) & 0x0F;
+    
+    uint32_t DisplayFamily = family_ID;
+    if (family_ID == 0x0F)
+      DisplayFamily += extended_family_ID;
+    
+    uint32_t DisplayModel = model_ID;
+    if (family_ID == 0x06 || family_ID == 0x0F)
+      DisplayModel += extended_model_ID << 4;
+
+    uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0);
+
+    // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel)
+    if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE;
+    if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE;
+    if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE;
+    if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE;
+    if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE;
+    if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE;
+    if (DisplayFamily_DisplayModel == 0x068E) return CPU::CORE_KABY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE;
+    if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE;
+    if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL;
+    if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL;
+    if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL;
+    if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL;
+    if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL;
+    if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL;
+    if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL;
+    if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL;
+    if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2;
+    if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2;
+    if (DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1;
+
+    if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL;
+    if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING;
+    
+#elif defined(__ARM_NEON)
+    return CPU::ARM;
+#endif
+    
+    return CPU::UNKNOWN;
+  }
+
+  std::string stringOfCPUModel(CPU model)
+  {
+    switch (model) {
+    case CPU::XEON_ICE_LAKE           : return "Xeon Ice Lake";
+    case CPU::CORE_ICE_LAKE           : return "Core Ice Lake";
+    case CPU::CORE_TIGER_LAKE         : return "Core Tiger Lake";
+    case CPU::CORE_COMET_LAKE         : return "Core Comet Lake";
+    case CPU::CORE_CANNON_LAKE        : return "Core Cannon Lake";
+    case CPU::CORE_KABY_LAKE          : return "Core Kaby Lake";
+    case CPU::XEON_SKY_LAKE           : return "Xeon Sky Lake";
+    case CPU::CORE_SKY_LAKE           : return "Core Sky Lake";
+    case CPU::XEON_PHI_KNIGHTS_MILL   : return "Xeon Phi Knights Mill";
+    case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing";
+    case CPU::XEON_BROADWELL          : return "Xeon Broadwell";
+    case CPU::CORE_BROADWELL          : return "Core Broadwell";
+    case CPU::XEON_HASWELL            : return "Xeon Haswell";
+    case CPU::CORE_HASWELL            : return "Core Haswell";
+    case CPU::XEON_IVY_BRIDGE         : return "Xeon Ivy Bridge";
+    case CPU::CORE_IVY_BRIDGE         : return "Core Ivy Bridge";
+    case CPU::SANDY_BRIDGE            : return "Sandy Bridge";
+    case CPU::NEHALEM                 : return "Nehalem";
+    case CPU::CORE2                   : return "Core2";
+    case CPU::CORE1                   : return "Core";
+    case CPU::ARM                     : return "ARM";
+    case CPU::UNKNOWN                 : return "Unknown CPU";
+    }
+    return "Unknown CPU (error)";
+  }
+
+#if defined(__X86_ASM__)
+  /* constants to access destination registers of CPUID instruction */
+  static const int EAX = 0;
+  static const int EBX = 1;
+  static const int ECX = 2;
+  static const int EDX = 3;
+
+  /* cpuid[eax=1].ecx */
+  static const int CPU_FEATURE_BIT_SSE3   = 1 << 0;
+  static const int CPU_FEATURE_BIT_SSSE3  = 1 << 9;
+  static const int CPU_FEATURE_BIT_FMA3   = 1 << 12;
+  static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19;
+  static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20;
+  //static const int CPU_FEATURE_BIT_MOVBE  = 1 << 22;
+  static const int CPU_FEATURE_BIT_POPCNT = 1 << 23;
+  //static const int CPU_FEATURE_BIT_XSAVE  = 1 << 26;
+  static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27;
+  static const int CPU_FEATURE_BIT_AVX    = 1 << 28;
+  static const int CPU_FEATURE_BIT_F16C   = 1 << 29;
+  static const int CPU_FEATURE_BIT_RDRAND = 1 << 30;
+
+  /* cpuid[eax=1].edx */
+  static const int CPU_FEATURE_BIT_SSE  = 1 << 25;
+  static const int CPU_FEATURE_BIT_SSE2 = 1 << 26;
+
+  /* cpuid[eax=0x80000001].ecx */
+  static const int CPU_FEATURE_BIT_LZCNT = 1 << 5;
+
+  /* cpuid[eax=7,ecx=0].ebx */
+  static const int CPU_FEATURE_BIT_BMI1    = 1 << 3;
+  static const int CPU_FEATURE_BIT_AVX2    = 1 << 5;
+  static const int CPU_FEATURE_BIT_BMI2    = 1 << 8;
+  static const int CPU_FEATURE_BIT_AVX512F = 1 << 16;     // AVX512F  (foundation)
+  static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17;    // AVX512DQ (doubleword and quadword instructions)
+  static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26;    // AVX512PF (prefetch gather/scatter instructions)
+  static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27;    // AVX512ER (exponential and reciprocal instructions)
+  static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28;    // AVX512CD (conflict detection instructions)
+  static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30;    // AVX512BW (byte and word instructions)
+  static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31;    // AVX512VL (vector length extensions)
+  static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21;  // AVX512IFMA (integer fused multiple-add instructions)
+  
+  /* cpuid[eax=7,ecx=0].ecx */
+  static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1;   // AVX512VBMI (vector bit manipulation instructions)
+#endif
+
+#if defined(__X86_ASM__)
+  __noinline int64_t get_xcr0() 
+  {
+// -- GODOT start --
+#if defined (__WIN32__) && !defined (__MINGW32__)
+// -- GODOT end --
+    int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
+    xcr0 = _xgetbv(0);
+    return xcr0;
+#else
+    int xcr0 = 0;
+    __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
+    return xcr0;
+#endif
+  }
+#endif
+
+  int getCPUFeatures()
+  {
+#if defined(__X86_ASM__)
+    /* cache CPU features access */
+    static int cpu_features = 0;
+    if (cpu_features) 
+      return cpu_features;
+
+    /* get number of CPUID leaves */
+    int cpuid_leaf0[4]; 
+    __cpuid(cpuid_leaf0, 0x00000000);
+    unsigned nIds = cpuid_leaf0[EAX];  
+
+    /* get number of extended CPUID leaves */
+    int cpuid_leafe[4]; 
+    __cpuid(cpuid_leafe, 0x80000000);
+    unsigned nExIds = cpuid_leafe[EAX];
+
+    /* get CPUID leaves for EAX = 1,7, and 0x80000001 */
+    int cpuid_leaf_1[4] = { 0,0,0,0 };
+    int cpuid_leaf_7[4] = { 0,0,0,0 };
+    int cpuid_leaf_e1[4] = { 0,0,0,0 };
+    if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001);
+#if _WIN32
+#if _MSC_VER && (_MSC_FULL_VER < 160040219)
+#else
+    if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0);
+#endif
+#else
+    if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0);
+#endif
+    if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001);
+
+    /* detect if OS saves XMM, YMM, and ZMM states */
+    bool xmm_enabled = true;
+    bool ymm_enabled = false;
+    bool zmm_enabled = false;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) {
+      int64_t xcr0 = get_xcr0();
+      xmm_enabled = ((xcr0 & 0x02) == 0x02);                /* checks if xmm are enabled in XCR0 */
+      ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */
+      zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */
+    }
+    if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED;
+    if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED;
+    if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED;
+    
+    if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE   ) cpu_features |= CPU_FEATURE_SSE;
+    if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2  ) cpu_features |= CPU_FEATURE_SSE2;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3  ) cpu_features |= CPU_FEATURE_SSE3;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT;
+    
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX   ) cpu_features |= CPU_FEATURE_AVX;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C  ) cpu_features |= CPU_FEATURE_F16C;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2  ) cpu_features |= CPU_FEATURE_AVX2;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3  ) cpu_features |= CPU_FEATURE_FMA3;
+    if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT;
+    if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1;
+    if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2;
+
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F   ) cpu_features |= CPU_FEATURE_AVX512F;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ  ) cpu_features |= CPU_FEATURE_AVX512DQ;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF  ) cpu_features |= CPU_FEATURE_AVX512PF;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER  ) cpu_features |= CPU_FEATURE_AVX512ER; 
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD  ) cpu_features |= CPU_FEATURE_AVX512CD;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW  ) cpu_features |= CPU_FEATURE_AVX512BW;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL  ) cpu_features |= CPU_FEATURE_AVX512VL;
+    if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI;
+
+    return cpu_features;
+#elif defined(__ARM_NEON)
+    /* emulated features with sse2neon */
+    return CPU_FEATURE_SSE|CPU_FEATURE_SSE2|CPU_FEATURE_XMM_ENABLED;
+#else
+    /* Unknown CPU. */
+    return 0;
+#endif
+  }
+
+  std::string stringOfCPUFeatures(int features)
+  {
+    std::string str;
+    if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM ";
+    if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM ";
+    if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM ";
+    if (features & CPU_FEATURE_SSE   ) str += "SSE ";
+    if (features & CPU_FEATURE_SSE2  ) str += "SSE2 ";
+    if (features & CPU_FEATURE_SSE3  ) str += "SSE3 ";
+    if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 ";
+    if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 ";
+    if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 ";
+    if (features & CPU_FEATURE_POPCNT) str += "POPCNT ";
+    if (features & CPU_FEATURE_AVX   ) str += "AVX ";
+    if (features & CPU_FEATURE_F16C  ) str += "F16C ";
+    if (features & CPU_FEATURE_RDRAND) str += "RDRAND ";
+    if (features & CPU_FEATURE_AVX2  ) str += "AVX2 ";
+    if (features & CPU_FEATURE_FMA3  ) str += "FMA3 ";
+    if (features & CPU_FEATURE_LZCNT ) str += "LZCNT ";
+    if (features & CPU_FEATURE_BMI1  ) str += "BMI1 ";
+    if (features & CPU_FEATURE_BMI2  ) str += "BMI2 ";
+    if (features & CPU_FEATURE_AVX512F) str += "AVX512F ";
+    if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ ";
+    if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF ";
+    if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER ";
+    if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD ";
+    if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW ";
+    if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL ";
+    if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA ";
+    if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI ";
+    return str;
+  }
+  
+  std::string stringOfISA (int isa)
+  {
+    if (isa == SSE) return "SSE";
+    if (isa == SSE2) return "SSE2";
+    if (isa == SSE3) return "SSE3";
+    if (isa == SSSE3) return "SSSE3";
+    if (isa == SSE41) return "SSE4.1";
+    if (isa == SSE42) return "SSE4.2";
+    if (isa == AVX) return "AVX";
+    if (isa == AVX2) return "AVX2";
+    if (isa == AVX512) return "AVX512";
+    return "UNKNOWN";
+  }
+
+  bool hasISA(int features, int isa) {
+    return (features & isa) == isa;
+  }
+  
+  std::string supportedTargetList (int features)
+  {
+    std::string v;
+    if (hasISA(features,SSE)) v += "SSE ";
+    if (hasISA(features,SSE2)) v += "SSE2 ";
+    if (hasISA(features,SSE3)) v += "SSE3 ";
+    if (hasISA(features,SSSE3)) v += "SSSE3 ";
+    if (hasISA(features,SSE41)) v += "SSE4.1 ";
+    if (hasISA(features,SSE42)) v += "SSE4.2 ";
+    if (hasISA(features,AVX)) v += "AVX ";
+    if (hasISA(features,AVXI)) v += "AVXI ";
+    if (hasISA(features,AVX2)) v += "AVX2 ";
+    if (hasISA(features,AVX512)) v += "AVX512 ";
+    return v;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <psapi.h>
+
+namespace embree
+{
+  std::string getExecutableFileName() {
+    char filename[1024];
+    if (!GetModuleFileName(nullptr, filename, sizeof(filename)))
+      return std::string();
+    return std::string(filename);
+  }
+
+  unsigned int getNumberOfLogicalThreads() 
+  {
+    static int nThreads = -1;
+    if (nThreads != -1) return nThreads;
+
+    typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)();
+    typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD);
+    HMODULE hlib = LoadLibrary("Kernel32");
+    GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount");
+    GetActiveProcessorCountFunc      pGetActiveProcessorCount      = (GetActiveProcessorCountFunc)     GetProcAddress(hlib, "GetActiveProcessorCount");
+
+    if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) 
+    {
+      int groups = pGetActiveProcessorGroupCount();
+      int totalProcessors = 0;
+      for (int i = 0; i < groups; i++) 
+        totalProcessors += pGetActiveProcessorCount(i);
+      nThreads = totalProcessors;
+    }
+    else
+    {
+      SYSTEM_INFO sysinfo;
+      GetSystemInfo(&sysinfo);
+      nThreads = sysinfo.dwNumberOfProcessors;
+    }
+    assert(nThreads);
+    return nThreads;
+  }
+
+  int getTerminalWidth() 
+  {
+    HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
+    if (handle == INVALID_HANDLE_VALUE) return 80;
+    CONSOLE_SCREEN_BUFFER_INFO info;
+    memset(&info,0,sizeof(info));
+    GetConsoleScreenBufferInfo(handle, &info);
+    return info.dwSize.X;
+  }
+
+  double getSeconds() 
+  {
+    LARGE_INTEGER freq, val;
+    QueryPerformanceFrequency(&freq);
+    QueryPerformanceCounter(&val);
+    return (double)val.QuadPart / (double)freq.QuadPart;
+  }
+
+  void sleepSeconds(double t) {
+    Sleep(DWORD(1000.0*t));
+  }
+
+  size_t getVirtualMemoryBytes()
+  {
+    PROCESS_MEMORY_COUNTERS info;
+    GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) );
+    return (size_t)info.QuotaPeakPagedPoolUsage;
+  }
+
+  size_t getResidentMemoryBytes()
+  {
+    PROCESS_MEMORY_COUNTERS info;
+    GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) );
+    return (size_t)info.WorkingSetSize;
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__LINUX__)
+
+#include <stdio.h>
+#include <unistd.h>
+
+namespace embree
+{
+  std::string getExecutableFileName() 
+  {
+    std::string pid = "/proc/" + toString(getpid()) + "/exe";
+    char buf[4096];
+    memset(buf,0,sizeof(buf));
+    if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1)
+      return std::string();
+    return std::string(buf);
+  }
+
+  size_t getVirtualMemoryBytes()
+  {
+    size_t virt, resident, shared;
+    std::ifstream buffer("/proc/self/statm");
+    buffer >> virt >> resident >> shared;
+    return virt*sysconf(_SC_PAGE_SIZE);
+  }
+
+  size_t getResidentMemoryBytes()
+  {
+    size_t virt, resident, shared;
+    std::ifstream buffer("/proc/self/statm");
+    buffer >> virt >> resident >> shared;
+    return resident*sysconf(_SC_PAGE_SIZE);
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// FreeBSD Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__FreeBSD__)
+
+#include <sys/sysctl.h>
+
+namespace embree
+{
+  std::string getExecutableFileName()
+  {
+    const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 };
+    char buf[4096];
+    memset(buf,0,sizeof(buf));
+    size_t len = sizeof(buf)-1;
+    if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1)
+      return std::string();
+    return std::string(buf);
+  }
+
+  size_t getVirtualMemoryBytes() {
+    return 0;
+  }
+   
+  size_t getResidentMemoryBytes() {
+    return 0;
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Mac OS X Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MACOSX__)
+
+#include <mach-o/dyld.h>
+
+namespace embree
+{
+  std::string getExecutableFileName()
+  {
+    char buf[4096];
+    uint32_t size = sizeof(buf);
+    if (_NSGetExecutablePath(buf, &size) != 0)
+      return std::string();
+    return std::string(buf);
+  }
+
+  size_t getVirtualMemoryBytes() {
+    return 0;
+  }
+   
+  size_t getResidentMemoryBytes() {
+    return 0;
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+namespace embree
+{
+  unsigned int getNumberOfLogicalThreads() 
+  {
+    static int nThreads = -1;
+    if (nThreads != -1) return nThreads;
+
+// -- GODOT start --
+// #if defined(__MACOSX__)
+#if defined(__MACOSX__) || defined(__ANDROID__)
+// -- GODOT end --
+    nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container
+    assert(nThreads);
+#else
+    cpu_set_t set;
+    if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
+      nThreads = CPU_COUNT(&set);
+#endif
+    
+    assert(nThreads);
+    return nThreads;
+  }
+
+  int getTerminalWidth() 
+  {
+    struct winsize info;
+    if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80;
+    return info.ws_col;
+  }
+
+  double getSeconds() {
+    struct timeval tp; gettimeofday(&tp,nullptr);
+    return double(tp.tv_sec) + double(tp.tv_usec)/1E6;
+  }
+
+  void sleepSeconds(double t) {
+    usleep(1000000.0*t);
+  }
+}
+#endif
+
diff --git a/thirdparty/embree/common/sys/sysinfo.h b/thirdparty/embree/common/sys/sysinfo.h
new file mode 100644
index 0000000000..72351d12e4
--- /dev/null
+++ b/thirdparty/embree/common/sys/sysinfo.h
@@ -0,0 +1,178 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define CACHELINE_SIZE 64
+
+#if !defined(PAGE_SIZE)
+  #define PAGE_SIZE 4096
+#endif
+
+#define PAGE_SIZE_2M (2*1024*1024)
+#define PAGE_SIZE_4K (4*1024)
+
+#include "platform.h"
+
+/* define isa namespace and ISA bitvector */
+#if defined (__AVX512VL__)
+#  define isa avx512
+#  define ISA AVX512
+#  define ISA_STR "AVX512"
+#elif defined (__AVX2__)
+#  define isa avx2
+#  define ISA AVX2
+#  define ISA_STR "AVX2"
+#elif defined(__AVXI__)
+#  define isa avxi
+#  define ISA AVXI
+#  define ISA_STR "AVXI"
+#elif defined(__AVX__)
+#  define isa avx
+#  define ISA AVX
+#  define ISA_STR "AVX"
+#elif defined (__SSE4_2__)
+#  define isa sse42
+#  define ISA SSE42
+#  define ISA_STR "SSE4.2"
+//#elif defined (__SSE4_1__) //  we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11
+//#  define isa sse41
+//#  define ISA SSE41
+//#  define ISA_STR "SSE4.1"
+//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC
+//#  define isa ssse3
+//#  define ISA SSSE3
+//#  define ISA_STR "SSSE3"
+//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang
+//#  define isa sse3
+//#  define ISA SSE3
+//#  define ISA_STR "SSE3"
+#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__)
+#  define isa sse2
+#  define ISA SSE2
+#  define ISA_STR "SSE2"
+#elif defined(__SSE__)
+#  define isa sse
+#  define ISA SSE
+#  define ISA_STR "SSE"
+#else 
+#error Unknown ISA
+#endif
+
+namespace embree
+{
+  enum class CPU
+  {
+    XEON_ICE_LAKE,
+    CORE_ICE_LAKE,
+    CORE_TIGER_LAKE,
+    CORE_COMET_LAKE,
+    CORE_CANNON_LAKE,
+    CORE_KABY_LAKE,
+    XEON_SKY_LAKE,
+    CORE_SKY_LAKE,
+    XEON_PHI_KNIGHTS_MILL,
+    XEON_PHI_KNIGHTS_LANDING,
+    XEON_BROADWELL,
+    CORE_BROADWELL,
+    XEON_HASWELL,
+    CORE_HASWELL,
+    XEON_IVY_BRIDGE,
+    CORE_IVY_BRIDGE,
+    SANDY_BRIDGE,
+    NEHALEM,
+    CORE2,
+    CORE1,
+    ARM,
+    UNKNOWN,
+  };
+  
+  /*! get the full path to the running executable */
+  std::string getExecutableFileName();
+
+  /*! return platform name */
+  std::string getPlatformName();
+
+  /*! get the full name of the compiler */
+  std::string getCompilerName();
+
+  /*! return the name of the CPU */
+  std::string getCPUVendor();
+
+  /*! get microprocessor model */
+  CPU getCPUModel(); 
+
+  /*! converts CPU model into string */
+  std::string stringOfCPUModel(CPU model);
+
+  /*! CPU features */
+  static const int CPU_FEATURE_SSE    = 1 << 0;
+  static const int CPU_FEATURE_SSE2   = 1 << 1;
+  static const int CPU_FEATURE_SSE3   = 1 << 2;
+  static const int CPU_FEATURE_SSSE3  = 1 << 3;
+  static const int CPU_FEATURE_SSE41  = 1 << 4;
+  static const int CPU_FEATURE_SSE42  = 1 << 5; 
+  static const int CPU_FEATURE_POPCNT = 1 << 6;
+  static const int CPU_FEATURE_AVX    = 1 << 7;
+  static const int CPU_FEATURE_F16C   = 1 << 8;
+  static const int CPU_FEATURE_RDRAND = 1 << 9;
+  static const int CPU_FEATURE_AVX2   = 1 << 10;
+  static const int CPU_FEATURE_FMA3   = 1 << 11;
+  static const int CPU_FEATURE_LZCNT  = 1 << 12;
+  static const int CPU_FEATURE_BMI1   = 1 << 13;
+  static const int CPU_FEATURE_BMI2   = 1 << 14;
+  static const int CPU_FEATURE_AVX512F = 1 << 16;
+  static const int CPU_FEATURE_AVX512DQ = 1 << 17;    
+  static const int CPU_FEATURE_AVX512PF = 1 << 18;
+  static const int CPU_FEATURE_AVX512ER = 1 << 19;
+  static const int CPU_FEATURE_AVX512CD = 1 << 20;
+  static const int CPU_FEATURE_AVX512BW = 1 << 21;
+  static const int CPU_FEATURE_AVX512VL = 1 << 22;
+  static const int CPU_FEATURE_AVX512IFMA = 1 << 23;
+  static const int CPU_FEATURE_AVX512VBMI = 1 << 24;
+  static const int CPU_FEATURE_XMM_ENABLED = 1 << 25;
+  static const int CPU_FEATURE_YMM_ENABLED = 1 << 26;
+  static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27;
+ 
+  /*! get CPU features */
+  int getCPUFeatures();
+
+  /*! convert CPU features into a string */
+  std::string stringOfCPUFeatures(int features);
+
+  /*! creates a string of all supported targets that are supported */
+  std::string supportedTargetList (int isa);
+
+  /*! ISAs */
+  static const int SSE    = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; 
+  static const int SSE2   = SSE | CPU_FEATURE_SSE2;
+  static const int SSE3   = SSE2 | CPU_FEATURE_SSE3;
+  static const int SSSE3  = SSE3 | CPU_FEATURE_SSSE3;
+  static const int SSE41  = SSSE3 | CPU_FEATURE_SSE41;
+  static const int SSE42  = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT;
+  static const int AVX    = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED;
+  static const int AVXI   = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND;
+  static const int AVX2   = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT;
+  static const int AVX512 = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED;
+
+  /*! converts ISA bitvector into a string */
+  std::string stringOfISA(int features);
+
+  /*! return the number of logical threads of the system */
+  unsigned int getNumberOfLogicalThreads();
+
+  /*! returns the size of the terminal window in characters */
+  int getTerminalWidth();
+
+  /*! returns performance counter in seconds */
+  double getSeconds();
+
+  /*! sleeps the specified number of seconds */
+  void sleepSeconds(double t);
+
+  /*! returns virtual address space occupied by process */
+  size_t getVirtualMemoryBytes();
+
+  /*! returns resident memory required by process */
+  size_t getResidentMemoryBytes();
+}
diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp
new file mode 100644
index 0000000000..f4014be89b
--- /dev/null
+++ b/thirdparty/embree/common/sys/thread.cpp
@@ -0,0 +1,474 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "thread.h"
+#include "sysinfo.h"
+#include "string.h"
+
+#include <iostream>
+#if defined(__ARM_NEON)
+#include "../simd/arm/emulation.h"
+#else
+#include <xmmintrin.h>
+#endif
+
+#if defined(PTHREADS_WIN32)
+#pragma comment (lib, "pthreadVC.lib")
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /*! set the affinity of a given thread */
+  void setAffinity(HANDLE thread, ssize_t affinity)
+  {
+    typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)();
+    typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD);
+    typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY);
+    typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER);
+    HMODULE hlib = LoadLibrary("Kernel32");
+    GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount");
+    GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount");
+    SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity");
+    SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx");
+    if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) 
+    {
+      int groups = pGetActiveProcessorGroupCount();
+      int totalProcessors = 0, group = 0, number = 0;
+      for (int i = 0; i<groups; i++) {
+        int processors = pGetActiveProcessorCount(i);
+        if (totalProcessors + processors > affinity) {
+          group = i;
+          number = (int)affinity - totalProcessors;
+          break;
+        }
+        totalProcessors += processors;
+      }
+  
+      GROUP_AFFINITY groupAffinity;
+      groupAffinity.Group = (WORD)group;
+      groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number);
+      groupAffinity.Reserved[0] = 0;
+      groupAffinity.Reserved[1] = 0;
+      groupAffinity.Reserved[2] = 0;
+      if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr))
+        WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning
+  
+      PROCESSOR_NUMBER processorNumber;
+      processorNumber.Group = group;
+      processorNumber.Number = number;
+      processorNumber.Reserved = 0;
+      if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr))
+        WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning
+    } 
+    else 
+    {
+      if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity)))
+        WARNING("SetThreadAffinityMask failed"); // on purpose only a warning
+      if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1)
+        WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning
+      }
+  }
+
+  /*! set affinity of the calling thread */
+  void setAffinity(ssize_t affinity) {
+    setAffinity(GetCurrentThread(), affinity);
+  }
+
+  struct ThreadStartupData 
+  {
+  public:
+    ThreadStartupData (thread_func f, void* arg) 
+      : f(f), arg(arg) {}
+  public:
+    thread_func f;
+    void* arg;
+  };
+
+  DWORD WINAPI threadStartup(LPVOID ptr)
+  {
+    ThreadStartupData* parg = (ThreadStartupData*) ptr;
+    _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
+    parg->f(parg->arg);
+    delete parg;
+    return 0;
+  }
+
+#if !defined(PTHREADS_WIN32)
+
+  /*! creates a hardware thread running on specific core */
+  thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID)
+  {
+    HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr);
+    if (thread == nullptr) FATAL("CreateThread failed");
+    if (threadID >= 0) setAffinity(thread, threadID);
+    return thread_t(thread);
+  }
+
+  /*! the thread calling this function gets yielded */
+  void yield() {
+    SwitchToThread();
+  }
+
+  /*! waits until the given thread has terminated */
+  void join(thread_t tid) {
+    WaitForSingleObject(HANDLE(tid), INFINITE);
+    CloseHandle(HANDLE(tid));
+  }
+
+  /*! destroy a hardware thread by its handle */
+  void destroyThread(thread_t tid) {
+    TerminateThread(HANDLE(tid),0);
+    CloseHandle(HANDLE(tid));
+  }
+
+  /*! creates thread local storage */
+  tls_t createTls() {
+    return tls_t(size_t(TlsAlloc()));
+  }
+
+  /*! set the thread local storage pointer */
+  void setTls(tls_t tls, void* const ptr) {
+    TlsSetValue(DWORD(size_t(tls)), ptr);
+  }
+
+  /*! return the thread local storage pointer */
+  void* getTls(tls_t tls) {
+    return TlsGetValue(DWORD(size_t(tls)));
+  }
+
+  /*! destroys thread local storage identifier */
+  void destroyTls(tls_t tls) {
+    TlsFree(DWORD(size_t(tls)));
+  }
+#endif
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+// -- GODOT start --
+#if defined(__LINUX__) && !defined(__ANDROID__)
+// -- GODOT end --
+
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+
+namespace embree
+{
+  static MutexSys mutex;
+  static std::vector<size_t> threadIDs;
+  
+  /* changes thread ID mapping such that we first fill up all thread on one core */
+  size_t mapThreadID(size_t threadID)
+  {
+    Lock<MutexSys> lock(mutex);
+    
+    if (threadIDs.size() == 0)
+    {
+      /* parse thread/CPU topology */
+      for (size_t cpuID=0;;cpuID++)
+      {
+        std::fstream fs;
+        std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list");
+        fs.open (cpu.c_str(), std::fstream::in);
+        if (fs.fail()) break;
+
+        int i;
+        while (fs >> i) 
+        {
+          if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; }))
+            threadIDs.push_back(i);
+          if (fs.peek() == ',') 
+            fs.ignore();
+        }
+        fs.close();
+      }
+
+#if 0
+      for (size_t i=0;i<threadIDs.size();i++)
+        std::cout << i << " -> " << threadIDs[i] << std::endl;
+#endif
+
+      /* verify the mapping and do not use it if the mapping has errors */
+      for (size_t i=0;i<threadIDs.size();i++) {
+        for (size_t j=0;j<threadIDs.size();j++) {
+          if (i != j && threadIDs[i] == threadIDs[j]) {
+            threadIDs.clear();
+          }
+        }
+      }
+    }
+
+    /* re-map threadIDs if mapping is available */
+    size_t ID = threadID;
+    if (threadID < threadIDs.size())
+      ID = threadIDs[threadID];
+
+    /* find correct thread to affinitize to */
+    cpu_set_t set;
+    if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
+    {
+      for (int i=0, j=0; i<CPU_SETSIZE; i++)
+      {
+        if (!CPU_ISSET(i,&set)) continue;
+
+        if (j == ID) {
+          ID = i;
+          break;
+        }
+        j++;
+      }
+    }
+
+    return ID;
+  }
+
+  /*! set affinity of the calling thread */
+  void setAffinity(ssize_t affinity)
+  {
+    cpu_set_t cset;
+    CPU_ZERO(&cset);
+    size_t threadID = mapThreadID(affinity);
+    CPU_SET(threadID, &cset);
+
+    pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
+  }
+}
+#endif
+
+// -- GODOT start --
+////////////////////////////////////////////////////////////////////////////////
+/// Android Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__ANDROID__)
+
+namespace embree
+{
+  /*! set affinity of the calling thread */
+  void setAffinity(ssize_t affinity)
+  {
+    cpu_set_t cset;
+    CPU_ZERO(&cset);
+    CPU_SET(affinity, &cset);
+
+    sched_setaffinity(0, sizeof(cset), &cset);
+  }
+}
+#endif
+// -- GODOT end --
+
+////////////////////////////////////////////////////////////////////////////////
+/// FreeBSD Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__FreeBSD__)
+
+#include <pthread_np.h>
+
+namespace embree
+{
+  /*! set affinity of the calling thread */
+  void setAffinity(ssize_t affinity)
+  {
+    cpuset_t cset;
+    CPU_ZERO(&cset);
+    CPU_SET(affinity, &cset);
+
+    pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// MacOSX Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MACOSX__)
+
+#include <mach/thread_act.h>
+#include <mach/thread_policy.h>
+#include <mach/mach_init.h>
+
+namespace embree
+{
+  /*! set affinity of the calling thread */
+  void setAffinity(ssize_t affinity)
+  {
+#if !defined(__ARM_NEON) // affinity seems not supported on M1 chip
+    
+    thread_affinity_policy ap;
+    ap.affinity_tag = affinity;
+    if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS)
+      WARNING("setting thread affinity failed"); // on purpose only a warning
+    
+#endif
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+
+#include <pthread.h>
+#include <sched.h>
+
+#if defined(__USE_NUMA__)
+#include <numa.h>
+#endif
+
+namespace embree
+{
+  struct ThreadStartupData 
+  {
+  public:
+    ThreadStartupData (thread_func f, void* arg, int affinity) 
+      : f(f), arg(arg), affinity(affinity) {}
+  public: 
+    thread_func f;
+    void* arg;
+    ssize_t affinity;
+  };
+  
+  static void* threadStartup(ThreadStartupData* parg)
+  {
+    _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
+    
+    /*! Mac OS X does not support setting affinity at thread creation time */
+#if defined(__MACOSX__)
+    if (parg->affinity >= 0)
+	setAffinity(parg->affinity);
+#endif
+
+    parg->f(parg->arg);
+    delete parg;
+    return nullptr;
+  }
+
+  /*! creates a hardware thread running on specific core */
+  thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID)
+  {
+    /* set stack size */
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size);
+
+    /* create thread */
+    pthread_t* tid = new pthread_t;
+    if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) {
+      pthread_attr_destroy(&attr);
+      delete tid; 
+      FATAL("pthread_create failed");
+    }
+    pthread_attr_destroy(&attr);
+
+    /* set affinity */
+// -- GODOT start --
+#if defined(__LINUX__) && !defined(__ANDROID__)
+// -- GODOT end --
+    if (threadID >= 0) {
+      cpu_set_t cset;
+      CPU_ZERO(&cset);
+      threadID = mapThreadID(threadID);
+      CPU_SET(threadID, &cset);
+      pthread_setaffinity_np(*tid, sizeof(cset), &cset);
+    }
+#elif defined(__FreeBSD__)
+    if (threadID >= 0) {
+      cpuset_t cset;
+      CPU_ZERO(&cset);
+      CPU_SET(threadID, &cset);
+      pthread_setaffinity_np(*tid, sizeof(cset), &cset);
+    }
+// -- GODOT start --
+#elif defined(__ANDROID__)
+    if (threadID >= 0) {
+      cpu_set_t cset;
+      CPU_ZERO(&cset);
+      CPU_SET(threadID, &cset);
+      sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset);
+    }
+#endif
+// -- GODOT end --
+
+    return thread_t(tid);
+  }
+
+  /*! the thread calling this function gets yielded */
+  void yield() {
+    sched_yield();
+  }
+
+  /*! waits until the given thread has terminated */
+  void join(thread_t tid) {
+    if (pthread_join(*(pthread_t*)tid, nullptr) != 0)
+      FATAL("pthread_join failed");
+    delete (pthread_t*)tid;
+  }
+
+  /*! destroy a hardware thread by its handle */
+  void destroyThread(thread_t tid) {
+// -- GODOT start --
+#if defined(__ANDROID__)
+    FATAL("Can't destroy threads on Android.");
+#else
+    pthread_cancel(*(pthread_t*)tid);
+    delete (pthread_t*)tid;
+#endif
+// -- GODOT end --
+  }
+
+  /*! creates thread local storage */
+  tls_t createTls() 
+  {
+    pthread_key_t* key = new pthread_key_t;
+    if (pthread_key_create(key,nullptr) != 0) {
+      delete key;
+      FATAL("pthread_key_create failed");
+    }
+
+    return tls_t(key);
+  }
+
+  /*! return the thread local storage pointer */
+  void* getTls(tls_t tls) 
+  {
+    assert(tls);
+    return pthread_getspecific(*(pthread_key_t*)tls);
+  }
+
+  /*! set the thread local storage pointer */
+  void setTls(tls_t tls, void* const ptr) 
+  {
+    assert(tls);
+    if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0)
+      FATAL("pthread_setspecific failed");
+  }
+
+  /*! destroys thread local storage identifier */
+  void destroyTls(tls_t tls) 
+  {
+    assert(tls);
+    if (pthread_key_delete(*(pthread_key_t*)tls) != 0)
+      FATAL("pthread_key_delete failed");
+    delete (pthread_key_t*)tls;
+  }
+}
+
+#endif
diff --git a/thirdparty/embree/common/sys/thread.h b/thirdparty/embree/common/sys/thread.h
new file mode 100644
index 0000000000..92a10d5c5d
--- /dev/null
+++ b/thirdparty/embree/common/sys/thread.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "mutex.h"
+#include "alloc.h"
+#include "vector.h"
+#include <vector>
+
+namespace embree
+{
+  /*! type for thread */
+  typedef struct opaque_thread_t* thread_t;
+
+  /*! signature of thread start function */
+  typedef void (*thread_func)(void*);
+
+  /*! creates a hardware thread running on specific logical thread */
+  thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1);
+
+  /*! set affinity of the calling thread */
+  void setAffinity(ssize_t affinity);
+
+  /*! the thread calling this function gets yielded */
+  void yield();
+
+  /*! waits until the given thread has terminated */
+  void join(thread_t tid);
+
+  /*! destroy handle of a thread */
+  void destroyThread(thread_t tid);
+
+  /*! type for handle to thread local storage */
+  typedef struct opaque_tls_t* tls_t;
+
+  /*! creates thread local storage */
+  tls_t createTls();
+
+  /*! set the thread local storage pointer */
+  void setTls(tls_t tls, void* const ptr);
+
+  /*! return the thread local storage pointer */
+  void* getTls(tls_t tls);
+
+  /*! destroys thread local storage identifier */
+  void destroyTls(tls_t tls);
+}
diff --git a/thirdparty/embree/common/sys/vector.h b/thirdparty/embree/common/sys/vector.h
new file mode 100644
index 0000000000..f832626789
--- /dev/null
+++ b/thirdparty/embree/common/sys/vector.h
@@ -0,0 +1,242 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "alloc.h"
+#include <algorithm>
+
+namespace embree
+{
+   template<typename T, typename allocator>
+    class vector_t
+    {
+    public:
+      typedef T value_type;
+      typedef T* iterator;
+      typedef const T* const_iterator;
+    
+      __forceinline vector_t () 
+        : size_active(0), size_alloced(0), items(nullptr) {}
+    
+      __forceinline explicit vector_t (size_t sz) 
+        : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); }
+    
+      template<typename M>
+      __forceinline explicit vector_t (M alloc, size_t sz) 
+      : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); }
+    
+      __forceinline ~vector_t() {
+        clear();
+      }
+    
+      __forceinline vector_t (const vector_t& other)
+      {
+        size_active = other.size_active;
+        size_alloced = other.size_alloced;
+        items = alloc.allocate(size_alloced);
+        for (size_t i=0; i<size_active; i++) 
+          ::new (&items[i]) value_type(other.items[i]);
+      }
+    
+      __forceinline vector_t (vector_t&& other)
+        : alloc(std::move(other.alloc))
+      {
+        size_active = other.size_active; other.size_active = 0;
+        size_alloced = other.size_alloced; other.size_alloced = 0;
+        items = other.items; other.items = nullptr;
+      }
+
+      __forceinline vector_t& operator=(const vector_t& other) 
+      {
+        resize(other.size_active);
+        for (size_t i=0; i<size_active; i++)
+          items[i] = value_type(other.items[i]);
+        return *this;
+      }
+
+      __forceinline vector_t& operator=(vector_t&& other) 
+      {
+        clear();
+        alloc = std::move(other.alloc);
+        size_active = other.size_active; other.size_active = 0;
+        size_alloced = other.size_alloced; other.size_alloced = 0;
+        items = other.items; other.items = nullptr;
+        return *this;
+      }
+
+      /********************** Iterators  ****************************/
+    
+      __forceinline       iterator begin()       { return items; };
+      __forceinline const_iterator begin() const { return items; };
+
+      __forceinline       iterator end  ()       { return items+size_active; };
+      __forceinline const_iterator end  () const { return items+size_active; };
+
+
+      /********************** Capacity ****************************/
+
+      __forceinline bool   empty    () const { return size_active == 0; }
+      __forceinline size_t size     () const { return size_active; }
+      __forceinline size_t capacity () const { return size_alloced; }
+
+
+      __forceinline void resize(size_t new_size) {
+        internal_resize(new_size,internal_grow_size(new_size));
+      }
+
+      __forceinline void reserve(size_t new_alloced) 
+      {
+        /* do nothing if container already large enough */
+        if (new_alloced <= size_alloced) 
+          return;
+
+        /* resize exact otherwise */
+        internal_resize(size_active,new_alloced);
+      }
+
+      __forceinline void shrink_to_fit() {
+        internal_resize(size_active,size_active);
+      }
+
+      /******************** Element access **************************/
+
+      __forceinline       T& operator[](size_t i)       { assert(i < size_active); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < size_active); return items[i]; }
+
+      __forceinline       T& at(size_t i)       { assert(i < size_active); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < size_active); return items[i]; }
+
+      __forceinline T& front() const { assert(size_active > 0); return items[0]; };
+      __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; };
+
+      __forceinline       T* data()       { return items; };
+      __forceinline const T* data() const { return items; };
+
+     
+      /******************** Modifiers **************************/
+
+      __forceinline void push_back(const T& nt) 
+      {
+        const T v = nt; // need local copy as input reference could point to this vector
+        internal_resize(size_active,internal_grow_size(size_active+1));
+        ::new (&items[size_active++]) T(v);
+      }
+
+      __forceinline void pop_back() 
+      {
+        assert(!empty());
+        size_active--;
+        alloc.destroy(&items[size_active]);
+      }
+
+      __forceinline void clear() 
+      {
+        /* destroy elements */
+        for (size_t i=0; i<size_active; i++)
+          alloc.destroy(&items[i]);
+        
+        /* free memory */
+        alloc.deallocate(items,size_alloced); 
+        items = nullptr;
+        size_active = size_alloced = 0;
+      }
+
+    /******************** Comparisons **************************/
+    
+    friend bool operator== (const vector_t& a, const vector_t& b) 
+    {
+      if (a.size() != b.size()) return false;
+      for (size_t i=0; i<a.size(); i++)
+        if (a[i] != b[i])
+          return false;
+      return true;
+    }
+
+    friend bool operator!= (const vector_t& a, const vector_t& b) {
+      return !(a==b);
+    }
+
+    private:
+
+      __forceinline void internal_resize_init(size_t new_active)
+      {
+        assert(size_active == 0); 
+        assert(size_alloced == 0);
+        assert(items == nullptr);
+        if (new_active == 0) return;
+        items = alloc.allocate(new_active);
+        for (size_t i=0; i<new_active; i++) ::new (&items[i]) T();
+        size_active = new_active;
+        size_alloced = new_active;
+      }
+
+      __forceinline void internal_resize(size_t new_active, size_t new_alloced)
+      {
+        assert(new_active <= new_alloced); 
+
+        /* destroy elements */
+        if (new_active < size_active) 
+        {
+          for (size_t i=new_active; i<size_active; i++)
+            alloc.destroy(&items[i]);
+          size_active = new_active;
+        }
+
+        /* only reallocate if necessary */
+        if (new_alloced == size_alloced) {
+          for (size_t i=size_active; i<new_active; i++) ::new (&items[i]) T;
+          size_active = new_active;
+          return;
+        }
+
+        /* reallocate and copy items */
+        T* old_items = items;
+        items = alloc.allocate(new_alloced);
+        for (size_t i=0; i<size_active; i++) {
+          ::new (&items[i]) T(std::move(old_items[i]));
+          alloc.destroy(&old_items[i]);
+        }
+
+        for (size_t i=size_active; i<new_active; i++) {
+          ::new (&items[i]) T;
+        }
+
+        alloc.deallocate(old_items,size_alloced);
+        size_active = new_active;
+        size_alloced = new_alloced;
+      }
+
+      __forceinline size_t internal_grow_size(size_t new_alloced)
+      {
+        /* do nothing if container already large enough */
+        if (new_alloced <= size_alloced) 
+          return size_alloced;
+
+        /* resize to next power of 2 otherwise */
+        size_t new_size_alloced = size_alloced;
+        while (new_size_alloced < new_alloced) {
+          new_size_alloced = std::max(size_t(1),2*new_size_alloced);
+        }
+        return new_size_alloced;
+      }
+
+    private:
+      allocator alloc;
+      size_t size_active;    // number of valid items
+      size_t size_alloced;   // number of items allocated
+      T* items;              // data array
+    };
+
+  /*! vector class that performs standard allocations */
+  template<typename T>
+    using vector = vector_t<T,std::allocator<T>>;
+
+  /*! vector class that performs aligned allocations */
+  template<typename T>
+    using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >;
+  
+  /*! vector class that performs OS allocations */
+  template<typename T>
+    using ovector = vector_t<T,os_allocator<T> >;
+}
diff --git a/thirdparty/embree/common/tasking/taskscheduler.h b/thirdparty/embree/common/tasking/taskscheduler.h
new file mode 100644
index 0000000000..8f3dd87689
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskscheduler.h
@@ -0,0 +1,15 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(TASKING_INTERNAL)
+#  include "taskschedulerinternal.h"
+#elif defined(TASKING_TBB)
+#  include "taskschedulertbb.h"
+#elif defined(TASKING_PPL)
+#  include "taskschedulerppl.h"
+#else
+#  error "no tasking system enabled"
+#endif
+
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
new file mode 100644
index 0000000000..ad438588a3
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
@@ -0,0 +1,420 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "taskschedulerinternal.h"
+#include "../math/math.h"
+#include "../sys/sysinfo.h"
+#include <algorithm>
+
+namespace embree
+{
+  RTC_NAMESPACE_BEGIN
+  
+  static MutexSys g_mutex;
+  size_t TaskScheduler::g_numThreads = 0;
+  __thread TaskScheduler* TaskScheduler::g_instance = nullptr;
+  std::vector<Ref<TaskScheduler>> g_instance_vector;
+  __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr;
+  TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr;
+
+  template<typename Predicate, typename Body>
+  __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body)
+  {
+    while (true)
+    {
+      /*! some rounds that yield */
+      for (size_t i=0; i<32; i++)
+      {
+        /*! some spinning rounds */
+        const size_t threadCount = thread.threadCount();
+        for (size_t j=0; j<1024; j+=threadCount)
+        {
+          if (!pred()) return;
+          if (thread.scheduler->steal_from_other_threads(thread)) {
+            i=j=0;
+            body();
+          }
+        }
+        yield();
+      }
+    }
+  }
+
+  /*! run this task */
+  void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible
+  {
+    /* try to run if not already stolen */
+    if (try_switch_state(INITIALIZED,DONE))
+    {
+      Task* prevTask = thread.task;
+      thread.task = this;
+      // -- GODOT start --
+      // try {
+      // if (thread.scheduler->cancellingException == nullptr)
+          closure->execute();
+      // } catch (...) {
+      //   if (thread.scheduler->cancellingException == nullptr)
+      //     thread.scheduler->cancellingException = std::current_exception();
+      // }
+      // -- GODOT end --
+      thread.task = prevTask;
+      add_dependencies(-1);
+    }
+
+    /* steal until all dependencies have completed */
+    steal_loop(thread,
+               [&] () { return dependencies>0; },
+               [&] () { while (thread.tasks.execute_local_internal(thread,this)); });
+
+    /* now signal our parent task that we are finished */
+    if (parent)
+      parent->add_dependencies(-1);
+  }
+
+    /*! run this task */
+  dll_export void TaskScheduler::Task::run (Thread& thread) {
+    run_internal(thread);
+  }
+
+  bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent)
+  {
+    /* stop if we run out of local tasks or reach the waiting task */
+    if (right == 0 || &tasks[right-1] == parent)
+      return false;
+
+    /* execute task */
+    size_t oldRight = right;
+    tasks[right-1].run_internal(thread);
+    if (right != oldRight) {
+      THROW_RUNTIME_ERROR("you have to wait for spawned subtasks");
+    }
+
+    /* pop task and closure from stack */
+    right--;
+    if (tasks[right].stackPtr != size_t(-1))
+      stackPtr = tasks[right].stackPtr;
+
+    /* also move left pointer */
+    if (left >= right) left.store(right.load());
+
+    return right != 0;
+  }
+
+  dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) {
+    return execute_local_internal(thread,parent);
+  }
+
+  bool TaskScheduler::TaskQueue::steal(Thread& thread)
+  {
+    size_t l = left;
+    size_t r = right;
+    if (l < r)
+    {
+      l = left++;
+       if (l >= r)
+         return false;
+    }
+    else
+      return false;
+
+    if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right]))
+      return false;
+
+    thread.tasks.right++;
+    return true;
+  }
+
+  /* we steal from the left */
+  size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft()
+  {
+    if (left >= right) return 0;
+    return tasks[left].N;
+  }
+
+  void threadPoolFunction(std::pair<TaskScheduler::ThreadPool*,size_t>* pair)
+  {
+    TaskScheduler::ThreadPool* pool = pair->first;
+    size_t threadIndex = pair->second;
+    delete pair;
+    pool->thread_loop(threadIndex);
+  }
+
+  TaskScheduler::ThreadPool::ThreadPool(bool set_affinity)
+    : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {}
+
+  dll_export void TaskScheduler::ThreadPool::startThreads()
+  {
+    if (running) return;
+    setNumThreads(numThreads,true);
+  }
+
+  void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads)
+  {
+    Lock<MutexSys> lock(g_mutex);
+    assert(newNumThreads);
+    newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads());
+
+    numThreads = newNumThreads;
+    if (!startThreads && !running) return;
+    running = true;
+    size_t numThreadsActive = numThreadsRunning;
+
+    mutex.lock();
+    numThreadsRunning = newNumThreads;
+    mutex.unlock();
+    condition.notify_all();
+
+    /* start new threads */
+    for (size_t t=numThreadsActive; t<numThreads; t++)
+    {
+      if (t == 0) continue;
+      auto pair = new std::pair<TaskScheduler::ThreadPool*,size_t>(this,t);
+      threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1));
+    }
+
+    /* stop some threads if we reduce the number of threads */
+    for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) {
+      if (t == 0) continue;
+      embree::join(threads.back());
+      threads.pop_back();
+    }
+  }
+
+  TaskScheduler::ThreadPool::~ThreadPool()
+  {
+    /* leave all taskschedulers */
+    mutex.lock();
+    numThreadsRunning = 0;
+    mutex.unlock();
+    condition.notify_all();
+
+    /* wait for threads to terminate */
+    for (size_t i=0; i<threads.size(); i++)
+      embree::join(threads[i]);
+  }
+
+  dll_export void TaskScheduler::ThreadPool::add(const Ref<TaskScheduler>& scheduler)
+  {
+    mutex.lock();
+    schedulers.push_back(scheduler);
+    mutex.unlock();
+    condition.notify_all();
+  }
+
+  dll_export void TaskScheduler::ThreadPool::remove(const Ref<TaskScheduler>& scheduler)
+  {
+    Lock<MutexSys> lock(mutex);
+    for (std::list<Ref<TaskScheduler> >::iterator it = schedulers.begin(); it != schedulers.end(); it++) {
+      if (scheduler == *it) {
+        schedulers.erase(it);
+        return;
+      }
+    }
+  }
+
+  void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex)
+  {
+    while (globalThreadIndex < numThreadsRunning)
+    {
+      Ref<TaskScheduler> scheduler = NULL;
+      ssize_t threadIndex = -1;
+      {
+        Lock<MutexSys> lock(mutex);
+        condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); });
+        if (globalThreadIndex >= numThreadsRunning) break;
+        scheduler = schedulers.front();
+        threadIndex = scheduler->allocThreadIndex();
+      }
+      scheduler->thread_loop(threadIndex);
+    }
+  }
+
+  TaskScheduler::TaskScheduler()
+    : threadCounter(0), anyTasksRunning(0), hasRootTask(false)
+  {
+    threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x.
+    for (size_t i=0; i<threadLocal.size(); i++)
+      threadLocal[i].store(nullptr);
+  }
+
+  TaskScheduler::~TaskScheduler()
+  {
+    assert(threadCounter == 0);
+  }
+
+  dll_export size_t TaskScheduler::threadID()
+  {
+    Thread* thread = TaskScheduler::thread();
+    if (thread) return thread->threadIndex;
+    else        return 0;
+  }
+
+  dll_export size_t TaskScheduler::threadIndex()
+  {
+    Thread* thread = TaskScheduler::thread();
+    if (thread) return thread->threadIndex;
+    else        return 0;
+  }
+
+  dll_export size_t TaskScheduler::threadCount() {
+    return threadPool->size();
+  }
+
+  dll_export TaskScheduler* TaskScheduler::instance()
+  {
+    if (g_instance == NULL) {
+      Lock<MutexSys> lock(g_mutex);
+      g_instance = new TaskScheduler;
+      g_instance_vector.push_back(g_instance);
+    }
+    return g_instance;
+  }
+
+  void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads)
+  {
+    if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity);
+    threadPool->setNumThreads(numThreads,start_threads);
+  }
+
+  void TaskScheduler::destroy() {
+    delete threadPool; threadPool = nullptr;
+  }
+
+  dll_export ssize_t TaskScheduler::allocThreadIndex()
+  {
+    size_t threadIndex = threadCounter++;
+    assert(threadIndex < threadLocal.size());
+    return threadIndex;
+  }
+
+  void TaskScheduler::join()
+  {
+    mutex.lock();
+    size_t threadIndex = allocThreadIndex();
+    condition.wait(mutex, [&] () { return hasRootTask.load(); });
+    mutex.unlock();
+    // -- GODOT start --
+    // std::exception_ptr except = thread_loop(threadIndex);
+    // if (except != nullptr) std::rethrow_exception(except);
+    thread_loop(threadIndex);
+    // -- GODOT end --
+  }
+
+  void TaskScheduler::reset() {
+    hasRootTask = false;
+  }
+
+  void TaskScheduler::wait_for_threads(size_t threadCount)
+  {
+    while (threadCounter < threadCount-1)
+      pause_cpu();
+  }
+
+  dll_export TaskScheduler::Thread* TaskScheduler::thread() {
+    return thread_local_thread;
+  }
+
+  dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread)
+  {
+    Thread* old = thread_local_thread;
+    thread_local_thread = thread;
+    return old;
+  }
+
+  dll_export bool TaskScheduler::wait()
+  {
+    Thread* thread = TaskScheduler::thread();
+    if (thread == nullptr) return true;
+    while (thread->tasks.execute_local_internal(*thread,thread->task)) {};
+    return thread->scheduler->cancellingException == nullptr;
+  }
+
+// -- GODOT start --
+//   std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
+  void TaskScheduler::thread_loop(size_t threadIndex)
+// -- GODOT end --
+  {
+    /* allocate thread structure */
+    std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+    Thread& thread = *mthread;
+    threadLocal[threadIndex].store(&thread);
+    Thread* oldThread = swapThread(&thread);
+
+    /* main thread loop */
+    while (anyTasksRunning)
+    {
+      steal_loop(thread,
+                 [&] () { return anyTasksRunning > 0; },
+                 [&] () {
+                   anyTasksRunning++;
+                   while (thread.tasks.execute_local_internal(thread,nullptr));
+                   anyTasksRunning--;
+                 });
+    }
+    threadLocal[threadIndex].store(nullptr);
+    swapThread(oldThread);
+
+    /* remember exception to throw */
+    // -- GODOT start --
+    // std::exception_ptr except = nullptr;
+    // if (cancellingException != nullptr) except = cancellingException;
+    // -- GODOT end --
+    /* wait for all threads to terminate */
+    threadCounter--;
+#if defined(__WIN32__)
+	size_t loopIndex = 1;
+#endif
+#define LOOP_YIELD_THRESHOLD (4096)
+	while (threadCounter > 0) {
+#if defined(__WIN32__)
+          if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0)
+            yield();
+          else
+            _mm_pause();
+	  loopIndex++;
+#else
+          yield();
+#endif
+	}
+     // -- GODOT start --
+     // return except;
+     return;
+     // -- GODOT end --
+  }
+
+  bool TaskScheduler::steal_from_other_threads(Thread& thread)
+  {
+    const size_t threadIndex = thread.threadIndex;
+    const size_t threadCount = this->threadCounter;
+
+    for (size_t i=1; i<threadCount; i++)
+    {
+      pause_cpu(32);
+      size_t otherThreadIndex = threadIndex+i;
+      if (otherThreadIndex >= threadCount) otherThreadIndex -= threadCount;
+
+      Thread* othread = threadLocal[otherThreadIndex].load();
+      if (!othread)
+        continue;
+
+      if (othread->tasks.steal(thread))
+        return true;
+    }
+
+    return false;
+  }
+
+  dll_export void TaskScheduler::startThreads() {
+    threadPool->startThreads();
+  }
+
+  dll_export void TaskScheduler::addScheduler(const Ref<TaskScheduler>& scheduler) {
+    threadPool->add(scheduler);
+  }
+
+  dll_export void TaskScheduler::removeScheduler(const Ref<TaskScheduler>& scheduler) {
+    threadPool->remove(scheduler);
+  }
+
+  RTC_NAMESPACE_END
+}
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h
new file mode 100644
index 0000000000..8fa6bb12fa
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h
@@ -0,0 +1,385 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+#include "../sys/atomic.h"
+#include "../math/range.h"
+#include "../../include/embree3/rtcore.h"
+
+#include <list>
+
+namespace embree
+{
+
+  /* The tasking system exports some symbols to be used by the tutorials. Thus we 
+     hide is also in the API namespace when requested. */
+  RTC_NAMESPACE_BEGIN
+
+  struct TaskScheduler : public RefCount
+  {
+    ALIGNED_STRUCT_(64);
+    friend class Device;
+
+    static const size_t TASK_STACK_SIZE = 4*1024;           //!< task structure stack
+    static const size_t CLOSURE_STACK_SIZE = 512*1024;    //!< stack for task closures
+
+    struct Thread;
+
+    /*! virtual interface for all tasks */
+    struct TaskFunction {
+      virtual void execute() = 0;
+    };
+
+    /*! builds a task interface from a closure */
+    template<typename Closure>
+    struct ClosureTaskFunction : public TaskFunction
+    {
+      Closure closure;
+      __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {}
+      void execute() { closure(); };
+    };
+
+    struct __aligned(64) Task
+    {
+      /*! states a task can be in */
+      enum { DONE, INITIALIZED };
+
+      /*! switch from one state to another */
+      __forceinline void switch_state(int from, int to)
+      {
+	__memory_barrier();
+        MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to);
+	assert(success);
+      }
+
+      /*! try to switch from one state to another */
+      __forceinline bool try_switch_state(int from, int to) {
+	__memory_barrier();
+	return state.compare_exchange_strong(from,to);
+      }
+
+       /*! increment/decrement dependency counter */
+      void add_dependencies(int n) {
+	dependencies+=n;
+      }
+
+      /*! initialize all tasks to DONE state by default */
+      __forceinline Task()
+	: state(DONE) {}
+
+      /*! construction of new task */
+      __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N)
+        : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N)
+      {
+        if (parent) parent->add_dependencies(+1);
+	switch_state(DONE,INITIALIZED);
+      }
+
+      /*! construction of stolen task, stealing thread will decrement initial dependency */
+      __forceinline Task (TaskFunction* closure, Task* parent)
+        : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1)
+      {
+	switch_state(DONE,INITIALIZED);
+      }
+
+      /*! try to steal this task */
+      bool try_steal(Task& child)
+      {
+        if (!stealable) return false;
+	if (!try_switch_state(INITIALIZED,DONE)) return false;
+	new (&child) Task(closure, this);
+        return true;
+      }
+
+      /*! run this task */
+      dll_export void run(Thread& thread);
+
+      void run_internal(Thread& thread);
+
+    public:
+      std::atomic<int> state;            //!< state this task is in
+      std::atomic<int> dependencies;     //!< dependencies to wait for
+      std::atomic<bool> stealable;       //!< true if task can be stolen
+      TaskFunction* closure;             //!< the closure to execute
+      Task* parent;                      //!< parent task to signal when we are finished
+      size_t stackPtr;                   //!< stack location where closure is stored
+      size_t N;                          //!< approximative size of task
+    };
+
+    struct TaskQueue
+    {
+      TaskQueue ()
+      : left(0), right(0), stackPtr(0) {}
+
+      __forceinline void* alloc(size_t bytes, size_t align = 64)
+      {
+        size_t ofs = bytes + ((align - stackPtr) & (align-1));
+        if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+          // -- GODOT start --
+          // throw std::runtime_error("closure stack overflow");
+          abort();
+          // -- GODOT end --
+        stackPtr += ofs;
+        return &stack[stackPtr-bytes];
+      }
+
+      template<typename Closure>
+      __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
+      {
+        if (right >= TASK_STACK_SIZE)
+           // -- GODOT start --
+           // throw std::runtime_error("task stack overflow");
+           abort();
+           // -- GODOT end --
+
+	/* allocate new task on right side of stack */
+        size_t oldStackPtr = stackPtr;
+        TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
+        new (&tasks[right]) Task(func,thread.task,oldStackPtr,size);
+        right++;
+
+	/* also move left pointer */
+	if (left >= right-1) left = right-1;
+      }
+
+      dll_export bool execute_local(Thread& thread, Task* parent);
+      bool execute_local_internal(Thread& thread, Task* parent);
+      bool steal(Thread& thread);
+      size_t getTaskSizeAtLeft();
+
+      bool empty() { return right == 0; }
+
+    public:
+
+      /* task stack */
+      Task tasks[TASK_STACK_SIZE];
+      __aligned(64) std::atomic<size_t> left;   //!< threads steal from left
+      __aligned(64) std::atomic<size_t> right;  //!< new tasks are added to the right
+
+      /* closure stack */
+      __aligned(64) char stack[CLOSURE_STACK_SIZE];
+      size_t stackPtr;
+    };
+
+    /*! thread local structure for each thread */
+    struct Thread
+    {
+      ALIGNED_STRUCT_(64);
+
+      Thread (size_t threadIndex, const Ref<TaskScheduler>& scheduler)
+      : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {}
+
+      __forceinline size_t threadCount() {
+        return scheduler->threadCounter;
+      }
+
+      size_t threadIndex;              //!< ID of this thread
+      TaskQueue tasks;                 //!< local task queue
+      Task* task;                      //!< current active task
+      Ref<TaskScheduler> scheduler;     //!< pointer to task scheduler
+    };
+
+    /*! pool of worker threads */
+    struct ThreadPool
+    {
+      ThreadPool (bool set_affinity);
+      ~ThreadPool ();
+
+      /*! starts the threads */
+      dll_export void startThreads();
+
+      /*! sets number of threads to use */
+      void setNumThreads(size_t numThreads, bool startThreads = false);
+
+      /*! adds a task scheduler object for scheduling */
+      dll_export void add(const Ref<TaskScheduler>& scheduler);
+
+      /*! remove the task scheduler object again */
+      dll_export void remove(const Ref<TaskScheduler>& scheduler);
+
+      /*! returns number of threads of the thread pool */
+      size_t size() const { return numThreads; }
+
+      /*! main loop for all threads */
+      void thread_loop(size_t threadIndex);
+
+    private:
+      std::atomic<size_t> numThreads;
+      std::atomic<size_t> numThreadsRunning;
+      bool set_affinity;
+      std::atomic<bool> running;
+      std::vector<thread_t> threads;
+
+    private:
+      MutexSys mutex;
+      ConditionSys condition;
+      std::list<Ref<TaskScheduler> > schedulers;
+    };
+
+    TaskScheduler ();
+    ~TaskScheduler ();
+
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again */
+    static void destroy();
+
+    /*! lets new worker threads join the tasking system */
+    void join();
+    void reset();
+
+    /*! let a worker thread allocate a thread index */
+    dll_export ssize_t allocThreadIndex();
+
+    /*! wait for some number of threads available (threadCount includes main thread) */
+    void wait_for_threads(size_t threadCount);
+
+    /*! thread loop for all worker threads */
+    // -- GODOT start --
+    // std::exception_ptr thread_loop(size_t threadIndex);
+    void thread_loop(size_t threadIndex);
+    // -- GODOT end --
+
+    /*! steals a task from a different thread */
+    bool steal_from_other_threads(Thread& thread);
+
+    template<typename Predicate, typename Body>
+      static void steal_loop(Thread& thread, const Predicate& pred, const Body& body);
+
+    /* spawn a new task at the top of the threads task stack */
+    template<typename Closure>
+      void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true)
+    {
+      if (useThreadPool) startThreads();
+
+      size_t threadIndex = allocThreadIndex();
+      std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+      Thread& thread = *mthread;
+      assert(threadLocal[threadIndex].load() == nullptr);
+      threadLocal[threadIndex] = &thread;
+      Thread* oldThread = swapThread(&thread);
+      thread.tasks.push_right(thread,size,closure);
+      {
+        Lock<MutexSys> lock(mutex);
+	anyTasksRunning++;
+        hasRootTask = true;
+        condition.notify_all();
+      }
+
+      if (useThreadPool) addScheduler(this);
+
+      while (thread.tasks.execute_local(thread,nullptr));
+      anyTasksRunning--;
+      if (useThreadPool) removeScheduler(this);
+
+      threadLocal[threadIndex] = nullptr;
+      swapThread(oldThread);
+
+      /* remember exception to throw */
+      std::exception_ptr except = nullptr;
+      if (cancellingException != nullptr) except = cancellingException;
+
+      /* wait for all threads to terminate */
+      threadCounter--;
+      while (threadCounter > 0) yield();
+      cancellingException = nullptr;
+
+      /* re-throw proper exception */
+      if (except != nullptr)
+        std::rethrow_exception(except);
+    }
+
+    /* spawn a new task at the top of the threads task stack */
+    template<typename Closure>
+    static __forceinline void spawn(size_t size, const Closure& closure)
+    {
+      Thread* thread = TaskScheduler::thread();
+      if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure);
+      else                           instance()->spawn_root(closure,size);
+    }
+
+    /* spawn a new task at the top of the threads task stack */
+    template<typename Closure>
+    static __forceinline void spawn(const Closure& closure) {
+      spawn(1,closure);
+    }
+
+    /* spawn a new task set  */
+    template<typename Index, typename Closure>
+    static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure)
+    {
+      spawn(end-begin, [=]()
+        {
+	  if (end-begin <= blockSize) {
+	    return closure(range<Index>(begin,end));
+	  }
+	  const Index center = (begin+end)/2;
+	  spawn(begin,center,blockSize,closure);
+	  spawn(center,end  ,blockSize,closure);
+	  wait();
+	});
+    }
+
+    /* work on spawned subtasks and wait until all have finished */
+    dll_export static bool wait();
+
+    /* returns the ID of the current thread */
+    dll_export static size_t threadID();
+
+    /* returns the index (0..threadCount-1) of the current thread */
+    dll_export static size_t threadIndex();
+
+    /* returns the total number of threads */
+    dll_export static size_t threadCount();
+
+  private:
+
+    /* returns the thread local task list of this worker thread */
+    dll_export static Thread* thread();
+
+    /* sets the thread local task list of this worker thread */
+    dll_export static Thread* swapThread(Thread* thread);
+
+    /*! returns the taskscheduler object to be used by the master thread */
+    dll_export static TaskScheduler* instance();
+
+    /*! starts the threads */
+    dll_export static void startThreads();
+
+    /*! adds a task scheduler object for scheduling */
+    dll_export static void addScheduler(const Ref<TaskScheduler>& scheduler);
+
+    /*! remove the task scheduler object again */
+    dll_export static void removeScheduler(const Ref<TaskScheduler>& scheduler);
+
+  private:
+    std::vector<atomic<Thread*>> threadLocal;
+    std::atomic<size_t> threadCounter;
+    std::atomic<size_t> anyTasksRunning;
+    std::atomic<bool> hasRootTask;
+    std::exception_ptr cancellingException;
+    MutexSys mutex;
+    ConditionSys condition;
+
+  private:
+    static size_t g_numThreads;
+    static __thread TaskScheduler* g_instance;
+    static __thread Thread* thread_local_thread;
+    static ThreadPool* threadPool;
+  };
+
+  RTC_NAMESPACE_END
+
+#if defined(RTC_NAMESPACE)
+    using RTC_NAMESPACE::TaskScheduler;
+#endif
+}
diff --git a/thirdparty/embree/common/tasking/taskschedulerppl.h b/thirdparty/embree/common/tasking/taskschedulerppl.h
new file mode 100644
index 0000000000..cbc2ecdbb8
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskschedulerppl.h
@@ -0,0 +1,46 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if !defined(__WIN32__)
+#error PPL tasking system only available under windows
+#endif
+
+#include <ppl.h>
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again */
+    static void destroy();
+
+    /* returns the ID of the current thread */
+    static __forceinline size_t threadID() {
+      return GetCurrentThreadId();
+    }
+
+    /* returns the index (0..threadCount-1) of the current thread */
+    /* FIXME: threadIndex is NOT supported by PPL! */
+    static __forceinline size_t threadIndex() {
+      return 0;
+    }
+
+    /* returns the total number of threads */
+    static __forceinline size_t threadCount() {
+      return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1;
+    }
+  };
+};
diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h
new file mode 100644
index 0000000000..35bd49849f
--- /dev/null
+++ b/thirdparty/embree/common/tasking/taskschedulertbb.h
@@ -0,0 +1,73 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if defined(__WIN32__)
+// -- GODOT start --
+#if !defined(NOMINMAX)
+// -- GODOT end --
+#  define NOMINMAX
+// -- GODOT start --
+#endif
+// -- GODOT end --
+#endif
+
+// We need to define these to avoid implicit linkage against
+// tbb_debug.lib under Windows. When removing these lines debug build
+// under Windows fails.
+#define __TBB_NO_IMPLICIT_LINKAGE 1
+#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1
+#include "tbb/tbb.h"
+#include "tbb/parallel_sort.h"
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again */
+    static void destroy();
+
+    /* returns the ID of the current thread */
+    static __forceinline size_t threadID()
+    {
+      return threadIndex();
+    }
+
+    /* returns the index (0..threadCount-1) of the current thread */
+    static __forceinline size_t threadIndex()
+    {
+#if TBB_INTERFACE_VERSION >= 9100
+      return tbb::this_task_arena::current_thread_index();
+#elif TBB_INTERFACE_VERSION >= 9000
+      return tbb::task_arena::current_thread_index();
+#else
+      return 0;
+#endif
+    }
+
+    /* returns the total number of threads */
+    static __forceinline size_t threadCount() {
+#if TBB_INTERFACE_VERSION >= 9100
+      return tbb::this_task_arena::max_concurrency();
+#else
+      return tbb::task_scheduler_init::default_num_threads();
+#endif
+    }
+
+  };
+
+};
diff --git a/thirdparty/embree/include/embree3/rtcore.h b/thirdparty/embree/include/embree3/rtcore.h
new file mode 100644
index 0000000000..450ab4c535
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore.h
@@ -0,0 +1,14 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_config.h"
+#include "rtcore_common.h"
+#include "rtcore_device.h"
+#include "rtcore_buffer.h"
+#include "rtcore_ray.h"
+#include "rtcore_geometry.h"
+#include "rtcore_scene.h"
+#include "rtcore_builder.h"
+#include "rtcore_quaternion.h"
diff --git a/thirdparty/embree/include/embree3/rtcore_buffer.h b/thirdparty/embree/include/embree3/rtcore_buffer.h
new file mode 100644
index 0000000000..6b8eba9769
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore_buffer.h
@@ -0,0 +1,51 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_device.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Types of buffers */
+enum RTCBufferType
+{
+  RTC_BUFFER_TYPE_INDEX            = 0,
+  RTC_BUFFER_TYPE_VERTEX           = 1,
+  RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE = 2,
+  RTC_BUFFER_TYPE_NORMAL           = 3,
+  RTC_BUFFER_TYPE_TANGENT          = 4,
+  RTC_BUFFER_TYPE_NORMAL_DERIVATIVE = 5,
+
+  RTC_BUFFER_TYPE_GRID                 = 8,
+
+  RTC_BUFFER_TYPE_FACE                 = 16,
+  RTC_BUFFER_TYPE_LEVEL                = 17,
+  RTC_BUFFER_TYPE_EDGE_CREASE_INDEX    = 18,
+  RTC_BUFFER_TYPE_EDGE_CREASE_WEIGHT   = 19,
+  RTC_BUFFER_TYPE_VERTEX_CREASE_INDEX  = 20,
+  RTC_BUFFER_TYPE_VERTEX_CREASE_WEIGHT = 21,
+  RTC_BUFFER_TYPE_HOLE                 = 22,
+
+  RTC_BUFFER_TYPE_FLAGS = 32
+};
+
+/* Opaque buffer type */
+typedef struct RTCBufferTy* RTCBuffer;
+
+/* Creates a new buffer. */
+RTC_API RTCBuffer rtcNewBuffer(RTCDevice device, size_t byteSize);
+
+/* Creates a new shared buffer. */
+RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice device, void* ptr, size_t byteSize);
+
+/* Returns a pointer to the buffer data. */
+RTC_API void* rtcGetBufferData(RTCBuffer buffer);
+
+/* Retains the buffer (increments the reference count). */
+RTC_API void rtcRetainBuffer(RTCBuffer buffer);
+
+/* Releases the buffer (decrements the reference count). */
+RTC_API void rtcReleaseBuffer(RTCBuffer buffer);
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree/include/embree3/rtcore_builder.h b/thirdparty/embree/include/embree3/rtcore_builder.h
new file mode 100644
index 0000000000..4bff999fed
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore_builder.h
@@ -0,0 +1,125 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_scene.h"
+
+RTC_NAMESPACE_BEGIN
+  
+/* Opaque BVH type */
+typedef struct RTCBVHTy* RTCBVH;
+
+/* Input build primitives for the builder */
+struct RTC_ALIGN(32) RTCBuildPrimitive
+{
+  float lower_x, lower_y, lower_z; 
+  unsigned int geomID;
+  float upper_x, upper_y, upper_z;
+  unsigned int primID;
+};
+
+/* Opaque thread local allocator type */
+typedef struct RTCThreadLocalAllocatorTy* RTCThreadLocalAllocator;
+
+/* Callback to create a node */
+typedef void* (*RTCCreateNodeFunction) (RTCThreadLocalAllocator allocator, unsigned int childCount, void* userPtr);
+
+/* Callback to set the pointer to all children */
+typedef void (*RTCSetNodeChildrenFunction) (void* nodePtr, void** children, unsigned int childCount, void* userPtr);
+
+/* Callback to set the bounds of all children */
+typedef void (*RTCSetNodeBoundsFunction) (void* nodePtr, const struct RTCBounds** bounds, unsigned int childCount, void* userPtr);
+
+/* Callback to create a leaf node */
+typedef void* (*RTCCreateLeafFunction) (RTCThreadLocalAllocator allocator, const struct RTCBuildPrimitive* primitives, size_t primitiveCount, void* userPtr);
+
+/* Callback to split a build primitive */
+typedef void (*RTCSplitPrimitiveFunction) (const struct RTCBuildPrimitive* primitive, unsigned int dimension, float position, struct RTCBounds* leftBounds, struct RTCBounds* rightBounds, void* userPtr);
+
+/* Build flags */
+enum RTCBuildFlags
+{
+  RTC_BUILD_FLAG_NONE    = 0,
+  RTC_BUILD_FLAG_DYNAMIC = (1 << 0),
+};
+
+enum RTCBuildConstants
+{
+  RTC_BUILD_MAX_PRIMITIVES_PER_LEAF = 32
+};
+
+/* Input for builders */
+struct RTCBuildArguments
+{
+  size_t byteSize;
+  
+  enum RTCBuildQuality buildQuality;
+  enum RTCBuildFlags buildFlags;
+  unsigned int maxBranchingFactor;
+  unsigned int maxDepth;
+  unsigned int sahBlockSize;
+  unsigned int minLeafSize;
+  unsigned int maxLeafSize;
+  float traversalCost;
+  float intersectionCost;
+  
+  RTCBVH bvh;
+  struct RTCBuildPrimitive* primitives;
+  size_t primitiveCount;
+  size_t primitiveArrayCapacity;
+  
+  RTCCreateNodeFunction createNode;
+  RTCSetNodeChildrenFunction setNodeChildren;
+  RTCSetNodeBoundsFunction setNodeBounds;
+  RTCCreateLeafFunction createLeaf;
+  RTCSplitPrimitiveFunction splitPrimitive;
+  RTCProgressMonitorFunction buildProgress;
+  void* userPtr;
+};
+
+/* Returns the default build settings.  */
+RTC_FORCEINLINE struct RTCBuildArguments rtcDefaultBuildArguments()
+{
+  struct RTCBuildArguments args;
+  args.byteSize = sizeof(args);
+  args.buildQuality = RTC_BUILD_QUALITY_MEDIUM;
+  args.buildFlags = RTC_BUILD_FLAG_NONE;
+  args.maxBranchingFactor = 2;
+  args.maxDepth = 32;
+  args.sahBlockSize = 1;
+  args.minLeafSize = 1;
+  args.maxLeafSize = RTC_BUILD_MAX_PRIMITIVES_PER_LEAF;
+  args.traversalCost = 1.0f;
+  args.intersectionCost = 1.0f;
+  args.bvh = NULL;
+  args.primitives = NULL;
+  args.primitiveCount = 0;
+  args.primitiveArrayCapacity = 0;
+  args.createNode = NULL;
+  args.setNodeChildren = NULL;
+  args.setNodeBounds = NULL;
+  args.createLeaf = NULL;
+  args.splitPrimitive = NULL;
+  args.buildProgress = NULL;
+  args.userPtr = NULL;
+  return args;
+}
+
+/* Creates a new BVH. */
+RTC_API RTCBVH rtcNewBVH(RTCDevice device);
+
+/* Builds a BVH. */
+RTC_API void* rtcBuildBVH(const struct RTCBuildArguments* args);
+
+/* Allocates memory using the thread local allocator. */
+RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator allocator, size_t bytes, size_t align);
+
+/* Retains the BVH (increments reference count). */
+RTC_API void rtcRetainBVH(RTCBVH bvh);
+
+/* Releases the BVH (decrements reference count). */
+RTC_API void rtcReleaseBVH(RTCBVH bvh);
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h
new file mode 100644
index 0000000000..4857e1e05e
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore_common.h
@@ -0,0 +1,328 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <stddef.h>
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include "rtcore_config.h"
+
+RTC_NAMESPACE_BEGIN
+
+#if defined(_WIN32)
+#if defined(_M_X64)
+typedef long long ssize_t;
+#else
+typedef int ssize_t;
+#endif
+#endif
+
+// -- GODOT start --
+#if defined(_WIN32) && defined(_MSC_VER)
+// -- GODOT end --
+#  define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
+#else
+#  define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
+#endif
+
+#if !defined (RTC_DEPRECATED)
+#ifdef __GNUC__
+  #define RTC_DEPRECATED __attribute__((deprecated))
+#elif defined(_MSC_VER)
+  #define RTC_DEPRECATED __declspec(deprecated)
+#else
+  #define RTC_DEPRECATED
+#endif
+#endif
+
+#if defined(_WIN32) 
+#  define RTC_FORCEINLINE __forceinline
+#else
+#  define RTC_FORCEINLINE inline __attribute__((always_inline))
+#endif
+
+/* Invalid geometry ID */
+#define RTC_INVALID_GEOMETRY_ID ((unsigned int)-1)
+
+/* Maximum number of time steps */
+#define RTC_MAX_TIME_STEP_COUNT 129
+
+/* Formats of buffers and other data structures */
+enum RTCFormat
+{
+  RTC_FORMAT_UNDEFINED = 0,
+
+  /* 8-bit unsigned integer */
+  RTC_FORMAT_UCHAR = 0x1001,
+  RTC_FORMAT_UCHAR2,
+  RTC_FORMAT_UCHAR3,
+  RTC_FORMAT_UCHAR4,
+
+  /* 8-bit signed integer */
+  RTC_FORMAT_CHAR = 0x2001,
+  RTC_FORMAT_CHAR2,
+  RTC_FORMAT_CHAR3,
+  RTC_FORMAT_CHAR4,
+
+  /* 16-bit unsigned integer */
+  RTC_FORMAT_USHORT = 0x3001,
+  RTC_FORMAT_USHORT2,
+  RTC_FORMAT_USHORT3,
+  RTC_FORMAT_USHORT4,
+
+  /* 16-bit signed integer */
+  RTC_FORMAT_SHORT = 0x4001,
+  RTC_FORMAT_SHORT2,
+  RTC_FORMAT_SHORT3,
+  RTC_FORMAT_SHORT4,
+
+  /* 32-bit unsigned integer */
+  RTC_FORMAT_UINT = 0x5001,
+  RTC_FORMAT_UINT2,
+  RTC_FORMAT_UINT3,
+  RTC_FORMAT_UINT4,
+
+  /* 32-bit signed integer */
+  RTC_FORMAT_INT = 0x6001,
+  RTC_FORMAT_INT2,
+  RTC_FORMAT_INT3,
+  RTC_FORMAT_INT4,
+
+  /* 64-bit unsigned integer */
+  RTC_FORMAT_ULLONG = 0x7001,
+  RTC_FORMAT_ULLONG2,
+  RTC_FORMAT_ULLONG3,
+  RTC_FORMAT_ULLONG4,
+
+  /* 64-bit signed integer */
+  RTC_FORMAT_LLONG = 0x8001,
+  RTC_FORMAT_LLONG2,
+  RTC_FORMAT_LLONG3,
+  RTC_FORMAT_LLONG4,
+
+  /* 32-bit float */
+  RTC_FORMAT_FLOAT = 0x9001,
+  RTC_FORMAT_FLOAT2,
+  RTC_FORMAT_FLOAT3,
+  RTC_FORMAT_FLOAT4,
+  RTC_FORMAT_FLOAT5,
+  RTC_FORMAT_FLOAT6,
+  RTC_FORMAT_FLOAT7,
+  RTC_FORMAT_FLOAT8,
+  RTC_FORMAT_FLOAT9,
+  RTC_FORMAT_FLOAT10,
+  RTC_FORMAT_FLOAT11,
+  RTC_FORMAT_FLOAT12,
+  RTC_FORMAT_FLOAT13,
+  RTC_FORMAT_FLOAT14,
+  RTC_FORMAT_FLOAT15,
+  RTC_FORMAT_FLOAT16,
+
+  /* 32-bit float matrix (row-major order) */
+  RTC_FORMAT_FLOAT2X2_ROW_MAJOR = 0x9122,
+  RTC_FORMAT_FLOAT2X3_ROW_MAJOR = 0x9123,
+  RTC_FORMAT_FLOAT2X4_ROW_MAJOR = 0x9124,
+  RTC_FORMAT_FLOAT3X2_ROW_MAJOR = 0x9132,
+  RTC_FORMAT_FLOAT3X3_ROW_MAJOR = 0x9133,
+  RTC_FORMAT_FLOAT3X4_ROW_MAJOR = 0x9134,
+  RTC_FORMAT_FLOAT4X2_ROW_MAJOR = 0x9142,
+  RTC_FORMAT_FLOAT4X3_ROW_MAJOR = 0x9143,
+  RTC_FORMAT_FLOAT4X4_ROW_MAJOR = 0x9144,
+
+  /* 32-bit float matrix (column-major order) */
+  RTC_FORMAT_FLOAT2X2_COLUMN_MAJOR = 0x9222,
+  RTC_FORMAT_FLOAT2X3_COLUMN_MAJOR = 0x9223,
+  RTC_FORMAT_FLOAT2X4_COLUMN_MAJOR = 0x9224,
+  RTC_FORMAT_FLOAT3X2_COLUMN_MAJOR = 0x9232,
+  RTC_FORMAT_FLOAT3X3_COLUMN_MAJOR = 0x9233,
+  RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR = 0x9234,
+  RTC_FORMAT_FLOAT4X2_COLUMN_MAJOR = 0x9242,
+  RTC_FORMAT_FLOAT4X3_COLUMN_MAJOR = 0x9243,
+  RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR = 0x9244,
+
+  /* special 12-byte format for grids */
+  RTC_FORMAT_GRID = 0xA001
+};
+
+/* Build quality levels */
+enum RTCBuildQuality
+{
+  RTC_BUILD_QUALITY_LOW    = 0,
+  RTC_BUILD_QUALITY_MEDIUM = 1,
+  RTC_BUILD_QUALITY_HIGH   = 2,
+  RTC_BUILD_QUALITY_REFIT  = 3,
+};
+
+/* Axis-aligned bounding box representation */
+struct RTC_ALIGN(16) RTCBounds
+{
+  float lower_x, lower_y, lower_z, align0;
+  float upper_x, upper_y, upper_z, align1;
+};
+
+/* Linear axis-aligned bounding box representation */
+struct RTC_ALIGN(16) RTCLinearBounds
+{
+  struct RTCBounds bounds0;
+  struct RTCBounds bounds1;
+};
+
+/* Intersection context flags */
+enum RTCIntersectContextFlags
+{
+  RTC_INTERSECT_CONTEXT_FLAG_NONE       = 0,
+  RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT = (0 << 0), // optimize for incoherent rays
+  RTC_INTERSECT_CONTEXT_FLAG_COHERENT   = (1 << 0)  // optimize for coherent rays
+};
+
+/* Arguments for RTCFilterFunctionN */
+struct RTCFilterFunctionNArguments
+{
+  int* valid;
+  void* geometryUserPtr;
+  struct RTCIntersectContext* context;
+  struct RTCRayN* ray;
+  struct RTCHitN* hit;
+  unsigned int N;
+};
+
+/* Filter callback function */
+typedef void (*RTCFilterFunctionN)(const struct RTCFilterFunctionNArguments* args);
+
+/* Intersection context passed to intersect/occluded calls */
+struct RTCIntersectContext
+{
+  enum RTCIntersectContextFlags flags;               // intersection flags
+  RTCFilterFunctionN filter;                         // filter function to execute
+  
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  unsigned int instStackSize;                        // Number of instances currently on the stack.
+#endif
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // The current stack of instance ids.
+  
+#if RTC_MIN_WIDTH
+  float minWidthDistanceFactor;                      // curve radius is set to this factor times distance to ray origin
+#endif
+};
+
+/* Initializes an intersection context. */
+RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context)
+{
+  unsigned l = 0;
+  context->flags = RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT;
+  context->filter = NULL;
+  
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  context->instStackSize = 0;
+#endif
+  for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    context->instID[l] = RTC_INVALID_GEOMETRY_ID;
+  
+#if RTC_MIN_WIDTH
+  context->minWidthDistanceFactor = 0.0f;
+#endif
+}
+
+/* Point query structure for closest point query */
+struct RTC_ALIGN(16) RTCPointQuery 
+{
+  float x;                // x coordinate of the query point
+  float y;                // y coordinate of the query point
+  float z;                // z coordinate of the query point
+  float time;             // time of the point query
+  float radius;           // radius of the point query 
+};
+
+/* Structure of a packet of 4 query points */
+struct RTC_ALIGN(16) RTCPointQuery4
+{
+  float x[4];                // x coordinate of the query point
+  float y[4];                // y coordinate of the query point
+  float z[4];                // z coordinate of the query point
+  float time[4];             // time of the point query
+  float radius[4];           // radius of the point query
+};
+
+/* Structure of a packet of 8 query points */
+struct RTC_ALIGN(32) RTCPointQuery8
+{
+  float x[8];                // x coordinate of the query point
+  float y[8];                // y coordinate of the query point
+  float z[8];                // z coordinate of the query point
+  float time[8];             // time of the point query
+  float radius[8];           // radius ofr the point query 
+};
+
+/* Structure of a packet of 16 query points */
+struct RTC_ALIGN(64) RTCPointQuery16
+{
+  float x[16];                // x coordinate of the query point
+  float y[16];                // y coordinate of the query point
+  float z[16];                // z coordinate of the query point
+  float time[16];             // time of the point quey
+  float radius[16];           // radius of the point query
+};
+
+struct RTCPointQueryN;
+
+struct RTC_ALIGN(16) RTCPointQueryContext
+{
+  // accumulated 4x4 column major matrices from world space to instance space.
+  // undefined if size == 0.
+  float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; 
+
+  // accumulated 4x4 column major matrices from instance space to world space.
+  // undefined if size == 0.
+  float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; 
+
+  // instance ids.
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];
+
+  // number of instances currently on the stack.
+  unsigned int instStackSize;
+};
+
+/* Initializes an intersection context. */
+RTC_FORCEINLINE void rtcInitPointQueryContext(struct RTCPointQueryContext* context)
+{
+  context->instStackSize = 0;
+  context->instID[0] = RTC_INVALID_GEOMETRY_ID;
+}
+
+struct RTC_ALIGN(16) RTCPointQueryFunctionArguments
+{
+  // The (world space) query object that was passed as an argument of rtcPointQuery. The
+  // radius of the query can be decreased inside the callback to shrink the
+  // search domain. Increasing the radius or modifying the time or position of
+  // the query results in undefined behaviour.
+  struct RTCPointQuery* query;
+
+  // Used for user input/output data. Will not be read or modified internally.
+  void* userPtr;
+
+  // primitive and geometry ID of primitive
+  unsigned int  primID;        
+  unsigned int  geomID;    
+
+  // the context with transformation and instance ID stack
+  struct RTCPointQueryContext* context;
+
+  // If the current instance transform M (= context->world2inst[context->instStackSize]) 
+  // is a similarity matrix, i.e there is a constant factor similarityScale such that,
+  //    for all x,y: dist(Mx, My) = similarityScale * dist(x, y),
+  // The similarity scale is 0, if the current instance transform is not a
+  // similarity transform and vice versa. The similarity scale allows to compute
+  // distance information in instance space and scale the distances into world
+  // space by dividing with the similarity scale, for example, to update the
+  // query radius. If the current instance transform is not a similarity
+  // transform (similarityScale = 0), the distance computation has to be
+  // performed in world space to ensure correctness. if there is no instance
+  // transform (context->instStackSize == 0), the similarity scale is 1.
+  float similarityScale;
+};
+
+typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
+  
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree3/rtcore_config.h
new file mode 100644
index 0000000000..3a9819c9f1
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore_config.h
@@ -0,0 +1,57 @@
+
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define RTC_VERSION_MAJOR 3
+#define RTC_VERSION_MINOR 13
+#define RTC_VERSION_PATCH 0
+#define RTC_VERSION 31300
+#define RTC_VERSION_STRING "3.13.0"
+
+#define RTC_MAX_INSTANCE_LEVEL_COUNT 1
+
+#define EMBREE_MIN_WIDTH 0
+#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
+
+#define EMBREE_STATIC_LIB
+/* #undef EMBREE_API_NAMESPACE */
+
+#if defined(EMBREE_API_NAMESPACE)
+#  define RTC_NAMESPACE
+#  define RTC_NAMESPACE_BEGIN namespace  {
+#  define RTC_NAMESPACE_END }
+#  define RTC_NAMESPACE_USE using namespace ;
+#  define RTC_API_EXTERN_C
+#  undef EMBREE_API_NAMESPACE
+#else
+#  define RTC_NAMESPACE_BEGIN
+#  define RTC_NAMESPACE_END
+#  define RTC_NAMESPACE_USE
+#  if defined(__cplusplus)
+#    define RTC_API_EXTERN_C extern "C"
+#  else
+#    define RTC_API_EXTERN_C
+#  endif
+#endif
+
+#if defined(ISPC)
+#  define RTC_API_IMPORT extern "C" unmasked
+#  define RTC_API_EXPORT extern "C" unmasked
+#elif defined(EMBREE_STATIC_LIB)
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C
+#elif defined(_WIN32)
+#  define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport)
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport)
+#else
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default")))
+#endif
+
+#if defined(RTC_EXPORT_API)
+#  define RTC_API RTC_API_EXPORT
+#else
+#  define RTC_API RTC_API_IMPORT
+#endif
diff --git a/thirdparty/embree/include/embree3/rtcore_device.h b/thirdparty/embree/include/embree3/rtcore_device.h
new file mode 100644
index 0000000000..2dd3047603
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore_device.h
@@ -0,0 +1,87 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_common.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Opaque device type */
+typedef struct RTCDeviceTy* RTCDevice;
+
+/* Creates a new Embree device. */
+RTC_API RTCDevice rtcNewDevice(const char* config);
+
+/* Retains the Embree device (increments the reference count). */
+RTC_API void rtcRetainDevice(RTCDevice device);
+  
+/* Releases an Embree device (decrements the reference count). */
+RTC_API void rtcReleaseDevice(RTCDevice device);
+
+/* Device properties */
+enum RTCDeviceProperty
+{
+  RTC_DEVICE_PROPERTY_VERSION       = 0,
+  RTC_DEVICE_PROPERTY_VERSION_MAJOR = 1,
+  RTC_DEVICE_PROPERTY_VERSION_MINOR = 2,
+  RTC_DEVICE_PROPERTY_VERSION_PATCH = 3,
+
+  RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED  = 32,
+  RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED  = 33,
+  RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED = 34,
+  RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED   = 35,
+
+  RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED = 63,
+  RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED          = 64,
+  RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED    = 65,
+  RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED   = 66,
+  RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED = 67,
+  RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED       = 68,
+
+  RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED    = 96,
+  RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED        = 97,
+  RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED = 98,
+  RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED       = 99,
+  RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED        = 100,
+  RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED       = 101,
+
+  RTC_DEVICE_PROPERTY_TASKING_SYSTEM        = 128,
+  RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED = 129,
+  RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED = 130
+};
+
+/* Gets a device property. */
+RTC_API ssize_t rtcGetDeviceProperty(RTCDevice device, enum RTCDeviceProperty prop);
+
+/* Sets a device property. */
+RTC_API void rtcSetDeviceProperty(RTCDevice device, const enum RTCDeviceProperty prop, ssize_t value);
+  
+/* Error codes */
+enum RTCError
+{
+  RTC_ERROR_NONE              = 0,
+  RTC_ERROR_UNKNOWN           = 1,
+  RTC_ERROR_INVALID_ARGUMENT  = 2,
+  RTC_ERROR_INVALID_OPERATION = 3,
+  RTC_ERROR_OUT_OF_MEMORY     = 4,
+  RTC_ERROR_UNSUPPORTED_CPU   = 5,
+  RTC_ERROR_CANCELLED         = 6
+};
+
+/* Returns the error code. */
+RTC_API enum RTCError rtcGetDeviceError(RTCDevice device);
+
+/* Error callback function */
+typedef void (*RTCErrorFunction)(void* userPtr, enum RTCError code, const char* str);
+
+/* Sets the error callback function. */
+RTC_API void rtcSetDeviceErrorFunction(RTCDevice device, RTCErrorFunction error, void* userPtr);
+
+/* Memory monitor callback function */
+typedef bool (*RTCMemoryMonitorFunction)(void* ptr, ssize_t bytes, bool post);
+
+/* Sets the memory monitor callback function. */
+RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice device, RTCMemoryMonitorFunction memoryMonitor, void* userPtr);
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree/include/embree3/rtcore_geometry.h b/thirdparty/embree/include/embree3/rtcore_geometry.h
new file mode 100644
index 0000000000..d1de17491c
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore_geometry.h
@@ -0,0 +1,383 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_buffer.h"
+#include "rtcore_quaternion.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Opaque scene type */
+typedef struct RTCSceneTy* RTCScene;
+
+/* Opaque geometry type */
+typedef struct RTCGeometryTy* RTCGeometry;
+
+/* Types of geometries */
+enum RTCGeometryType
+{
+  RTC_GEOMETRY_TYPE_TRIANGLE = 0, // triangle mesh
+  RTC_GEOMETRY_TYPE_QUAD     = 1, // quad (triangle pair) mesh
+  RTC_GEOMETRY_TYPE_GRID     = 2, // grid mesh
+
+  RTC_GEOMETRY_TYPE_SUBDIVISION = 8, // Catmull-Clark subdivision surface
+
+  RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE   = 15, // Cone linear curves - discontinuous at edge boundaries 
+  RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE  = 16, // Round (rounded cone like) linear curves 
+  RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE   = 17, // flat (ribbon-like) linear curves
+
+  RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE  = 24, // round (tube-like) Bezier curves
+  RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE   = 25, // flat (ribbon-like) Bezier curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE  = 26, // flat normal-oriented Bezier curves
+  
+  RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE = 32, // round (tube-like) B-spline curves
+  RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE  = 33, // flat (ribbon-like) B-spline curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE  = 34, // flat normal-oriented B-spline curves
+
+  RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE = 40, // round (tube-like) Hermite curves
+  RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE  = 41, // flat (ribbon-like) Hermite curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE  = 42, // flat normal-oriented Hermite curves
+
+  RTC_GEOMETRY_TYPE_SPHERE_POINT = 50,
+  RTC_GEOMETRY_TYPE_DISC_POINT = 51,
+  RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT = 52,
+
+  RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE = 58, // round (tube-like) Catmull-Rom curves
+  RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE  = 59, // flat (ribbon-like) Catmull-Rom curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE  = 60, // flat normal-oriented Catmull-Rom curves
+
+  RTC_GEOMETRY_TYPE_USER     = 120, // user-defined geometry
+  RTC_GEOMETRY_TYPE_INSTANCE = 121  // scene instance
+};
+
+/* Interpolation modes for subdivision surfaces */
+enum RTCSubdivisionMode
+{
+  RTC_SUBDIVISION_MODE_NO_BOUNDARY     = 0,
+  RTC_SUBDIVISION_MODE_SMOOTH_BOUNDARY = 1,
+  RTC_SUBDIVISION_MODE_PIN_CORNERS     = 2,
+  RTC_SUBDIVISION_MODE_PIN_BOUNDARY    = 3,
+  RTC_SUBDIVISION_MODE_PIN_ALL         = 4,
+};
+
+/* Curve segment flags */
+enum RTCCurveFlags
+{
+  RTC_CURVE_FLAG_NEIGHBOR_LEFT  = (1 << 0), // left segments exists
+  RTC_CURVE_FLAG_NEIGHBOR_RIGHT = (1 << 1)  // right segment exists
+};
+
+/* Arguments for RTCBoundsFunction */
+struct RTCBoundsFunctionArguments
+{
+  void* geometryUserPtr;
+  unsigned int primID;
+  unsigned int timeStep;
+  struct RTCBounds* bounds_o;
+};
+
+/* Bounding callback function */
+typedef void (*RTCBoundsFunction)(const struct RTCBoundsFunctionArguments* args);
+
+/* Arguments for RTCIntersectFunctionN */
+struct RTCIntersectFunctionNArguments
+{
+  int* valid;
+  void* geometryUserPtr;
+  unsigned int primID;
+  struct RTCIntersectContext* context;
+  struct RTCRayHitN* rayhit;
+  unsigned int N;
+  unsigned int geomID;
+};
+
+/* Intersection callback function */
+typedef void (*RTCIntersectFunctionN)(const struct RTCIntersectFunctionNArguments* args);
+
+/* Arguments for RTCOccludedFunctionN */
+struct RTCOccludedFunctionNArguments
+{
+  int* valid;
+  void* geometryUserPtr;
+  unsigned int primID;
+  struct RTCIntersectContext* context;
+  struct RTCRayN* ray;
+  unsigned int N;
+  unsigned int geomID;
+};
+
+/* Occlusion callback function */
+typedef void (*RTCOccludedFunctionN)(const struct RTCOccludedFunctionNArguments* args);
+
+/* Arguments for RTCDisplacementFunctionN */
+struct RTCDisplacementFunctionNArguments
+{
+  void* geometryUserPtr;
+  RTCGeometry geometry;
+  unsigned int primID;
+  unsigned int timeStep;
+  const float* u;
+  const float* v;
+  const float* Ng_x;
+  const float* Ng_y;
+  const float* Ng_z;
+  float* P_x;
+  float* P_y;
+  float* P_z;
+  unsigned int N;
+};
+
+/* Displacement mapping callback function */
+typedef void (*RTCDisplacementFunctionN)(const struct RTCDisplacementFunctionNArguments* args);
+
+/* Creates a new geometry of specified type. */
+RTC_API RTCGeometry rtcNewGeometry(RTCDevice device, enum RTCGeometryType type);
+
+/* Retains the geometry (increments the reference count). */
+RTC_API void rtcRetainGeometry(RTCGeometry geometry);
+
+/* Releases the geometry (decrements the reference count) */
+RTC_API void rtcReleaseGeometry(RTCGeometry geometry);
+
+/* Commits the geometry. */
+RTC_API void rtcCommitGeometry(RTCGeometry geometry);
+
+
+/* Enables the geometry. */
+RTC_API void rtcEnableGeometry(RTCGeometry geometry);
+
+/* Disables the geometry. */
+RTC_API void rtcDisableGeometry(RTCGeometry geometry);
+
+
+/* Sets the number of motion blur time steps of the geometry. */
+RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry geometry, unsigned int timeStepCount);
+
+/* Sets the motion blur time range of the geometry. */
+RTC_API void rtcSetGeometryTimeRange(RTCGeometry geometry, float startTime, float endTime);
+  
+/* Sets the number of vertex attributes of the geometry. */
+RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry geometry, unsigned int vertexAttributeCount);
+
+/* Sets the ray mask of the geometry. */
+RTC_API void rtcSetGeometryMask(RTCGeometry geometry, unsigned int mask);
+
+/* Sets the build quality of the geometry. */
+RTC_API void rtcSetGeometryBuildQuality(RTCGeometry geometry, enum RTCBuildQuality quality);
+
+/* Sets the maximal curve or point radius scale allowed by min-width feature. */
+RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry geometry, float maxRadiusScale);
+
+
+/* Sets a geometry buffer. */
+RTC_API void rtcSetGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, RTCBuffer buffer, size_t byteOffset, size_t byteStride, size_t itemCount);
+
+/* Sets a shared geometry buffer. */
+RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount);
+
+/* Creates and sets a new geometry buffer. */
+RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, size_t byteStride, size_t itemCount);
+
+/* Returns the pointer to the data of a buffer. */
+RTC_API void* rtcGetGeometryBufferData(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot);
+
+/* Updates a geometry buffer. */
+RTC_API void rtcUpdateGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot);
+
+
+/* Sets the intersection filter callback function of the geometry. */
+RTC_API void rtcSetGeometryIntersectFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter);
+
+/* Sets the occlusion filter callback function of the geometry. */
+RTC_API void rtcSetGeometryOccludedFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter);
+
+/* Sets the user-defined data pointer of the geometry. */
+RTC_API void rtcSetGeometryUserData(RTCGeometry geometry, void* ptr);
+
+/* Gets the user-defined data pointer of the geometry. */
+RTC_API void* rtcGetGeometryUserData(RTCGeometry geometry);
+
+/* Set the point query callback function of a geometry. */
+RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry geometry, RTCPointQueryFunction pointQuery);
+
+/* Sets the number of primitives of a user geometry. */
+RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry geometry, unsigned int userPrimitiveCount);
+
+/* Sets the bounding callback function to calculate bounding boxes for user primitives. */
+RTC_API void rtcSetGeometryBoundsFunction(RTCGeometry geometry, RTCBoundsFunction bounds, void* userPtr);
+
+/* Set the intersect callback function of a user geometry. */
+RTC_API void rtcSetGeometryIntersectFunction(RTCGeometry geometry, RTCIntersectFunctionN intersect);
+
+/* Set the occlusion callback function of a user geometry. */
+RTC_API void rtcSetGeometryOccludedFunction(RTCGeometry geometry, RTCOccludedFunctionN occluded);
+
+/* Invokes the intersection filter from the intersection callback function. */
+RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
+
+/* Invokes the occlusion filter from the occlusion callback function. */
+RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
+
+
+/* Sets the instanced scene of an instance geometry. */
+RTC_API void rtcSetGeometryInstancedScene(RTCGeometry geometry, RTCScene scene);
+
+/* Sets the transformation of an instance for the specified time step. */
+RTC_API void rtcSetGeometryTransform(RTCGeometry geometry, unsigned int timeStep, enum RTCFormat format, const void* xfm);
+
+/* Sets the transformation quaternion of an instance for the specified time step. */
+RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry geometry, unsigned int timeStep, const struct RTCQuaternionDecomposition* qd);
+
+/* Returns the interpolated transformation of an instance for the specified time. */
+RTC_API void rtcGetGeometryTransform(RTCGeometry geometry, float time, enum RTCFormat format, void* xfm);
+
+
+/* Sets the uniform tessellation rate of the geometry. */
+RTC_API void rtcSetGeometryTessellationRate(RTCGeometry geometry, float tessellationRate);
+
+/* Sets the number of topologies of a subdivision surface. */
+RTC_API void rtcSetGeometryTopologyCount(RTCGeometry geometry, unsigned int topologyCount);
+
+/* Sets the subdivision interpolation mode. */
+RTC_API void rtcSetGeometrySubdivisionMode(RTCGeometry geometry, unsigned int topologyID, enum RTCSubdivisionMode mode);
+
+/* Binds a vertex attribute to a topology of the geometry. */
+RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry geometry, unsigned int vertexAttributeID, unsigned int topologyID);
+
+/* Sets the displacement callback function of a subdivision surface. */
+RTC_API void rtcSetGeometryDisplacementFunction(RTCGeometry geometry, RTCDisplacementFunctionN displacement);
+
+/* Returns the first half edge of a face. */
+RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry geometry, unsigned int faceID);
+
+/* Returns the face the half edge belongs to. */
+RTC_API unsigned int rtcGetGeometryFace(RTCGeometry geometry, unsigned int edgeID);
+
+/* Returns next half edge. */
+RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry geometry, unsigned int edgeID);
+
+/* Returns previous half edge. */
+RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry geometry, unsigned int edgeID);
+
+/* Returns opposite half edge. */
+RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry geometry, unsigned int topologyID, unsigned int edgeID);
+
+
+/* Arguments for rtcInterpolate */
+struct RTCInterpolateArguments
+{
+  RTCGeometry geometry;
+  unsigned int primID;
+  float u;
+  float v;
+  enum RTCBufferType bufferType;
+  unsigned int bufferSlot;
+  float* P;
+  float* dPdu;
+  float* dPdv;
+  float* ddPdudu;
+  float* ddPdvdv;
+  float* ddPdudv;
+  unsigned int valueCount;
+};
+
+/* Interpolates vertex data to some u/v location and optionally calculates all derivatives. */
+RTC_API void rtcInterpolate(const struct RTCInterpolateArguments* args);
+
+/* Interpolates vertex data to some u/v location. */
+RTC_FORCEINLINE void rtcInterpolate0(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, float* P, unsigned int valueCount)
+{
+  struct RTCInterpolateArguments args;
+  args.geometry = geometry;
+  args.primID = primID;
+  args.u = u;
+  args.v = v;
+  args.bufferType = bufferType;
+  args.bufferSlot = bufferSlot;
+  args.P = P;
+  args.dPdu = NULL;
+  args.dPdv = NULL;
+  args.ddPdudu = NULL;
+  args.ddPdvdv = NULL;
+  args.ddPdudv = NULL;
+  args.valueCount = valueCount;
+  rtcInterpolate(&args);
+}
+
+/* Interpolates vertex data to some u/v location and calculates first order derivatives. */
+RTC_FORCEINLINE void rtcInterpolate1(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot,
+                                     float* P, float* dPdu, float* dPdv, unsigned int valueCount)
+{
+  struct RTCInterpolateArguments args;
+  args.geometry = geometry;
+  args.primID = primID;
+  args.u = u;
+  args.v = v;
+  args.bufferType = bufferType;
+  args.bufferSlot = bufferSlot;
+  args.P = P;
+  args.dPdu = dPdu;
+  args.dPdv = dPdv;
+  args.ddPdudu = NULL;
+  args.ddPdvdv = NULL;
+  args.ddPdudv = NULL;
+  args.valueCount = valueCount;
+  rtcInterpolate(&args);
+}
+
+/* Interpolates vertex data to some u/v location and calculates first and second order derivatives. */
+RTC_FORCEINLINE void rtcInterpolate2(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot,
+                                     float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, unsigned int valueCount)
+{
+  struct RTCInterpolateArguments args;
+  args.geometry = geometry;
+  args.primID = primID;
+  args.u = u;
+  args.v = v;
+  args.bufferType = bufferType;
+  args.bufferSlot = bufferSlot;
+  args.P = P;
+  args.dPdu = dPdu;
+  args.dPdv = dPdv;
+  args.ddPdudu = ddPdudu;
+  args.ddPdvdv = ddPdvdv;
+  args.ddPdudv = ddPdudv;
+  args.valueCount = valueCount;
+  rtcInterpolate(&args);
+}
+
+/* Arguments for rtcInterpolateN */
+struct RTCInterpolateNArguments
+{
+  RTCGeometry geometry;
+  const void* valid;
+  const unsigned int* primIDs;
+  const float* u;
+  const float* v;
+  unsigned int N;
+  enum RTCBufferType bufferType;
+  unsigned int bufferSlot;
+  float* P;
+  float* dPdu;
+  float* dPdv;
+  float* ddPdudu;
+  float* ddPdvdv;
+  float* ddPdudv;
+  unsigned int valueCount;
+};
+
+/* Interpolates vertex data to an array of u/v locations. */
+RTC_API void rtcInterpolateN(const struct RTCInterpolateNArguments* args);
+
+/* RTCGrid primitive for grid mesh */
+struct RTCGrid
+{
+  unsigned int startVertexID;
+  unsigned int stride;
+  unsigned short width,height; // max is a 32k x 32k grid
+};
+
+RTC_NAMESPACE_END
+
+
diff --git a/thirdparty/embree/include/embree3/rtcore_quaternion.h b/thirdparty/embree/include/embree3/rtcore_quaternion.h
new file mode 100644
index 0000000000..6489fa3467
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore_quaternion.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_common.h"
+
+RTC_NAMESPACE_BEGIN
+
+/*
+ * Structure for transformation respresentation as a matrix decomposition using
+ * a quaternion
+ */
+struct RTC_ALIGN(16) RTCQuaternionDecomposition
+{
+  float scale_x;
+  float scale_y;
+  float scale_z;
+  float skew_xy;
+  float skew_xz;
+  float skew_yz;
+  float shift_x;
+  float shift_y;
+  float shift_z;
+  float quaternion_r;
+  float quaternion_i;
+  float quaternion_j;
+  float quaternion_k;
+  float translation_x;
+  float translation_y;
+  float translation_z;
+};
+
+RTC_FORCEINLINE void rtcInitQuaternionDecomposition(struct RTCQuaternionDecomposition* qdecomp)
+{
+  qdecomp->scale_x = 1.f;
+  qdecomp->scale_y = 1.f;
+  qdecomp->scale_z = 1.f;
+  qdecomp->skew_xy = 0.f;
+  qdecomp->skew_xz = 0.f;
+  qdecomp->skew_yz = 0.f;
+  qdecomp->shift_x = 0.f;
+  qdecomp->shift_y = 0.f;
+  qdecomp->shift_z = 0.f;
+  qdecomp->quaternion_r = 1.f;
+  qdecomp->quaternion_i = 0.f;
+  qdecomp->quaternion_j = 0.f;
+  qdecomp->quaternion_k = 0.f;
+  qdecomp->translation_x = 0.f;
+  qdecomp->translation_y = 0.f;
+  qdecomp->translation_z = 0.f;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetQuaternion(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float r, float i, float j, float k)
+{
+  qdecomp->quaternion_r = r;
+  qdecomp->quaternion_i = i;
+  qdecomp->quaternion_j = j;
+  qdecomp->quaternion_k = k;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetScale(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float scale_x, float scale_y, float scale_z)
+{
+  qdecomp->scale_x = scale_x;
+  qdecomp->scale_y = scale_y;
+  qdecomp->scale_z = scale_z;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetSkew(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float skew_xy, float skew_xz, float skew_yz)
+{
+  qdecomp->skew_xy = skew_xy;
+  qdecomp->skew_xz = skew_xz;
+  qdecomp->skew_yz = skew_yz;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetShift(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float shift_x, float shift_y, float shift_z)
+{
+  qdecomp->shift_x = shift_x;
+  qdecomp->shift_y = shift_y;
+  qdecomp->shift_z = shift_z;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetTranslation(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float translation_x, float translation_y, float translation_z)
+{
+  qdecomp->translation_x = translation_x;
+  qdecomp->translation_y = translation_y;
+  qdecomp->translation_z = translation_z;
+}
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree/include/embree3/rtcore_ray.h b/thirdparty/embree/include/embree3/rtcore_ray.h
new file mode 100644
index 0000000000..a2ee6dabbb
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore_ray.h
@@ -0,0 +1,378 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_common.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Ray structure for a single ray */
+struct RTC_ALIGN(16) RTCRay
+{
+  float org_x;        // x coordinate of ray origin
+  float org_y;        // y coordinate of ray origin
+  float org_z;        // z coordinate of ray origin
+  float tnear;        // start of ray segment
+
+  float dir_x;        // x coordinate of ray direction
+  float dir_y;        // y coordinate of ray direction
+  float dir_z;        // z coordinate of ray direction
+  float time;         // time of this ray for motion blur
+
+  float tfar;         // end of ray segment (set to hit distance)
+  unsigned int mask;  // ray mask
+  unsigned int id;    // ray ID
+  unsigned int flags; // ray flags
+};
+
+/* Hit structure for a single ray */
+struct RTC_ALIGN(16) RTCHit
+{
+  float Ng_x;          // x coordinate of geometry normal
+  float Ng_y;          // y coordinate of geometry normal
+  float Ng_z;          // z coordinate of geometry normal
+
+  float u;             // barycentric u coordinate of hit
+  float v;             // barycentric v coordinate of hit
+
+  unsigned int primID; // primitive ID
+  unsigned int geomID; // geometry ID
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+};
+
+/* Combined ray/hit structure for a single ray */
+struct RTCRayHit
+{
+  struct RTCRay ray;
+  struct RTCHit hit;
+};
+
+/* Ray structure for a packet of 4 rays */
+struct RTC_ALIGN(16) RTCRay4
+{
+  float org_x[4];
+  float org_y[4];
+  float org_z[4];
+  float tnear[4];
+
+  float dir_x[4];
+  float dir_y[4];
+  float dir_z[4];
+  float time[4];
+
+  float tfar[4];
+  unsigned int mask[4];
+  unsigned int id[4];
+  unsigned int flags[4];
+};
+
+/* Hit structure for a packet of 4 rays */
+struct RTC_ALIGN(16) RTCHit4
+{
+  float Ng_x[4];
+  float Ng_y[4];
+  float Ng_z[4];
+
+  float u[4];
+  float v[4];
+
+  unsigned int primID[4];
+  unsigned int geomID[4];
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][4];
+};
+
+/* Combined ray/hit structure for a packet of 4 rays */
+struct RTCRayHit4
+{
+  struct RTCRay4 ray;
+  struct RTCHit4 hit;
+};
+
+/* Ray structure for a packet of 8 rays */
+struct RTC_ALIGN(32) RTCRay8
+{
+  float org_x[8];
+  float org_y[8];
+  float org_z[8];
+  float tnear[8];
+
+  float dir_x[8];
+  float dir_y[8];
+  float dir_z[8];
+  float time[8];
+
+  float tfar[8];
+  unsigned int mask[8];
+  unsigned int id[8];
+  unsigned int flags[8];
+};
+
+/* Hit structure for a packet of 8 rays */
+struct RTC_ALIGN(32) RTCHit8
+{
+  float Ng_x[8];
+  float Ng_y[8];
+  float Ng_z[8];
+
+  float u[8];
+  float v[8];
+
+  unsigned int primID[8];
+  unsigned int geomID[8];
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][8];
+};
+
+/* Combined ray/hit structure for a packet of 8 rays */
+struct RTCRayHit8
+{
+  struct RTCRay8 ray;
+  struct RTCHit8 hit;
+};
+
+/* Ray structure for a packet of 16 rays */
+struct RTC_ALIGN(64) RTCRay16
+{
+  float org_x[16];
+  float org_y[16];
+  float org_z[16];
+  float tnear[16];
+
+  float dir_x[16];
+  float dir_y[16];
+  float dir_z[16];
+  float time[16];
+
+  float tfar[16];
+  unsigned int mask[16];
+  unsigned int id[16];
+  unsigned int flags[16];
+};
+
+/* Hit structure for a packet of 16 rays */
+struct RTC_ALIGN(64) RTCHit16
+{
+  float Ng_x[16];
+  float Ng_y[16];
+  float Ng_z[16];
+
+  float u[16];
+  float v[16];
+
+  unsigned int primID[16];
+  unsigned int geomID[16];
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
+};
+
+/* Combined ray/hit structure for a packet of 16 rays */
+struct RTCRayHit16
+{
+  struct RTCRay16 ray;
+  struct RTCHit16 hit;
+};
+
+/* Ray structure for a packet/stream of N rays in pointer SOA layout */
+struct RTCRayNp
+{
+  float* org_x;
+  float* org_y;
+  float* org_z;
+  float* tnear;
+
+  float* dir_x;
+  float* dir_y;
+  float* dir_z;
+  float* time;
+
+  float* tfar;
+  unsigned int* mask;
+  unsigned int* id;
+  unsigned int* flags;
+};
+
+/* Hit structure for a packet/stream of N rays in pointer SOA layout */
+struct RTCHitNp
+{
+  float* Ng_x;
+  float* Ng_y;
+  float* Ng_z;
+
+  float* u;
+  float* v;
+
+  unsigned int* primID;
+  unsigned int* geomID;
+  unsigned int* instID[RTC_MAX_INSTANCE_LEVEL_COUNT];
+};
+
+/* Combined ray/hit structure for a packet/stream of N rays in pointer SOA layout */
+struct RTCRayHitNp
+{
+  struct RTCRayNp ray;
+  struct RTCHitNp hit;
+};
+
+struct RTCRayN;
+struct RTCHitN;
+struct RTCRayHitN;
+
+#if defined(__cplusplus)
+
+/* Helper functions to access ray packets of runtime size N */
+RTC_FORCEINLINE float& RTCRayN_org_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[0*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_org_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[1*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_org_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[2*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_tnear(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[3*N+i]; }
+
+RTC_FORCEINLINE float& RTCRayN_dir_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[4*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_dir_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[5*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_dir_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[6*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_time (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[7*N+i]; }
+
+RTC_FORCEINLINE float&        RTCRayN_tfar (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[8*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCRayN_mask (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[9*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCRayN_id   (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[10*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCRayN_flags(RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[11*N+i]; }
+
+/* Helper functions to access hit packets of runtime size N */
+RTC_FORCEINLINE float& RTCHitN_Ng_x(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[0*N+i]; }
+RTC_FORCEINLINE float& RTCHitN_Ng_y(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[1*N+i]; }
+RTC_FORCEINLINE float& RTCHitN_Ng_z(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[2*N+i]; }
+
+RTC_FORCEINLINE float& RTCHitN_u(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[3*N+i]; }
+RTC_FORCEINLINE float& RTCHitN_v(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[4*N+i]; }
+
+RTC_FORCEINLINE unsigned int& RTCHitN_primID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[5*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCHitN_geomID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[6*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCHitN_instID(RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return ((unsigned*)hit)[7*N+i+N*l]; }
+
+/* Helper functions to extract RTCRayN and RTCHitN from RTCRayHitN */
+RTC_FORCEINLINE RTCRayN* RTCRayHitN_RayN(RTCRayHitN* rayhit, unsigned int N) { return (RTCRayN*)&((float*)rayhit)[0*N]; }
+RTC_FORCEINLINE RTCHitN* RTCRayHitN_HitN(RTCRayHitN* rayhit, unsigned int N) { return (RTCHitN*)&((float*)rayhit)[12*N]; }
+
+/* Helper structure for a ray packet of compile-time size N */
+template<int N>
+struct RTCRayNt
+{
+  float org_x[N];
+  float org_y[N];
+  float org_z[N];
+  float tnear[N];
+
+  float dir_x[N];
+  float dir_y[N];
+  float dir_z[N];
+  float time[N];
+
+  float tfar[N];
+  unsigned int mask[N];
+  unsigned int id[N];
+  unsigned int flags[N];
+};
+
+/* Helper structure for a hit packet of compile-time size N */
+template<int N>
+struct RTCHitNt
+{
+  float Ng_x[N];
+  float Ng_y[N];
+  float Ng_z[N];
+
+  float u[N];
+  float v[N];
+
+  unsigned int primID[N];
+  unsigned int geomID[N];
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][N];
+};
+
+/* Helper structure for a combined ray/hit packet of compile-time size N */
+template<int N>
+struct RTCRayHitNt
+{
+  RTCRayNt<N> ray;
+  RTCHitNt<N> hit;
+};
+
+RTC_FORCEINLINE RTCRay rtcGetRayFromRayN(RTCRayN* rayN, unsigned int N, unsigned int i)
+{
+  RTCRay ray;
+  ray.org_x = RTCRayN_org_x(rayN,N,i);
+  ray.org_y = RTCRayN_org_y(rayN,N,i);
+  ray.org_z = RTCRayN_org_z(rayN,N,i);
+  ray.tnear = RTCRayN_tnear(rayN,N,i);
+  ray.dir_x = RTCRayN_dir_x(rayN,N,i);
+  ray.dir_y = RTCRayN_dir_y(rayN,N,i);
+  ray.dir_z = RTCRayN_dir_z(rayN,N,i);
+  ray.time  = RTCRayN_time(rayN,N,i);
+  ray.tfar  = RTCRayN_tfar(rayN,N,i);
+  ray.mask  = RTCRayN_mask(rayN,N,i);
+  ray.id    = RTCRayN_id(rayN,N,i);
+  ray.flags = RTCRayN_flags(rayN,N,i);
+  return ray;
+}
+
+RTC_FORCEINLINE RTCHit rtcGetHitFromHitN(RTCHitN* hitN, unsigned int N, unsigned int i)
+{
+  RTCHit hit;
+  hit.Ng_x   = RTCHitN_Ng_x(hitN,N,i);
+  hit.Ng_y   = RTCHitN_Ng_y(hitN,N,i);
+  hit.Ng_z   = RTCHitN_Ng_z(hitN,N,i);
+  hit.u      = RTCHitN_u(hitN,N,i);
+  hit.v      = RTCHitN_v(hitN,N,i);
+  hit.primID = RTCHitN_primID(hitN,N,i);
+  hit.geomID = RTCHitN_geomID(hitN,N,i);
+  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++)
+    hit.instID[l] = RTCHitN_instID(hitN,N,i,l);
+  return hit;
+}
+
+RTC_FORCEINLINE void rtcCopyHitToHitN(RTCHitN* hitN, const RTCHit* hit, unsigned int N, unsigned int i)
+{
+  RTCHitN_Ng_x(hitN,N,i)   = hit->Ng_x;
+  RTCHitN_Ng_y(hitN,N,i)   = hit->Ng_y;
+  RTCHitN_Ng_z(hitN,N,i)   = hit->Ng_z;
+  RTCHitN_u(hitN,N,i)      = hit->u;
+  RTCHitN_v(hitN,N,i)      = hit->v;
+  RTCHitN_primID(hitN,N,i) = hit->primID;
+  RTCHitN_geomID(hitN,N,i) = hit->geomID;
+  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++)
+    RTCHitN_instID(hitN,N,i,l) = hit->instID[l];
+}
+
+RTC_FORCEINLINE RTCRayHit rtcGetRayHitFromRayHitN(RTCRayHitN* rayhitN, unsigned int N, unsigned int i)
+{
+  RTCRayHit rh;
+
+  RTCRayN* ray = RTCRayHitN_RayN(rayhitN,N);
+  rh.ray.org_x = RTCRayN_org_x(ray,N,i);
+  rh.ray.org_y = RTCRayN_org_y(ray,N,i);
+  rh.ray.org_z = RTCRayN_org_z(ray,N,i);
+  rh.ray.tnear = RTCRayN_tnear(ray,N,i);
+  rh.ray.dir_x = RTCRayN_dir_x(ray,N,i);
+  rh.ray.dir_y = RTCRayN_dir_y(ray,N,i);
+  rh.ray.dir_z = RTCRayN_dir_z(ray,N,i);
+  rh.ray.time  = RTCRayN_time(ray,N,i);
+  rh.ray.tfar  = RTCRayN_tfar(ray,N,i);
+  rh.ray.mask  = RTCRayN_mask(ray,N,i);
+  rh.ray.id    = RTCRayN_id(ray,N,i);
+  rh.ray.flags = RTCRayN_flags(ray,N,i);
+
+  RTCHitN* hit  = RTCRayHitN_HitN(rayhitN,N);
+  rh.hit.Ng_x   = RTCHitN_Ng_x(hit,N,i);
+  rh.hit.Ng_y   = RTCHitN_Ng_y(hit,N,i);
+  rh.hit.Ng_z   = RTCHitN_Ng_z(hit,N,i);
+  rh.hit.u      = RTCHitN_u(hit,N,i);
+  rh.hit.v      = RTCHitN_v(hit,N,i);
+  rh.hit.primID = RTCHitN_primID(hit,N,i);
+  rh.hit.geomID = RTCHitN_geomID(hit,N,i);
+  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++)
+    rh.hit.instID[l] = RTCHitN_instID(hit,N,i,l);
+
+  return rh;
+}
+
+#endif
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree/include/embree3/rtcore_scene.h b/thirdparty/embree/include/embree3/rtcore_scene.h
new file mode 100644
index 0000000000..5878a3d402
--- /dev/null
+++ b/thirdparty/embree/include/embree3/rtcore_scene.h
@@ -0,0 +1,160 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_device.h"
+
+RTC_NAMESPACE_BEGIN
+  
+/* Forward declarations for ray structures */
+struct RTCRayHit;
+struct RTCRayHit4;
+struct RTCRayHit8;
+struct RTCRayHit16;
+struct RTCRayHitNp;
+
+/* Scene flags */
+enum RTCSceneFlags
+{
+  RTC_SCENE_FLAG_NONE                    = 0,
+  RTC_SCENE_FLAG_DYNAMIC                 = (1 << 0),
+  RTC_SCENE_FLAG_COMPACT                 = (1 << 1),
+  RTC_SCENE_FLAG_ROBUST                  = (1 << 2),
+  RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION = (1 << 3)
+};
+
+/* Creates a new scene. */
+RTC_API RTCScene rtcNewScene(RTCDevice device);
+
+/* Returns the device the scene got created in. The reference count of
+ * the device is incremented by this function. */
+RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene);
+   
+/* Retains the scene (increments the reference count). */
+RTC_API void rtcRetainScene(RTCScene scene);
+
+/* Releases the scene (decrements the reference count). */
+RTC_API void rtcReleaseScene(RTCScene scene);
+
+
+/* Attaches the geometry to a scene. */
+RTC_API unsigned int rtcAttachGeometry(RTCScene scene, RTCGeometry geometry);
+
+/* Attaches the geometry to a scene using the specified geometry ID. */
+RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigned int geomID);
+
+/* Detaches the geometry from the scene. */
+RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID);
+
+/* Gets a geometry handle from the scene. */
+RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID);
+
+
+/* Commits the scene. */
+RTC_API void rtcCommitScene(RTCScene scene);
+
+/* Commits the scene from multiple threads. */
+RTC_API void rtcJoinCommitScene(RTCScene scene);
+
+
+/* Progress monitor callback function */
+typedef bool (*RTCProgressMonitorFunction)(void* ptr, double n);
+
+/* Sets the progress monitor callback function of the scene. */
+RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunction progress, void* ptr);
+
+/* Sets the build quality of the scene. */
+RTC_API void rtcSetSceneBuildQuality(RTCScene scene, enum RTCBuildQuality quality);
+
+/* Sets the scene flags. */
+RTC_API void rtcSetSceneFlags(RTCScene scene, enum RTCSceneFlags flags);
+
+/* Returns the scene flags. */
+RTC_API enum RTCSceneFlags rtcGetSceneFlags(RTCScene scene);
+
+/* Returns the axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneBounds(RTCScene scene, struct RTCBounds* bounds_o);
+
+/* Returns the linear axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneLinearBounds(RTCScene scene, struct RTCLinearBounds* bounds_o);
+
+
+/* Perform a closest point query of the scene. */
+RTC_API bool rtcPointQuery(RTCScene scene, struct RTCPointQuery* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void* userPtr);
+
+/* Perform a closest point query with a packet of 4 points with the scene. */
+RTC_API bool rtcPointQuery4(const int* valid, RTCScene scene, struct RTCPointQuery4* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 4 points with the scene. */
+RTC_API bool rtcPointQuery8(const int* valid, RTCScene scene, struct RTCPointQuery8* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 4 points with the scene. */
+RTC_API bool rtcPointQuery16(const int* valid, RTCScene scene, struct RTCPointQuery16* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Intersects a single ray with the scene. */
+RTC_API void rtcIntersect1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit);
+
+/* Intersects a packet of 4 rays with the scene. */
+RTC_API void rtcIntersect4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit4* rayhit);
+
+/* Intersects a packet of 8 rays with the scene. */
+RTC_API void rtcIntersect8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit8* rayhit);
+
+/* Intersects a packet of 16 rays with the scene. */
+RTC_API void rtcIntersect16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit16* rayhit);
+
+/* Intersects a stream of M rays with the scene. */
+RTC_API void rtcIntersect1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit, unsigned int M, size_t byteStride);
+
+/* Intersects a stream of pointers to M rays with the scene. */
+RTC_API void rtcIntersect1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit** rayhit, unsigned int M);
+
+/* Intersects a stream of M ray packets of size N in SOA format with the scene. */
+RTC_API void rtcIntersectNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride);
+
+/* Intersects a stream of M ray packets of size N in SOA format with the scene. */
+RTC_API void rtcIntersectNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayHitNp* rayhit, unsigned int N);
+
+/* Tests a single ray for occlusion with the scene. */
+RTC_API void rtcOccluded1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray);
+
+/* Tests a packet of 4 rays for occlusion occluded with the scene. */
+RTC_API void rtcOccluded4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay4* ray);
+
+/* Tests a packet of 8 rays for occlusion with the scene. */
+RTC_API void rtcOccluded8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay8* ray);
+
+/* Tests a packet of 16 rays for occlusion with the scene. */
+RTC_API void rtcOccluded16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay16* ray);
+
+/* Tests a stream of M rays for occlusion with the scene. */
+RTC_API void rtcOccluded1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray, unsigned int M, size_t byteStride);
+
+/* Tests a stream of pointers to M rays for occlusion with the scene. */
+RTC_API void rtcOccluded1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay** ray, unsigned int M);
+
+/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */
+RTC_API void rtcOccludedNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride);
+
+/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */
+RTC_API void rtcOccludedNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayNp* ray, unsigned int N);
+
+/*! collision callback */
+struct RTCCollision { unsigned int geomID0; unsigned int primID0; unsigned int geomID1; unsigned int primID1; };
+typedef void (*RTCCollideFunc) (void* userPtr, struct RTCCollision* collisions, unsigned int num_collisions);
+
+/*! Performs collision detection of two scenes */
+RTC_API void rtcCollide (RTCScene scene0, RTCScene scene1, RTCCollideFunc callback, void* userPtr);
+ 
+#if defined(__cplusplus)
+
+/* Helper for easily combining scene flags */
+inline RTCSceneFlags operator|(RTCSceneFlags a, RTCSceneFlags b) {
+  return (RTCSceneFlags)((size_t)a | (size_t)b);
+}
+
+#endif
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_hair.h b/thirdparty/embree/kernels/builders/bvh_builder_hair.h
new file mode 100644
index 0000000000..d83e8918a1
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/bvh_builder_hair.h
@@ -0,0 +1,411 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_strand_array.h"
+
+#define NUM_HAIR_OBJECT_BINS 32
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderHair
+    {
+      /*! settings for builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {}
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        size_t finished_range_threshold;  //!< finished range threshold
+      };
+
+      template<typename NodeRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeFunc,
+        typename SetAABBNodeFunc,
+        typename CreateOBBNodeFunc,
+        typename SetOBBNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor,
+        typename ReportFinishedRangeFunc>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+          friend struct BVHBuilderHair;
+
+          typedef FastAllocator::CachedAllocator Allocator;
+          typedef HeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> HeuristicBinningSAH;
+          typedef UnalignedHeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> UnalignedHeuristicBinningSAH;
+          typedef HeuristicStrandSplit HeuristicStrandSplitSAH;
+
+          static const size_t MAX_BRANCHING_FACTOR =  8;         //!< maximum supported BVH branching factor
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+          static const size_t SINGLE_THREADED_THRESHOLD = 4096;  //!< threshold to switch to single threaded build
+
+          static const size_t travCostAligned = 1;
+          static const size_t travCostUnaligned = 5;
+          static const size_t intCost = 6;
+
+          BuilderT (Scene* scene,
+                    PrimRef* prims,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateAABBNodeFunc& createAABBNode,
+                    const SetAABBNodeFunc& setAABBNode,
+                    const CreateOBBNodeFunc& createOBBNode,
+                    const SetOBBNodeFunc& setOBBNode,
+                    const CreateLeafFunc& createLeaf,
+                    const ProgressMonitor& progressMonitor,
+                    const ReportFinishedRangeFunc& reportFinishedRange,
+                    const Settings settings)
+
+            : cfg(settings),
+            prims(prims),
+            createAlloc(createAlloc),
+            createAABBNode(createAABBNode),
+            setAABBNode(setAABBNode),
+            createOBBNode(createOBBNode),
+            setOBBNode(setOBBNode),
+            createLeaf(createLeaf),
+            progressMonitor(progressMonitor),
+            reportFinishedRange(reportFinishedRange),
+            alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {}
+
+          /*! checks if all primitives are from the same geometry */
+          __forceinline bool sameGeometry(const PrimInfoRange& range)
+          {
+            if (range.size() == 0) return true;
+            unsigned int firstGeomID = prims[range.begin()].geomID();
+            for (size_t i=range.begin()+1; i<range.end(); i++) {
+              if (prims[i].geomID() != firstGeomID){
+                return false;
+              }
+            }
+            return true;
+          }
+
+          /*! creates a large leaf that could be larger than supported by the BVH */
+          NodeRef createLargeLeaf(size_t depth, const PrimInfoRange& pinfo, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* create leaf for few primitives */
+            if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo))
+              return createLeaf(prims,pinfo,alloc);
+
+            /* fill all children by always splitting the largest one */
+            PrimInfoRange children[MAX_BRANCHING_FACTOR];
+            unsigned numChildren = 1;
+            children[0] = pinfo;
+
+            do {
+
+              /* find best child with largest bounding box area */
+              int bestChild = -1;
+              size_t bestSize = 0;
+              for (unsigned i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i]))
+                  continue;
+
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              __aligned(64) PrimInfoRange left, right;
+              if (!sameGeometry(children[bestChild])) {
+                alignedHeuristic.splitByGeometry(children[bestChild],left,right);
+              } else {
+                alignedHeuristic.splitFallback(children[bestChild],left,right);
+              }
+
+              /* add new children left and right */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            /* create node */
+            auto node = createAABBNode(alloc);
+
+            for (size_t i=0; i<numChildren; i++) {
+              const NodeRef child = createLargeLeaf(depth+1,children[i],alloc);
+              setAABBNode(node,i,child,children[i].geomBounds);
+            }
+
+            return node;
+          }
+
+          /*! performs split */
+          __noinline void split(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo, bool& aligned) // FIXME: not inlined as ICC otherwise uses much stack
+          {
+            /* variable to track the SAH of the best splitting approach */
+            float bestSAH = inf;
+            const size_t blocks = (pinfo.size()+(1ull<<cfg.logBlockSize)-1ull) >> cfg.logBlockSize;
+            const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds);
+
+            /* try standard binning in aligned space */
+            float alignedObjectSAH = inf;
+            HeuristicBinningSAH::Split alignedObjectSplit;
+            if (aligned) {
+              alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize);
+              alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH();
+              bestSAH = min(alignedObjectSAH,bestSAH);
+            }
+
+            /* try standard binning in unaligned space */
+            UnalignedHeuristicBinningSAH::Split unalignedObjectSplit;
+            LinearSpace3fa uspace;
+            float unalignedObjectSAH = inf;
+            if (bestSAH > 0.7f*leafSAH) {
+              uspace = unalignedHeuristic.computeAlignedSpace(pinfo);
+              const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace);
+              unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace);
+              unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH();
+              bestSAH = min(unalignedObjectSAH,bestSAH);
+            }
+
+            /* try splitting into two strands */
+            HeuristicStrandSplitSAH::Split strandSplit;
+            float strandSAH = inf;
+            if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) {
+              strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize);
+              strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH();
+              bestSAH = min(strandSAH,bestSAH);
+            }
+
+            /* fallback if SAH heuristics failed */
+            if (unlikely(!std::isfinite(bestSAH)))
+            {
+              alignedHeuristic.deterministic_order(pinfo);
+              alignedHeuristic.splitFallback(pinfo,linfo,rinfo);
+            }
+
+            /* perform aligned split if this is best */
+            else if (bestSAH == alignedObjectSAH) {
+              alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo);
+            }
+
+            /* perform unaligned split if this is best */
+            else if (bestSAH == unalignedObjectSAH) {
+              unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo);
+              aligned = false;
+            }
+
+            /* perform strand split if this is best */
+            else if (bestSAH == strandSAH) {
+              strandHeuristic.split(strandSplit,pinfo,linfo,rinfo);
+              aligned = false;
+            }
+
+            /* can never happen */
+            else
+              assert(false);
+          }
+
+          /*! recursive build */
+          NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress */
+            if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD)
+              progressMonitor(pinfo.size());
+
+            PrimInfoRange children[MAX_BRANCHING_FACTOR];
+
+            /* create leaf node */
+            if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) {
+              alignedHeuristic.deterministic_order(pinfo);
+              return createLargeLeaf(depth,pinfo,alloc);
+            }
+
+            /* fill all children by always splitting the one with the largest surface area */
+            size_t numChildren = 1;
+            children[0] = pinfo;
+            bool aligned = true;
+
+            do {
+
+              /* find best child with largest bounding box area */
+              ssize_t bestChild = -1;
+              float bestArea = neg_inf;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.minLeafSize)
+                  continue;
+
+                /* remember child with largest area */
+                if (area(children[i].geomBounds) > bestArea) {
+                  bestArea = area(children[i].geomBounds);
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              PrimInfoRange left, right;
+              split(children[bestChild],left,right,aligned);
+
+              /* add new children left and right */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            NodeRef node;
+
+            /* create aligned node */
+            if (aligned)
+            {
+              node = createAABBNode(alloc);
+
+              /* spawn tasks or ... */
+              if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                      setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else {
+                for (size_t i=0; i<numChildren; i++) {
+                  const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                  setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds);
+                }
+              }
+            }
+
+            /* create unaligned node */
+            else
+            {
+              node = createOBBNode(alloc);
+
+              /* spawn tasks or ... */
+              if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+                      const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+                      const OBBox3fa obounds(space,sinfo.geomBounds);
+                      const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                      setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else
+              {
+                for (size_t i=0; i<numChildren; i++) {
+                  const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+                  const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+                  const OBBox3fa obounds(space,sinfo.geomBounds);
+                  const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                  setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds);
+                }
+              }
+            }
+
+            /* reports a finished range of primrefs */
+            if (unlikely(alloc_barrier))
+              reportFinishedRange(pinfo);
+
+            return node;
+          }
+
+        private:
+          Settings cfg;
+          PrimRef* prims;
+          const CreateAllocFunc& createAlloc;
+          const CreateAABBNodeFunc& createAABBNode;
+          const SetAABBNodeFunc& setAABBNode;
+          const CreateOBBNodeFunc& createOBBNode;
+          const SetOBBNodeFunc& setOBBNode;
+          const CreateLeafFunc& createLeaf;
+          const ProgressMonitor& progressMonitor;
+          const ReportFinishedRangeFunc& reportFinishedRange;
+
+        private:
+          HeuristicBinningSAH alignedHeuristic;
+          UnalignedHeuristicBinningSAH unalignedHeuristic;
+          HeuristicStrandSplitSAH strandHeuristic;
+        };
+
+      template<typename NodeRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeFunc,
+        typename SetAABBNodeFunc,
+        typename CreateOBBNodeFunc,
+        typename SetOBBNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor,
+        typename ReportFinishedRangeFunc>
+
+        static NodeRef build (const CreateAllocFunc& createAlloc,
+                              const CreateAABBNodeFunc& createAABBNode,
+                              const SetAABBNodeFunc& setAABBNode,
+                              const CreateOBBNodeFunc& createOBBNode,
+                              const SetOBBNodeFunc& setOBBNode,
+                              const CreateLeafFunc& createLeaf,
+                              const ProgressMonitor& progressMonitor,
+                              const ReportFinishedRangeFunc& reportFinishedRange,
+                              Scene* scene,
+                              PrimRef* prims,
+                              const PrimInfo& pinfo,
+                              const Settings settings)
+        {
+          typedef BuilderT<NodeRef,
+            CreateAllocFunc,
+            CreateAABBNodeFunc,SetAABBNodeFunc,
+            CreateOBBNodeFunc,SetOBBNodeFunc,
+            CreateLeafFunc,ProgressMonitor,
+            ReportFinishedRangeFunc> Builder;
+
+          Builder builder(scene,prims,createAlloc,
+                          createAABBNode,setAABBNode,
+                          createOBBNode,setOBBNode,
+                          createLeaf,progressMonitor,reportFinishedRange,settings);
+
+          NodeRef root = builder.recurse(1,pinfo,nullptr,true,false);
+          _mm_mfence(); // to allow non-temporal stores during build
+          return root;
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_morton.h b/thirdparty/embree/kernels/builders/bvh_builder_morton.h
new file mode 100644
index 0000000000..8f21e3254f
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/bvh_builder_morton.h
@@ -0,0 +1,501 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/builder.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderMorton
+    {
+      static const size_t MAX_BRANCHING_FACTOR = 8;          //!< maximum supported BVH branching factor
+      static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree of we are that many levels before the maximum tree depth
+
+      /*! settings for morton builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
+
+        /*! initialize settings from API settings */
+        Settings (const RTCBuildArguments& settings)
+        : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
+        {
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth          )) maxDepth        = settings.maxDepth;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize       )) minLeafSize     = settings.minLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize       )) maxLeafSize     = settings.maxLeafSize;
+
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+        Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
+        : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
+        {
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+      };
+
+      /*! Build primitive consisting of morton code and primitive ID. */
+      struct __aligned(8) BuildPrim
+      {
+        union {
+          struct {
+            unsigned int code;     //!< morton code
+            unsigned int index;    //!< i'th primitive
+          };
+          uint64_t t;
+        };
+
+        /*! interface for radix sort */
+        __forceinline operator unsigned() const { return code; }
+
+        /*! interface for standard sort */
+        __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
+      };
+
+      /*! maps bounding box to morton code */
+      struct MortonCodeMapping
+      {
+        static const size_t LATTICE_BITS_PER_DIM = 10;
+        static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
+
+        vfloat4 base;
+        vfloat4 scale;
+
+        __forceinline MortonCodeMapping(const BBox3fa& bounds)
+        {
+          base  = (vfloat4)bounds.lower;
+          const vfloat4 diag  = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
+          scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));
+        }
+
+        __forceinline const vint4 bin (const BBox3fa& box) const
+        {
+          const vfloat4 lower = (vfloat4)box.lower;
+          const vfloat4 upper = (vfloat4)box.upper;
+          const vfloat4 centroid = lower+upper;
+          return vint4((centroid-base)*scale);
+        }
+
+        __forceinline unsigned int code (const BBox3fa& box) const
+        {
+          const vint4 binID = bin(box);
+          const unsigned int x = extract<0>(binID);
+          const unsigned int y = extract<1>(binID);
+          const unsigned int z = extract<2>(binID);
+          const unsigned int xyz = bitInterleave(x,y,z);
+          return xyz;
+        }
+      };
+
+#if defined (__AVX2__)
+
+      /*! for AVX2 there is a fast scalar bitInterleave */
+      struct MortonCodeGenerator
+      {
+        __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+          : mapping(mapping), dest(dest) {}
+
+        __forceinline void operator() (const BBox3fa& b, const unsigned index)
+        {
+          dest->index = index;
+          dest->code = mapping.code(b);
+          dest++;
+        }
+
+      public:
+        const MortonCodeMapping mapping;
+        BuildPrim* dest;
+        size_t currentID;
+      };
+
+#else
+
+      /*! before AVX2 is it better to use the SSE version of bitInterleave */
+      struct MortonCodeGenerator
+      {
+        __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+          : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
+
+        __forceinline ~MortonCodeGenerator()
+        {
+          if (slots != 0)
+          {
+            const vint4 code = bitInterleave(ax,ay,az);
+            for (size_t i=0; i<slots; i++) {
+              dest[currentID-slots+i].index = ai[i];
+              dest[currentID-slots+i].code = code[i];
+            }
+          }
+        }
+
+        __forceinline void operator() (const BBox3fa& b, const unsigned index)
+        {
+          const vint4 binID = mapping.bin(b);
+          ax[slots] = extract<0>(binID);
+          ay[slots] = extract<1>(binID);
+          az[slots] = extract<2>(binID);
+          ai[slots] = index;
+          slots++;
+          currentID++;
+
+          if (slots == 4)
+          {
+            const vint4 code = bitInterleave(ax,ay,az);
+            vint4::storeu(&dest[currentID-4],unpacklo(code,ai));
+            vint4::storeu(&dest[currentID-2],unpackhi(code,ai));
+            slots = 0;
+          }
+        }
+
+      public:
+        const MortonCodeMapping mapping;
+        BuildPrim* dest;
+        size_t currentID;
+        size_t slots;
+        vint4 ax, ay, az, ai;
+      };
+
+#endif
+
+      template<
+        typename ReductionTy,
+        typename Allocator,
+        typename CreateAllocator,
+        typename CreateNodeFunc,
+        typename SetNodeBoundsFunc,
+        typename CreateLeafFunc,
+        typename CalculateBounds,
+        typename ProgressMonitor>
+
+        class BuilderT : private Settings
+      {
+        ALIGNED_CLASS_(16);
+
+      public:
+
+        BuilderT (CreateAllocator& createAllocator,
+                  CreateNodeFunc& createNode,
+                  SetNodeBoundsFunc& setBounds,
+                  CreateLeafFunc& createLeaf,
+                  CalculateBounds& calculateBounds,
+                  ProgressMonitor& progressMonitor,
+                  const Settings& settings)
+
+          : Settings(settings),
+          createAllocator(createAllocator),
+          createNode(createNode),
+          setBounds(setBounds),
+          createLeaf(createLeaf),
+          calculateBounds(calculateBounds),
+          progressMonitor(progressMonitor),
+          morton(nullptr) {}
+
+        ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
+        {
+          /* this should never occur but is a fatal error */
+          if (depth > maxDepth)
+            throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+          /* create leaf for few primitives */
+          if (current.size() <= maxLeafSize)
+            return createLeaf(current,alloc);
+
+          /* fill all children by always splitting the largest one */
+          range<unsigned> children[MAX_BRANCHING_FACTOR];
+          size_t numChildren = 1;
+          children[0] = current;
+
+          do {
+
+            /* find best child with largest number of primitives */
+            size_t bestChild = -1;
+            size_t bestSize = 0;
+            for (size_t i=0; i<numChildren; i++)
+            {
+              /* ignore leaves as they cannot get split */
+              if (children[i].size() <= maxLeafSize)
+                continue;
+
+              /* remember child with largest size */
+              if (children[i].size() > bestSize) {
+                bestSize = children[i].size();
+                bestChild = i;
+              }
+            }
+            if (bestChild == size_t(-1)) break;
+
+            /*! split best child into left and right child */
+            auto split = children[bestChild].split();
+
+            /* add new children left and right */
+            children[bestChild] = children[numChildren-1];
+            children[numChildren-1] = split.first;
+            children[numChildren+0] = split.second;
+            numChildren++;
+
+          } while (numChildren < branchingFactor);
+
+          /* create node */
+          auto node = createNode(alloc,numChildren);
+
+          /* recurse into each child */
+          ReductionTy bounds[MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<numChildren; i++)
+            bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
+
+          return setBounds(node,bounds,numChildren);
+        }
+
+        /*! recreates morton codes when reaching a region where all codes are identical */
+        __noinline void recreateMortonCodes(const range<unsigned>& current) const
+        {
+          /* fast path for small ranges */
+          if (likely(current.size() < 1024))
+          {
+            /*! recalculate centroid bounds */
+            BBox3fa centBounds(empty);
+            for (size_t i=current.begin(); i<current.end(); i++)
+              centBounds.extend(center2(calculateBounds(morton[i])));
+
+            /* recalculate morton codes */
+            MortonCodeMapping mapping(centBounds);
+            for (size_t i=current.begin(); i<current.end(); i++)
+              morton[i].code = mapping.code(calculateBounds(morton[i]));
+
+            /* sort morton codes */
+            std::sort(morton+current.begin(),morton+current.end());
+          }
+          else
+          {
+            /*! recalculate centroid bounds */
+            auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
+              BBox3fa centBounds = empty;
+              for (size_t i=r.begin(); i<r.end(); i++)
+                centBounds.extend(center2(calculateBounds(morton[i])));
+              return centBounds;
+            };
+            const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
+                                                       BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
+
+            /* recalculate morton codes */
+            MortonCodeMapping mapping(centBounds);
+            parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
+                for (size_t i=r.begin(); i<r.end(); i++) {
+                  morton[i].code = mapping.code(calculateBounds(morton[i]));
+                }
+              });
+
+            /*! sort morton codes */
+#if defined(TASKING_TBB)
+            tbb::parallel_sort(morton+current.begin(),morton+current.end());
+#else
+            radixsort32(morton+current.begin(),current.size());
+#endif
+          }
+        }
+
+        __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
+        {
+          const unsigned int code_start = morton[current.begin()].code;
+          const unsigned int code_end   = morton[current.end()-1].code;
+          unsigned int bitpos = lzcnt(code_start^code_end);
+
+          /* if all items mapped to same morton code, then re-create new morton codes for the items */
+          if (unlikely(bitpos == 32))
+          {
+            recreateMortonCodes(current);
+            const unsigned int code_start = morton[current.begin()].code;
+            const unsigned int code_end   = morton[current.end()-1].code;
+            bitpos = lzcnt(code_start^code_end);
+
+            /* if the morton code is still the same, goto fall back split */
+            if (unlikely(bitpos == 32)) {
+              current.split(left,right);
+              return;
+            }
+          }
+
+          /* split the items at the topmost different morton code bit */
+          const unsigned int bitpos_diff = 31-bitpos;
+          const unsigned int bitmask = 1 << bitpos_diff;
+
+          /* find location where bit differs using binary search */
+          unsigned begin = current.begin();
+          unsigned end   = current.end();
+          while (begin + 1 != end) {
+            const unsigned mid = (begin+end)/2;
+            const unsigned bit = morton[mid].code & bitmask;
+            if (bit == 0) begin = mid; else end = mid;
+          }
+          unsigned center = end;
+#if defined(DEBUG)
+          for (unsigned int i=begin;  i<center; i++) assert((morton[i].code & bitmask) == 0);
+          for (unsigned int i=center; i<end;    i++) assert((morton[i].code & bitmask) == bitmask);
+#endif
+
+          left = make_range(current.begin(),center);
+          right = make_range(center,current.end());
+        }
+
+        ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
+        {
+          /* get thread local allocator */
+          if (!alloc)
+            alloc = createAllocator();
+
+          /* call memory monitor function to signal progress */
+          if (toplevel && current.size() <= singleThreadThreshold)
+            progressMonitor(current.size());
+
+          /* create leaf node */
+          if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
+            return createLargeLeaf(depth,current,alloc);
+
+          /* fill all children by always splitting the one with the largest surface area */
+          range<unsigned> children[MAX_BRANCHING_FACTOR];
+          split(current,children[0],children[1]);
+          size_t numChildren = 2;
+
+          while (numChildren < branchingFactor)
+          {
+            /* find best child with largest number of primitives */
+            int bestChild = -1;
+            unsigned bestItems = 0;
+            for (unsigned int i=0; i<numChildren; i++)
+            {
+              /* ignore leaves as they cannot get split */
+              if (children[i].size() <= minLeafSize)
+                continue;
+
+              /* remember child with largest area */
+              if (children[i].size() > bestItems) {
+                bestItems = children[i].size();
+                bestChild = i;
+              }
+            }
+            if (bestChild == -1) break;
+
+            /*! split best child into left and right child */
+            range<unsigned> left, right;
+            split(children[bestChild],left,right);
+
+            /* add new children left and right */
+            children[bestChild] = children[numChildren-1];
+            children[numChildren-1] = left;
+            children[numChildren+0] = right;
+            numChildren++;
+          }
+
+          /* create leaf node if no split is possible */
+          if (unlikely(numChildren == 1))
+            return createLeaf(current,alloc);
+
+          /* allocate node */
+          auto node = createNode(alloc,numChildren);
+
+          /* process top parts of tree parallel */
+          ReductionTy bounds[MAX_BRANCHING_FACTOR];
+          if (current.size() > singleThreadThreshold)
+          {
+            /*! parallel_for is faster than spawing sub-tasks */
+            parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                for (size_t i=r.begin(); i<r.end(); i++) {
+                  bounds[i] = recurse(depth+1,children[i],nullptr,true);
+                  _mm_mfence(); // to allow non-temporal stores during build
+                }
+              });
+          }
+
+          /* finish tree sequentially */
+          else
+          {
+            for (size_t i=0; i<numChildren; i++)
+              bounds[i] = recurse(depth+1,children[i],alloc,false);
+          }
+
+          return setBounds(node,bounds,numChildren);
+        }
+
+        /* build function */
+        ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
+        {
+          /* sort morton codes */
+          morton = src;
+          radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
+
+          /* build BVH */
+          const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
+          _mm_mfence(); // to allow non-temporal stores during build
+          return root;
+        }
+
+      public:
+        CreateAllocator& createAllocator;
+        CreateNodeFunc& createNode;
+        SetNodeBoundsFunc& setBounds;
+        CreateLeafFunc& createLeaf;
+        CalculateBounds& calculateBounds;
+        ProgressMonitor& progressMonitor;
+
+      public:
+        BuildPrim* morton;
+      };
+
+
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetBoundsFunc,
+        typename CreateLeafFunc,
+        typename CalculateBoundsFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAllocator,
+                                 CreateNodeFunc createNode,
+                                 SetBoundsFunc setBounds,
+                                 CreateLeafFunc createLeaf,
+                                 CalculateBoundsFunc calculateBounds,
+                                 ProgressMonitor progressMonitor,
+                                 BuildPrim* src,
+                                 BuildPrim* tmp,
+                                 size_t numPrimitives,
+                                 const Settings& settings)
+        {
+          typedef BuilderT<
+            ReductionTy,
+            decltype(createAllocator()),
+            CreateAllocFunc,
+            CreateNodeFunc,
+            SetBoundsFunc,
+            CreateLeafFunc,
+            CalculateBoundsFunc,
+            ProgressMonitor> Builder;
+
+          Builder builder(createAllocator,
+                          createNode,
+                          setBounds,
+                          createLeaf,
+                          calculateBounds,
+                          progressMonitor,
+                          settings);
+
+          return builder.build(src,tmp,numPrimitives);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
new file mode 100644
index 0000000000..f9a08d65cd
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
@@ -0,0 +1,692 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define MBLUR_NUM_TEMPORAL_BINS 2
+#define MBLUR_NUM_OBJECT_BINS   32
+
+#include "../bvh/bvh.h"
+#include "../common/primref_mb.h"
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_timesplit_array.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename T>
+      struct SharedVector
+      {
+        __forceinline SharedVector() {}
+
+        __forceinline SharedVector(T* ptr, size_t refCount = 1)
+          : prims(ptr), refCount(refCount) {}
+
+        __forceinline void incRef() {
+          refCount++;
+        }
+
+        __forceinline void decRef()
+        {
+          if (--refCount == 0)
+            delete prims;
+        }
+
+        T* prims;
+        size_t refCount;
+      };
+
+    template<typename BuildRecord, int MAX_BRANCHING_FACTOR>
+      struct LocalChildListT
+      {
+        typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+
+        __forceinline LocalChildListT (const BuildRecord& record)
+          : numChildren(1), numSharedPrimVecs(1)
+        {
+          /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */
+          children[0] = record;
+          primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2);
+        }
+
+        __forceinline ~LocalChildListT()
+        {
+          for (size_t i = 0; i < numChildren; i++)
+            primvecs[i]->decRef();
+        }
+
+        __forceinline BuildRecord& operator[] ( const size_t i ) {
+          return children[i];
+        }
+
+        __forceinline size_t size() const {
+          return numChildren;
+        }
+
+        __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr<mvector<PrimRefMB>> new_vector)
+        {
+          SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild];
+          if (lrecord.prims.prims == bsharedPrimVec->prims) {
+            primvecs[bestChild] = bsharedPrimVec;
+            bsharedPrimVec->incRef();
+          }
+          else {
+            primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims);
+          }
+
+          if (rrecord.prims.prims == bsharedPrimVec->prims) {
+            primvecs[numChildren] = bsharedPrimVec;
+            bsharedPrimVec->incRef();
+          }
+          else {
+            primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims);
+          }
+          bsharedPrimVec->decRef();
+          new_vector.release();
+
+          children[bestChild] = lrecord;
+          children[numChildren] = rrecord;
+          numChildren++;
+        }
+
+      public:
+        array_t<BuildRecord,MAX_BRANCHING_FACTOR> children;
+        array_t<SharedPrimRefVector*,MAX_BRANCHING_FACTOR> primvecs;
+        size_t numChildren;
+
+        array_t<SharedPrimRefVector,2*MAX_BRANCHING_FACTOR> sharedPrimVecs;
+        size_t numSharedPrimVecs;
+      };
+
+    template<typename Mesh>
+      struct RecalculatePrimRef
+      {
+        Scene* scene;
+
+        __forceinline RecalculatePrimRef (Scene* scene)
+          : scene(scene) {}
+
+        __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+        {
+          const unsigned geomID = prim.geomID();
+          const unsigned primID = prim.primID();
+          const Mesh* mesh = scene->get<Mesh>(geomID);
+          const LBBox3fa lbounds = mesh->linearBounds(primID, time_range);
+          const range<int> tbounds = mesh->timeSegmentRange(time_range);
+          return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+        }
+
+        // __noinline is workaround for ICC16 bug under MacOSX
+        __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const
+        {
+          const unsigned geomID = prim.geomID();
+          const unsigned primID = prim.primID();
+          const Mesh* mesh = scene->get<Mesh>(geomID);
+          const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range);
+          const range<int> tbounds = mesh->timeSegmentRange(time_range);
+          return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+        }
+
+        __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+          return scene->get<Mesh>(prim.geomID())->linearBounds(prim.primID(), time_range);
+        }
+
+        // __noinline is workaround for ICC16 bug under MacOSX
+        __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
+          return scene->get<Mesh>(prim.geomID())->linearBounds(space, prim.primID(), time_range);
+        }
+      };
+
+    struct VirtualRecalculatePrimRef
+    {
+      Scene* scene;
+      
+      __forceinline VirtualRecalculatePrimRef (Scene* scene)
+        : scene(scene) {}
+      
+      __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+      {
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const Geometry* mesh = scene->get(geomID);
+        const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range);
+        const range<int> tbounds = mesh->timeSegmentRange(time_range);
+        return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+      }
+      
+      __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const
+      {
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const Geometry* mesh = scene->get(geomID);
+        const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range);
+        const range<int> tbounds = mesh->timeSegmentRange(time_range);
+        return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+      }
+      
+      __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+        return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range);
+      }
+      
+      __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
+        return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range);
+      }
+    };
+
+    struct BVHBuilderMSMBlur
+    {
+      /*! settings for msmblur builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8),
+          travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false),
+          singleThreadThreshold(1024) {}
+
+
+        Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold)
+        : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+          travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold)
+        {
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        float travCost;          //!< estimated cost of one traversal step
+        float intCost;           //!< estimated cost of one primitive intersection
+        bool singleLeafTimeSegment; //!< split time to single time range
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+      };
+
+      struct BuildRecord
+      {
+      public:
+	__forceinline BuildRecord () {}
+
+        __forceinline BuildRecord (size_t depth)
+          : depth(depth) {}
+
+        __forceinline BuildRecord (const SetMB& prims, size_t depth)
+          : depth(depth), prims(prims) {}
+
+        __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) {
+          return a.prims.size() < b.prims.size();
+        }
+
+        __forceinline size_t size() const {
+          return prims.size();
+        }
+
+      public:
+	size_t depth;                     //!< Depth of the root of this subtree.
+	SetMB prims;                      //!< The list of primitives.
+      };
+
+      struct BuildRecordSplit : public BuildRecord
+      {
+        __forceinline BuildRecordSplit () {}
+
+        __forceinline BuildRecordSplit (size_t depth) 
+          : BuildRecord(depth) {}
+
+        __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit<MBLUR_NUM_OBJECT_BINS>& split)
+          : BuildRecord(record), split(split) {}
+        
+        BinSplit<MBLUR_NUM_OBJECT_BINS> split;
+      };
+
+      template<
+        typename NodeRef,
+        typename RecalculatePrimRef,
+        typename Allocator,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+          static const size_t MAX_BRANCHING_FACTOR = 16;       //!< maximum supported BVH branching factor	  
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;        //!< create balanced tree if we are that many levels before the maximum tree depth
+
+          typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+          typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+          typedef mvector<PrimRefMB>* PrimRefVector;
+          typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+          typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+          typedef LocalChildListT<BuildRecordSplit,MAX_BRANCHING_FACTOR> LocalChildListSplit;
+
+        public:
+
+          BuilderT (MemoryMonitorInterface* device,
+                    const RecalculatePrimRef recalculatePrimRef,
+                    const CreateAllocFunc createAlloc,
+                    const CreateNodeFunc createNode,
+                    const SetNodeFunc setNode,
+                    const CreateLeafFunc createLeaf,
+                    const ProgressMonitor progressMonitor,
+                    const Settings& settings)
+            : cfg(settings),
+            heuristicObjectSplit(),
+            heuristicTemporalSplit(device, recalculatePrimRef),
+            recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf),
+            progressMonitor(progressMonitor)
+          {
+            if (cfg.branchingFactor > MAX_BRANCHING_FACTOR)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+          }
+
+          /*! finds the best split */
+          const Split find(const SetMB& set)
+          {
+            /* first try standard object split */
+            const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize);
+            const float object_split_sah = object_split.splitSAH();
+
+            /* test temporal splits only when object split was bad */
+            const float leaf_sah = set.leafSAH(cfg.logBlockSize);
+            if (object_split_sah < 0.50f*leaf_sah)
+              return object_split;
+
+            /* do temporal splits only if the time range is big enough */
+            if (set.time_range.size() > 1.01f/float(set.max_num_time_segments))
+            {
+              const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize);
+              const float temporal_split_sah = temporal_split.splitSAH();
+
+              /* take temporal split if it improved SAH */
+              if (temporal_split_sah < object_split_sah)
+                return temporal_split;
+            }
+
+            return object_split;
+          }
+
+          /*! array partitioning */
+          __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            /* perform object split */
+            if (likely(split.data == Split::SPLIT_OBJECT)) {
+              heuristicObjectSplit.split(split,set,lset,rset);
+            }
+            /* perform temporal split */
+            else if (likely(split.data == Split::SPLIT_TEMPORAL)) {
+              return heuristicTemporalSplit.split(split,set,lset,rset);
+            }
+            /* perform fallback split */
+            else if (unlikely(split.data == Split::SPLIT_FALLBACK)) {
+              set.deterministic_order();
+              splitFallback(set,lset,rset);
+            }
+            /* split by geometry */
+            else if (unlikely(split.data == Split::SPLIT_GEOMID)) {
+              set.deterministic_order();
+              splitByGeometry(set,lset,rset);
+            }
+            else
+              assert(false);
+
+            return std::unique_ptr<mvector<PrimRefMB>>();
+          }
+
+          /*! finds the best fallback split */
+          __noinline Split findFallback(const SetMB& set)
+          {
+            /* split if primitives are not from same geometry */
+            if (!sameGeometry(set))
+              return Split(0.0f,Split::SPLIT_GEOMID);
+            
+            /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */
+            if (cfg.singleLeafTimeSegment)
+            {
+              /* test if one primitive has more than one time segment in time range, if so split time */
+              for (size_t i=set.begin(); i<set.end(); i++)
+              {
+                const PrimRefMB& prim = (*set.prims)[i];
+                const range<int> itime_range = prim.timeSegmentRange(set.time_range);
+                const int localTimeSegments = itime_range.size();
+                assert(localTimeSegments > 0);
+                if (localTimeSegments > 1) {
+                  const int icenter = (itime_range.begin() + itime_range.end())/2;
+                  const float splitTime = prim.timeStep(icenter);
+                  return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime);
+                }
+              }
+            }        
+
+            /* otherwise return fallback split */
+            return Split(0.0f,Split::SPLIT_FALLBACK);
+          }
+
+          /*! performs fallback split */
+          void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            mvector<PrimRefMB>& prims = *set.prims;
+
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            const size_t center = (begin + end)/2;
+
+            PrimInfoMB linfo = empty;
+            for (size_t i=begin; i<center; i++)
+              linfo.add_primref(prims[i]);
+
+            PrimInfoMB rinfo = empty;
+            for (size_t i=center; i<end; i++)
+              rinfo.add_primref(prims[i]);
+
+            new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range);
+            new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          /*! checks if all primitives are from the same geometry */
+          __forceinline bool sameGeometry(const SetMB& set)
+          {
+            if (set.size() == 0) return true;
+            mvector<PrimRefMB>& prims = *set.prims;
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            unsigned int firstGeomID = prims[begin].geomID();
+            for (size_t i=begin+1; i<end; i++) {
+              if (prims[i].geomID() != firstGeomID){
+                return false;
+              }
+            }
+            return true;
+          }
+
+          /* split by geometry ID */
+          void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            assert(set.size() > 1);
+
+            mvector<PrimRefMB>& prims = *set.prims;
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            
+            PrimInfoMB left(empty);
+            PrimInfoMB right(empty);
+            unsigned int geomID = prims[begin].geomID();
+            size_t center = serial_partitioning(prims.data(),begin,end,left,right,
+                                                [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; },
+                                                [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); });
+            
+            new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range);
+            new (&rset) SetMB(right,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (in.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* replace already found split by fallback split */
+            const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims));
+
+            /* special case when directly creating leaf without any splits that could shrink time_range */
+            bool force_split = false;
+            if (current.depth == 1 && current.size() > 0)
+            {
+              BBox1f c = empty;
+              BBox1f p = current.prims.time_range;
+              for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+                mvector<PrimRefMB>& prims = *current.prims.prims;
+                c.extend(prims[i].time_range);
+              }
+              
+              force_split = c.lower > p.lower || c.upper < p.upper;
+            }
+	    
+            /* create leaf for few primitives */
+            if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split)
+              return createLeaf(current,alloc);
+	  
+            /* fill all children by always splitting the largest one */
+            bool hasTimeSplits = false;
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildListSplit children(current);
+
+            do {
+              /* find best child with largest bounding box area */
+              size_t bestChild = -1;
+              size_t bestSize = 0;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && children[i].split.data < Split::SPLIT_ENFORCE && !force_split)
+                  continue;
+
+                force_split = false;
+                
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform best found split */
+              BuildRecordSplit& brecord = children[bestChild];
+              BuildRecordSplit lrecord(current.depth+1);
+              BuildRecordSplit rrecord(current.depth+1);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims);
+              hasTimeSplits |= new_vector != nullptr;
+
+              /* find new splits */
+              lrecord.split = findFallback(lrecord.prims);
+              rrecord.split = findFallback(rrecord.prims);
+              children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+
+            } while (children.size() < cfg.branchingFactor);
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = in.prims.time_range;
+              hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create node */
+            auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits);
+
+            /* recurse into each child and perform reduction */
+            LBBox3fa gbounds = empty;
+            for (size_t i=0; i<children.size(); i++) {
+              values[i] = createLargeLeaf(children[i],alloc);
+              gbounds.extend(values[i].lbounds);
+            }
+
+            setNode(current,children.children.data(),node,values,children.numChildren);
+
+            /* calculate geometry bounds of this node */
+            if (hasTimeSplits)
+              return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+            else
+              return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+          }
+
+          const NodeRecordMB4D recurse(const BuildRecord& current, Allocator alloc, bool toplevel)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress */
+            if (toplevel && current.size() <= cfg.singleThreadThreshold)
+              progressMonitor(current.size());
+
+            /*! find best split */
+            const Split csplit = find(current.prims);
+
+            /*! compute leaf and split cost */
+            const float leafSAH  = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+            const float splitSAH = cfg.travCost*current.prims.halfArea()+cfg.intCost*csplit.splitSAH();
+            assert((current.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+            /*! create a leaf node when threshold reached or SAH tells us to stop */
+            if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+              current.prims.deterministic_order();
+              return createLargeLeaf(current,alloc);
+            }
+
+            /*! perform initial split */
+            SetMB lprims,rprims;
+            std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,current.prims,lprims,rprims);
+            bool hasTimeSplits = new_vector != nullptr;
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildList children(current);
+            {
+              BuildRecord lrecord(lprims,current.depth+1);
+              BuildRecord rrecord(rprims,current.depth+1);
+              children.split(0,lrecord,rrecord,std::move(new_vector));
+            }
+
+            /*! split until node is full or SAH tells us to stop */
+            while (children.size() < cfg.branchingFactor) 
+            {
+              /*! find best child to split */
+              float bestArea = neg_inf;
+              ssize_t bestChild = -1;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                if (children[i].size() <= cfg.minLeafSize) continue;
+                if (expectedApproxHalfArea(children[i].prims.geomBounds) > bestArea) {
+                  bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds);
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform split */
+              BuildRecord& brecord = children[bestChild];
+              BuildRecord lrecord(current.depth+1);
+              BuildRecord rrecord(current.depth+1);
+              Split csplit = find(brecord.prims);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims);
+              hasTimeSplits |= new_vector != nullptr;
+              children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+            }
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* sort buildrecords for simpler shadow ray traversal */
+            //std::sort(&children[0],&children[children.size()],std::greater<BuildRecord>()); // FIXME: reduces traversal performance of bvh8.triangle4 (need to verified) !!
+
+            /*! create an inner node */
+            auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits);
+            LBBox3fa gbounds = empty;
+
+            /* spawn tasks */
+            if (unlikely(current.size() > cfg.singleThreadThreshold))
+            {
+              /*! parallel_for is faster than spawing sub-tasks */
+              parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++) {
+                    values[i] = recurse(children[i],nullptr,true);
+                    _mm_mfence(); // to allow non-temporal stores during build
+                  }
+                });
+
+              /*! merge bounding boxes */
+              for (size_t i=0; i<children.size(); i++)
+                gbounds.extend(values[i].lbounds);
+            }
+            /* recurse into each child */
+            else
+            {
+              //for (size_t i=0; i<children.size(); i++)
+              for (ssize_t i=children.size()-1; i>=0; i--) {
+                values[i] = recurse(children[i],alloc,false);
+                gbounds.extend(values[i].lbounds);
+              }
+            }
+
+            setNode(current,children.children.data(),node,values,children.numChildren);
+
+            /* calculate geometry bounds of this node */
+            if (unlikely(hasTimeSplits))
+              return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+            else
+              return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+          }
+
+          /*! builder entry function */
+          __forceinline const NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo)
+          {
+            const SetMB set(pinfo,&prims);
+            auto ret = recurse(BuildRecord(set,1),nullptr,true);
+            _mm_mfence(); // to allow non-temporal stores during build
+            return ret;
+          }
+
+        private:
+          Settings cfg;
+          HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> heuristicObjectSplit;
+          HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> heuristicTemporalSplit;
+          const RecalculatePrimRef recalculatePrimRef;
+          const CreateAllocFunc createAlloc;
+          const CreateNodeFunc createNode;
+          const SetNodeFunc setNode;
+          const CreateLeafFunc createLeaf;
+          const ProgressMonitor progressMonitor;
+        };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitorFunc>
+
+        static const BVHNodeRecordMB4D<NodeRef> build(mvector<PrimRefMB>& prims,
+                                                      const PrimInfoMB& pinfo,
+                                                      MemoryMonitorInterface* device,
+                                                      const RecalculatePrimRef recalculatePrimRef,
+                                                      const CreateAllocFunc createAlloc,
+                                                      const CreateNodeFunc createNode,
+                                                      const SetNodeFunc setNode,
+                                                      const CreateLeafFunc createLeaf,
+                                                      const ProgressMonitorFunc progressMonitor,
+                                                      const Settings& settings)
+      {
+          typedef BuilderT<
+            NodeRef,
+            RecalculatePrimRef,
+            decltype(createAlloc()),
+            CreateAllocFunc,
+            CreateNodeFunc,
+            SetNodeFunc,
+            CreateLeafFunc,
+            ProgressMonitorFunc> Builder;
+
+          Builder builder(device,
+                          recalculatePrimRef,
+                          createAlloc,
+                          createNode,
+                          setNode,
+                          createLeaf,
+                          progressMonitor,
+                          settings);
+
+
+          return builder(prims,pinfo);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur_hair.h
new file mode 100644
index 0000000000..397e8636b1
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur_hair.h
@@ -0,0 +1,526 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_msmblur.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_timesplit_array.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderHairMSMBlur
+    {
+      /*! settings for msmblur builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8) {}
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+      };
+
+      struct BuildRecord
+      {
+      public:
+	__forceinline BuildRecord () {}
+
+        __forceinline BuildRecord (size_t depth)
+          : depth(depth) {}
+
+        __forceinline BuildRecord (const SetMB& prims, size_t depth)
+          : depth(depth), prims(prims) {}
+
+        __forceinline size_t size() const {
+          return prims.size();
+        }
+
+      public:
+	size_t depth;       //!< depth of the root of this subtree
+	SetMB prims;        //!< the list of primitives
+      };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeMBFunc,
+        typename SetAABBNodeMBFunc,
+        typename CreateOBBNodeMBFunc,
+        typename SetOBBNodeMBFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+
+          static const size_t MAX_BRANCHING_FACTOR =  8;         //!< maximum supported BVH branching factor
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+          static const size_t SINGLE_THREADED_THRESHOLD = 4096;  //!< threshold to switch to single threaded build
+
+          typedef BVHNodeRecordMB<NodeRef> NodeRecordMB;
+          typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+
+          typedef FastAllocator::CachedAllocator Allocator;
+          typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+
+          typedef HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> HeuristicTemporal;
+          typedef HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> HeuristicBinning;
+          typedef UnalignedHeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> UnalignedHeuristicBinning;
+
+        public:
+
+          BuilderT (Scene* scene,
+                    const RecalculatePrimRef& recalculatePrimRef,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateAABBNodeMBFunc& createAABBNodeMB,
+                    const SetAABBNodeMBFunc& setAABBNodeMB,
+                    const CreateOBBNodeMBFunc& createOBBNodeMB,
+                    const SetOBBNodeMBFunc& setOBBNodeMB,
+                    const CreateLeafFunc& createLeaf,
+                    const ProgressMonitor& progressMonitor,
+                    const Settings settings)
+
+            : cfg(settings),
+            scene(scene),
+            recalculatePrimRef(recalculatePrimRef),
+            createAlloc(createAlloc),
+            createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB),
+            createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB),
+            createLeaf(createLeaf),
+            progressMonitor(progressMonitor),
+            unalignedHeuristic(scene),
+            temporalSplitHeuristic(scene->device,recalculatePrimRef) {}
+
+        private:
+
+          /*! checks if all primitives are from the same geometry */
+          __forceinline bool sameGeometry(const SetMB& set)
+          {
+            mvector<PrimRefMB>& prims = *set.prims;
+            unsigned int firstGeomID = prims[set.begin()].geomID();
+            for (size_t i=set.begin()+1; i<set.end(); i++) {
+              if (prims[i].geomID() != firstGeomID){
+                return false;
+              }
+            }
+            return true;
+          }
+          
+          /*! performs some split if SAH approaches fail */
+          void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            mvector<PrimRefMB>& prims = *set.prims;
+
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            const size_t center = (begin + end)/2;
+
+            PrimInfoMB linfo = empty;
+            for (size_t i=begin; i<center; i++)
+              linfo.add_primref(prims[i]);
+
+            PrimInfoMB rinfo = empty;
+            for (size_t i=center; i<end; i++)
+              rinfo.add_primref(prims[i]);
+
+            new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range);
+            new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            assert(set.size() > 1);
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            PrimInfoMB linfo(empty);
+            PrimInfoMB rinfo(empty);
+            unsigned int geomID = (*set.prims)[begin].geomID();
+            size_t center = serial_partitioning(set.prims->data(),begin,end,linfo,rinfo,
+                                                [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; },
+                                                [ ] ( PrimInfoMB& a, const PrimRefMB& ref ) { a.add_primref(ref); });
+
+            new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range);
+            new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          /*! creates a large leaf that could be larger than supported by the BVH */
+          NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (current.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* special case when directly creating leaf without any splits that could shrink time_range */
+            bool force_split = false;
+            if (current.depth == 1 && current.size() > 0)
+            {
+              BBox1f c = empty;
+              BBox1f p = current.prims.time_range;
+              for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+                mvector<PrimRefMB>& prims = *current.prims.prims;
+                c.extend(prims[i].time_range);
+              }
+              
+              force_split = c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create leaf for few primitives */
+            if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split)
+              return createLeaf(current.prims,alloc);
+
+            /* fill all children by always splitting the largest one */
+            LocalChildList children(current);
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+
+            do {
+
+              /* find best child with largest bounding box area */
+              int bestChild = -1;
+              size_t bestSize = 0;
+              for (unsigned i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i].prims) && !force_split)
+                  continue;
+
+                force_split = false;
+
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              if (!sameGeometry(children[bestChild].prims)) {
+                splitByGeometry(children[bestChild].prims,left.prims,right.prims);
+              } else {
+                splitFallback(children[bestChild].prims,left.prims,right.prims);
+              }
+              children.split(bestChild,left,right,std::unique_ptr<mvector<PrimRefMB>>());
+
+            } while (children.size() < cfg.branchingFactor);
+
+
+            /* detect time_ranges that have shrunken */
+            bool timesplit = false;
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              timesplit |= c.lower > p.lower || c.upper < p.upper;
+            }
+            
+            /* create node */
+            NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit);
+
+            LBBox3fa bounds = empty;
+            for (size_t i=0; i<children.size(); i++) {
+              values[i] = createLargeLeaf(children[i],alloc);
+              bounds.extend(values[i].lbounds);
+            }
+
+            setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+
+            if (timesplit)
+              bounds = current.prims.linearBounds(recalculatePrimRef);
+              
+            return NodeRecordMB4D(node,bounds,current.prims.time_range);
+          }
+
+          /*! performs split */
+          std::unique_ptr<mvector<PrimRefMB>> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit)
+          {
+            /* variable to track the SAH of the best splitting approach */
+            float bestSAH = inf;
+            const float leafSAH = current.prims.leafSAH(cfg.logBlockSize);
+
+            /* perform standard binning in aligned space */
+            HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize);
+            float alignedObjectSAH = alignedObjectSplit.splitSAH();
+            bestSAH = min(alignedObjectSAH,bestSAH);
+
+            /* perform standard binning in unaligned space */
+            UnalignedHeuristicBinning::Split unalignedObjectSplit;
+            LinearSpace3fa uspace;
+            float unalignedObjectSAH = inf;
+            if (alignedObjectSAH > 0.7f*leafSAH) {
+              uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims);
+              const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace);
+              unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace);
+              unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive
+              bestSAH = min(unalignedObjectSAH,bestSAH);
+            }
+
+            /* do temporal splits only if previous approaches failed to produce good SAH and the the time range is large enough */
+            float temporal_split_sah = inf;
+            typename HeuristicTemporal::Split temporal_split;
+            if (bestSAH > 0.5f*leafSAH) {
+              if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) {
+                temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize);
+                temporal_split_sah = temporal_split.splitSAH();
+                bestSAH = min(temporal_split_sah,bestSAH);
+              }
+            }
+
+            /* perform fallback split if SAH heuristics failed */
+            if (unlikely(!std::isfinite(bestSAH))) {
+              current.prims.deterministic_order();
+              splitFallback(current.prims,lrecord.prims,rrecord.prims);
+            }
+            /* perform aligned split if this is best */
+            else if (likely(bestSAH == alignedObjectSAH)) {
+              alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims);
+            }
+            /* perform unaligned split if this is best */
+            else if (likely(bestSAH == unalignedObjectSAH)) {
+              unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims);
+              aligned = false;
+            }
+            /* perform temporal split if this is best */
+            else if (likely(bestSAH == temporal_split_sah)) {
+              timesplit = true;
+              return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims);
+            }
+            else
+              assert(false);
+
+            return std::unique_ptr<mvector<PrimRefMB>>();
+          }
+
+          /*! recursive build */
+          NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress */
+            if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD)
+              progressMonitor(current.size());
+
+            /* create leaf node */
+            if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) {
+              current.prims.deterministic_order();
+              return createLargeLeaf(current,alloc);
+            }
+
+            /* fill all children by always splitting the one with the largest surface area */
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildList children(current);
+            bool aligned = true;
+            bool timesplit = false;
+
+            do {
+
+              /* find best child with largest bounding box area */
+              ssize_t bestChild = -1;
+              float bestArea = neg_inf;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.minLeafSize)
+                  continue;
+
+                /* remember child with largest area */
+                const float A = children[i].prims.halfArea();
+                if (A > bestArea) {
+                  bestArea = children[i].prims.halfArea();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(children[bestChild],left,right,aligned,timesplit);
+              children.split(bestChild,left,right,std::move(new_vector));
+
+            } while (children.size() < cfg.branchingFactor);
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              timesplit |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create time split node */
+            if (timesplit)
+            {
+              const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      values[i] = recurse(children[i],nullptr,true);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequential */
+              else {
+                for (size_t i=0; i<children.size(); i++) {
+                  values[i] = recurse(children[i],alloc,false);
+                }
+              }
+
+              setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+
+              const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+              return NodeRecordMB4D(node,bounds,current.prims.time_range);
+            }
+
+            /* create aligned node */
+            else if (aligned)
+            {
+              const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                LBBox3fa cbounds[MAX_BRANCHING_FACTOR];
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      values[i] = recurse(children[i],nullptr,true);
+                      cbounds[i] = values[i].lbounds;
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+
+                LBBox3fa bounds = empty;
+                for (size_t i=0; i<children.size(); i++)
+                  bounds.extend(cbounds[i]);
+                setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+                return NodeRecordMB4D(node,bounds,current.prims.time_range);
+              }
+              /* ... continue sequentially */
+              else
+              {
+                LBBox3fa bounds = empty;
+                for (size_t i=0; i<children.size(); i++) {
+                  values[i] = recurse(children[i],alloc,false);
+                  bounds.extend(values[i].lbounds);
+                }
+                setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+                return NodeRecordMB4D(node,bounds,current.prims.time_range);
+              }
+            }
+
+            /* create unaligned node */
+            else
+            {
+              const NodeRef node = createOBBNodeMB(alloc);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+                      const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+                      const auto child = recurse(children[i],nullptr,true);
+                      setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else
+              {
+                for (size_t i=0; i<children.size(); i++) {
+                  const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+                  const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+                  const auto child = recurse(children[i],alloc,false);
+                  setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+                }
+              }
+
+              const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+              return NodeRecordMB4D(node,bounds,current.prims.time_range);
+            }
+          }
+
+        public:
+
+          /*! entry point into builder */
+          NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo)
+          {
+            BuildRecord record(SetMB(pinfo,&prims),1);
+            auto root = recurse(record,nullptr,true);
+            _mm_mfence(); // to allow non-temporal stores during build
+            return root;
+          }
+
+        private:
+          Settings cfg;
+          Scene* scene;
+          const RecalculatePrimRef& recalculatePrimRef;
+          const CreateAllocFunc& createAlloc;
+          const CreateAABBNodeMBFunc& createAABBNodeMB;
+          const SetAABBNodeMBFunc& setAABBNodeMB;
+          const CreateOBBNodeMBFunc& createOBBNodeMB;
+          const SetOBBNodeMBFunc& setOBBNodeMB;
+          const CreateLeafFunc& createLeaf;
+          const ProgressMonitor& progressMonitor;
+
+        private:
+          HeuristicBinning alignedHeuristic;
+          UnalignedHeuristicBinning unalignedHeuristic;
+          HeuristicTemporal temporalSplitHeuristic;
+        };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeMBFunc,
+        typename SetAABBNodeMBFunc,
+        typename CreateOBBNodeMBFunc,
+        typename SetOBBNodeMBFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        static BVHNodeRecordMB4D<NodeRef> build (Scene* scene, mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo,
+                                               const RecalculatePrimRef& recalculatePrimRef,
+                                               const CreateAllocFunc& createAlloc,
+                                               const CreateAABBNodeMBFunc& createAABBNodeMB,
+                                               const SetAABBNodeMBFunc& setAABBNodeMB,
+                                               const CreateOBBNodeMBFunc& createOBBNodeMB,
+                                               const SetOBBNodeMBFunc& setOBBNodeMB,
+                                               const CreateLeafFunc& createLeaf,
+                                               const ProgressMonitor& progressMonitor,
+                                               const Settings settings)
+        {
+          typedef BuilderT<NodeRef,RecalculatePrimRef,CreateAllocFunc,
+            CreateAABBNodeMBFunc,SetAABBNodeMBFunc,
+            CreateOBBNodeMBFunc,SetOBBNodeMBFunc,
+            CreateLeafFunc,ProgressMonitor> Builder;
+
+          Builder builder(scene,recalculatePrimRef,createAlloc,
+                          createAABBNodeMB,setAABBNodeMB,
+                          createOBBNodeMB,setOBBNodeMB,
+                          createLeaf,progressMonitor,settings);
+
+          return builder(prims,pinfo);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_sah.h b/thirdparty/embree/kernels/builders/bvh_builder_sah.h
new file mode 100644
index 0000000000..fff4bf2a35
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/bvh_builder_sah.h
@@ -0,0 +1,669 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_spatial_array.h"
+#include "heuristic_openmerge_array.h"
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+#  define NUM_OBJECT_BINS 16
+#  define NUM_SPATIAL_BINS 16
+#else
+#  define NUM_OBJECT_BINS 32
+#  define NUM_SPATIAL_BINS 16
+#endif
+
+namespace embree
+{
+  namespace isa
+  {
+    MAYBE_UNUSED static const float travCost = 1.0f;
+    MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+    struct GeneralBVHBuilder
+    {
+      static const size_t MAX_BRANCHING_FACTOR = 16;       //!< maximum supported BVH branching factor      
+      static const size_t MIN_LARGE_LEAF_LEVELS = 8;       //!< create balanced tree of we are that many levels before the maximum tree depth
+      
+
+      /*! settings for SAH builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+          travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {}
+
+        /*! initialize settings from API settings */
+        Settings (const RTCBuildArguments& settings)
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+          travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf)
+        {
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth          )) maxDepth        = settings.maxDepth;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize      )) logBlockSize    = bsr(settings.sahBlockSize);
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize       )) minLeafSize     = settings.minLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize       )) maxLeafSize     = settings.maxLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost     )) travCost        = settings.traversalCost;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost  )) intCost         = settings.intersectionCost;
+
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+        Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf)
+        : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+          travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc)
+        {
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        float travCost;          //!< estimated cost of one traversal step
+        float intCost;           //!< estimated cost of one primitive intersection
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+        size_t primrefarrayalloc;  //!< builder uses prim ref array to allocate nodes and leaves when a subtree of that size is finished
+      };
+
+      /*! recursive state of builder */
+      template<typename Set, typename Split>
+        struct BuildRecordT
+        {
+        public:
+          __forceinline BuildRecordT () {}
+
+          __forceinline BuildRecordT (size_t depth)
+            : depth(depth), alloc_barrier(false), prims(empty) {}
+
+          __forceinline BuildRecordT (size_t depth, const Set& prims)
+            : depth(depth), alloc_barrier(false), prims(prims) {}
+
+          __forceinline BBox3fa bounds() const { return prims.geomBounds; }
+
+          __forceinline friend bool operator< (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() < b.prims.size(); }
+          __forceinline friend bool operator> (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() > b.prims.size();  }
+
+          __forceinline size_t size() const { return prims.size(); }
+
+        public:
+          size_t depth;       //!< Depth of the root of this subtree.
+          bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes
+          Set prims;          //!< The list of primitives.
+        };
+
+      template<typename PrimRef, typename Set>
+      struct DefaultCanCreateLeafFunc
+      {
+        __forceinline bool operator()(const PrimRef*, const Set&) const { return true; }
+      };
+
+      template<typename PrimRef, typename Set>
+      struct DefaultCanCreateLeafSplitFunc
+      {
+        __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { }
+      };
+
+      template<typename BuildRecord,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename ReductionTy,
+        typename Allocator,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          friend struct GeneralBVHBuilder;
+
+          BuilderT (PrimRef* prims,
+                    Heuristic& heuristic,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateNodeFunc& createNode,
+                    const UpdateNodeFunc& updateNode,
+                    const CreateLeafFunc& createLeaf,
+                    const CanCreateLeafFunc& canCreateLeaf,
+                    const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                    const ProgressMonitor& progressMonitor,
+                    const Settings& settings) :
+                    cfg(settings),
+                    prims(prims),
+                    heuristic(heuristic),
+                    createAlloc(createAlloc),
+                    createNode(createNode),
+                    updateNode(updateNode),
+                    createLeaf(createLeaf),
+                    canCreateLeaf(canCreateLeaf),
+                    canCreateLeafSplit(canCreateLeafSplit),
+                    progressMonitor(progressMonitor)
+          {
+            if (cfg.branchingFactor > MAX_BRANCHING_FACTOR)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+          }
+
+          const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (current.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* create leaf for few primitives */
+            if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims))
+              return createLeaf(prims,current.prims,alloc);
+
+            /* fill all children by always splitting the largest one */
+            ReductionTy values[MAX_BRANCHING_FACTOR];
+            BuildRecord children[MAX_BRANCHING_FACTOR];
+            size_t numChildren = 1;
+            children[0] = current;
+            do {
+
+              /* find best child with largest bounding box area */
+              size_t bestChild = -1;
+              size_t bestSize = 0;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,children[i].prims))
+                  continue;
+
+                /* remember child with largest size */
+                if (children[i].prims.size() > bestSize) {
+                  bestSize = children[i].prims.size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == (size_t)-1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              if (!canCreateLeaf(prims,children[bestChild].prims)) {
+                canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims);
+              } else {
+                heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims);
+              }
+
+              /* add new children left and right */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            /* set barrier for primrefarrayalloc */
+            if (unlikely(current.size() > cfg.primrefarrayalloc))
+              for (size_t i=0; i<numChildren; i++)
+                children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+            /* create node */
+            auto node = createNode(children,numChildren,alloc);
+
+            /* recurse into each child  and perform reduction */
+            for (size_t i=0; i<numChildren; i++)
+              values[i] = createLargeLeaf(children[i],alloc);
+
+            /* perform reduction */
+            return updateNode(current,children,node,values,numChildren);
+          }
+
+          const ReductionTy recurse(BuildRecord& current, Allocator alloc, bool toplevel)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress */
+            if (toplevel && current.size() <= cfg.singleThreadThreshold)
+              progressMonitor(current.size());
+
+            /*! find best split */
+            auto split = heuristic.find(current.prims,cfg.logBlockSize);
+
+            /*! compute leaf and split cost */
+            const float leafSAH  = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+            const float splitSAH = cfg.travCost*halfArea(current.prims.geomBounds)+cfg.intCost*split.splitSAH();
+            assert((current.prims.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+            /*! create a leaf node when threshold reached or SAH tells us to stop */
+            if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+              heuristic.deterministic_order(current.prims);
+              return createLargeLeaf(current,alloc);
+            }
+
+            /*! perform initial split */
+            Set lprims,rprims;
+            heuristic.split(split,current.prims,lprims,rprims);
+	    
+            /*! initialize child list with initial split */
+            ReductionTy values[MAX_BRANCHING_FACTOR];
+            BuildRecord children[MAX_BRANCHING_FACTOR];
+            children[0] = BuildRecord(current.depth+1,lprims);
+            children[1] = BuildRecord(current.depth+1,rprims);
+            size_t numChildren = 2;
+
+            /*! split until node is full or SAH tells us to stop */
+            while (numChildren < cfg.branchingFactor)
+            {
+              /*! find best child to split */
+              float bestArea = neg_inf;
+              ssize_t bestChild = -1;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].prims.size() <= cfg.minLeafSize) continue;
+
+                /* find child with largest surface area */
+                if (halfArea(children[i].prims.geomBounds) > bestArea) {
+                  bestChild = i;
+                  bestArea = halfArea(children[i].prims.geomBounds);
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform best found split */
+              BuildRecord& brecord = children[bestChild];
+              BuildRecord lrecord(current.depth+1);
+              BuildRecord rrecord(current.depth+1);
+              auto split = heuristic.find(brecord.prims,cfg.logBlockSize);
+              heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims);
+              children[bestChild  ] = lrecord;
+              children[numChildren] = rrecord;
+              numChildren++;
+            }
+
+            /* set barrier for primrefarrayalloc */
+            if (unlikely(current.size() > cfg.primrefarrayalloc))
+              for (size_t i=0; i<numChildren; i++)
+                children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+            /* sort buildrecords for faster shadow ray traversal */
+            std::sort(&children[0],&children[numChildren],std::greater<BuildRecord>());
+
+            /*! create an inner node */
+            auto node = createNode(children,numChildren,alloc);
+
+            /* spawn tasks */
+            if (current.size() > cfg.singleThreadThreshold)
+            {
+              /*! parallel_for is faster than spawing sub-tasks */
+              parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here
+                  for (size_t i=r.begin(); i<r.end(); i++) {
+                    values[i] = recurse(children[i],nullptr,true);
+                    _mm_mfence(); // to allow non-temporal stores during build
+                  }
+                });
+
+              return updateNode(current,children,node,values,numChildren);
+            }
+            /* recurse into each child */
+            else
+            {
+              for (size_t i=0; i<numChildren; i++)
+                values[i] = recurse(children[i],alloc,false);
+
+              return updateNode(current,children,node,values,numChildren);
+            }
+          }
+
+        private:
+          Settings cfg;
+          PrimRef* prims;
+          Heuristic& heuristic;
+          const CreateAllocFunc& createAlloc;
+          const CreateNodeFunc& createNode;
+          const UpdateNodeFunc& updateNode;
+          const CreateLeafFunc& createLeaf;
+          const CanCreateLeafFunc& canCreateLeaf;
+          const CanCreateLeafSplitFunc& canCreateLeafSplit;
+          const ProgressMonitor& progressMonitor;
+        };
+
+      template<
+      typename ReductionTy,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        __noinline static ReductionTy build(Heuristic& heuristic,
+                                            PrimRef* prims,
+                                            const Set& set,
+                                            CreateAllocFunc createAlloc,
+                                            CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                            const CreateLeafFunc& createLeaf,
+                                            const ProgressMonitor& progressMonitor,
+                                            const Settings& settings)
+      {
+        typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+        typedef BuilderT<
+          BuildRecord,
+          Heuristic,
+          Set,
+          PrimRef,
+          ReductionTy,
+          decltype(createAlloc()),
+          CreateAllocFunc,
+          CreateNodeFunc,
+          UpdateNodeFunc,
+          CreateLeafFunc,
+          DefaultCanCreateLeafFunc<PrimRef, Set>,
+          DefaultCanCreateLeafSplitFunc<PrimRef, Set>,
+          ProgressMonitor> Builder;
+
+        /* instantiate builder */
+        Builder builder(prims,
+                        heuristic,
+                        createAlloc,
+                        createNode,
+                        updateNode,
+                        createLeaf,
+                        DefaultCanCreateLeafFunc<PrimRef, Set>(),
+                        DefaultCanCreateLeafSplitFunc<PrimRef, Set>(),
+                        progressMonitor,
+                        settings);
+
+        /* build hierarchy */
+        BuildRecord record(1,set);
+        const ReductionTy root = builder.recurse(record,nullptr,true);
+        _mm_mfence(); // to allow non-temporal stores during build
+        return root;
+      }
+
+      template<
+      typename ReductionTy,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+
+        __noinline static ReductionTy build(Heuristic& heuristic,
+                                            PrimRef* prims,
+                                            const Set& set,
+                                            CreateAllocFunc createAlloc,
+                                            CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                            const CreateLeafFunc& createLeaf,
+                                            const CanCreateLeafFunc& canCreateLeaf,
+                                            const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                                            const ProgressMonitor& progressMonitor,
+                                            const Settings& settings)
+      {
+        typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+        typedef BuilderT<
+          BuildRecord,
+          Heuristic,
+          Set,
+          PrimRef,
+          ReductionTy,
+          decltype(createAlloc()),
+          CreateAllocFunc,
+          CreateNodeFunc,
+          UpdateNodeFunc,
+          CreateLeafFunc,
+          CanCreateLeafFunc,
+          CanCreateLeafSplitFunc,
+          ProgressMonitor> Builder;
+
+        /* instantiate builder */
+        Builder builder(prims,
+                        heuristic,
+                        createAlloc,
+                        createNode,
+                        updateNode,
+                        createLeaf,
+                        canCreateLeaf,
+                        canCreateLeafSplit,
+                        progressMonitor,
+                        settings);
+
+        /* build hierarchy */
+        BuildRecord record(1,set);
+        const ReductionTy root = builder.recurse(record,nullptr,true);
+        _mm_mfence(); // to allow non-temporal stores during build
+        return root;
+      }
+    };
+
+    /* SAH builder that operates on an array of BuildRecords */
+    struct BVHBuilderBinnedSAH
+    {
+      typedef PrimInfoRange Set;
+      typedef HeuristicArrayBinningSAH<PrimRef,NUM_OBJECT_BINS> Heuristic;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+
+      /*! special builder that propagates reduction over the tree */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 const ProgressMonitor& progressMonitor,
+                                 PrimRef* prims, const PrimInfo& pinfo,
+                                 const Settings& settings)
+      {
+        Heuristic heuristic(prims);
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+          heuristic,
+          prims,
+          PrimInfoRange(0,pinfo.size(),pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          progressMonitor,
+          settings);
+      }
+
+      /*! special builder that propagates reduction over the tree */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 const CanCreateLeafFunc& canCreateLeaf,
+                                 const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                                 const ProgressMonitor& progressMonitor,
+                                 PrimRef* prims, const PrimInfo& pinfo,
+                                 const Settings& settings)
+      {
+        Heuristic heuristic(prims);
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+          heuristic,
+          prims,
+          PrimInfoRange(0,pinfo.size(),pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          canCreateLeaf,
+          canCreateLeafSplit,
+          progressMonitor,
+          settings);
+      }
+    };
+
+    /* Spatial SAH builder that operates on an double-buffered array of BuildRecords */
+    struct BVHBuilderBinnedFastSpatialSAH
+    {
+      typedef PrimInfoExtRange Set;
+      typedef Split2<BinSplit<NUM_OBJECT_BINS>,SpatialBinSplit<NUM_SPATIAL_BINS> > Split;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+
+      static const unsigned int GEOMID_MASK = 0xFFFFFFFF >>     RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+      static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+      template<typename ReductionTy, typename UserCreateLeaf>
+      struct CreateLeafExt
+      {
+        __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf)
+          : userCreateLeaf(userCreateLeaf) {}
+
+        // __noinline is workaround for ICC2016 compiler bug
+        template<typename Allocator>
+        __noinline ReductionTy operator() (PrimRef* prims, const range<size_t>& range, Allocator alloc) const
+        {
+          for (size_t i=range.begin(); i<range.end(); i++)
+            prims[i].lower.u &= GEOMID_MASK;
+
+          return userCreateLeaf(prims,range,alloc);
+        }
+
+        const UserCreateLeaf userCreateLeaf;
+      };
+
+      /*! special builder that propagates reduction over the tree */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename SplitPrimitiveFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode,
+                                 UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 SplitPrimitiveFunc splitPrimitive,
+                                 ProgressMonitor progressMonitor,
+                                 PrimRef* prims,
+                                 const size_t extSize,
+                                 const PrimInfo& pinfo,
+                                 const Settings& settings)
+        {
+          typedef HeuristicArraySpatialSAH<SplitPrimitiveFunc,PrimRef,NUM_OBJECT_BINS,NUM_SPATIAL_BINS> Heuristic;
+          Heuristic heuristic(splitPrimitive,prims,pinfo);
+
+          /* calculate total surface area */ // FIXME: this sum is not deterministic
+          const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range<size_t>& r) -> double {
+
+              double A = 0.0f;
+              for (size_t i=r.begin(); i<r.end(); i++)
+              {
+                PrimRef& prim = prims[i];
+                A += area(prim.bounds());
+              }
+              return A;
+            },std::plus<double>());
+
+
+          /* calculate maximum number of spatial splits per primitive */
+          const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1;
+          const float f = 10.0f;
+
+          const float invA = 1.0f / A;
+          parallel_for( size_t(0), pinfo.size(), [&](const range<size_t>& r) {
+
+              for (size_t i=r.begin(); i<r.end(); i++)
+              {
+                PrimRef& prim = prims[i];
+                assert((prim.geomID() & SPLITS_MASK) == 0);
+                // FIXME: is there a better general heuristic ?
+                const float nf = ceilf(f*pinfo.size()*area(prim.bounds()) * invA);
+                unsigned int n = 4+min((int)maxSplits-4, max(1, (int)(nf)));
+                prim.lower.u |= n << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+              }
+            });
+
+          return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+            heuristic,
+            prims,
+            PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+            createAlloc,
+            createNode,
+            updateNode,
+            CreateLeafExt<ReductionTy,CreateLeafFunc>(createLeaf),
+            progressMonitor,
+            settings);
+        }
+    };
+
+    /* Open/Merge SAH builder that operates on an array of BuildRecords */
+    struct BVHBuilderBinnedOpenMergeSAH
+    {
+      static const size_t NUM_OBJECT_BINS_HQ = 32;
+      typedef PrimInfoExtRange Set;
+      typedef BinSplit<NUM_OBJECT_BINS_HQ> Split;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+      
+      /*! special builder that propagates reduction over the tree */
+      template<
+        typename ReductionTy, 
+        typename BuildRef,
+        typename CreateAllocFunc, 
+        typename CreateNodeFunc, 
+        typename UpdateNodeFunc, 
+        typename CreateLeafFunc, 
+        typename NodeOpenerFunc, 
+        typename ProgressMonitor>
+        
+        static ReductionTy build(CreateAllocFunc createAlloc, 
+                                 CreateNodeFunc createNode, 
+                                 UpdateNodeFunc updateNode, 
+                                 const CreateLeafFunc& createLeaf, 
+                                 NodeOpenerFunc nodeOpenerFunc,
+                                 ProgressMonitor progressMonitor,
+                                 BuildRef* prims, 
+                                 const size_t extSize,
+                                 const PrimInfo& pinfo, 
+                                 const Settings& settings)
+      {
+        typedef HeuristicArrayOpenMergeSAH<NodeOpenerFunc,BuildRef,NUM_OBJECT_BINS_HQ> Heuristic;
+        Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor);
+
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,BuildRef>(
+          heuristic,
+          prims,
+          PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          progressMonitor,
+          settings);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/heuristic_binning.h b/thirdparty/embree/kernels/builders/heuristic_binning.h
new file mode 100644
index 0000000000..ee29d09ac9
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/heuristic_binning.h
@@ -0,0 +1,496 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! mapping into bins */
+    template<size_t BINS>
+      struct BinMapping
+      {
+      public:
+        __forceinline BinMapping() {}
+        
+        /*! calculates the mapping */
+        __forceinline BinMapping(size_t N, const BBox3fa& centBounds) 
+        {
+          num = min(BINS,size_t(4.0f + 0.05f*N));
+          assert(num >= 1);
+          const vfloat4 eps = 1E-34f;
+          const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+          ofs  = (vfloat4) centBounds.lower;
+        }
+
+        /*! calculates the mapping */
+        __forceinline BinMapping(const BBox3fa& centBounds) 
+        {
+          num = BINS;
+          const vfloat4 eps = 1E-34f;
+          const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+          ofs  = (vfloat4) centBounds.lower;
+        }
+
+        /*! calculates the mapping */
+        template<typename PrimInfo>
+        __forceinline BinMapping(const PrimInfo& pinfo) 
+        {
+          const vfloat4 eps = 1E-34f;
+          num = min(BINS,size_t(4.0f + 0.05f*pinfo.size()));
+          const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+          ofs  = (vfloat4) pinfo.centBounds.lower;
+        }
+
+        /*! returns number of bins */
+        __forceinline size_t size() const { return num; }
+        
+        /*! slower but safe binning */
+        __forceinline Vec3ia bin(const Vec3fa& p) const 
+        {
+          const vint4 i = floori((vfloat4(p)-ofs)*scale);
+#if 1
+          assert(i[0] >= 0 && (size_t)i[0] < num); 
+          assert(i[1] >= 0 && (size_t)i[1] < num);
+          assert(i[2] >= 0 && (size_t)i[2] < num);
+          return Vec3ia(i);
+#else
+          return Vec3ia(clamp(i,vint4(0),vint4(num-1)));
+#endif
+        }
+
+        /*! faster but unsafe binning */
+        __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const {
+          return Vec3ia(floori((vfloat4(p)-ofs)*scale));
+        }
+
+        /*! faster but unsafe binning */
+        template<typename PrimRef>
+        __forceinline Vec3ia bin_unsafe(const PrimRef& p) const {
+          return bin_unsafe(p.binCenter());
+        }
+
+        /*! faster but unsafe binning */
+        template<typename PrimRef, typename BinBoundsAndCenter>
+        __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const {
+          return bin_unsafe(binBoundsAndCenter.binCenter(p));
+        }
+
+        template<typename PrimRef>
+        __forceinline bool bin_unsafe(const PrimRef& ref,
+                                      const vint4&   vSplitPos,
+                                      const vbool4&  splitDimMask) const // FIXME: rename to isLeft
+        {
+          return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask);
+        }
+        /*! calculates left spatial position of bin */
+        __forceinline float pos(const size_t bin, const size_t dim) const {
+          return madd(float(bin),1.0f / scale[dim],ofs[dim]);
+        }
+
+        /*! returns true if the mapping is invalid in some dimension */
+        __forceinline bool invalid(const size_t dim) const {
+          return scale[dim] == 0.0f;
+        }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) {
+          return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}";
+        }
+        
+      public:
+        size_t num;
+        vfloat4 ofs,scale;        //!< linear function that maps to bin ID
+      };
+    
+    /*! stores all information to perform some split */
+    template<size_t BINS>
+      struct BinSplit
+      {
+        enum
+        {
+          SPLIT_OBJECT   = 0,
+          SPLIT_FALLBACK = 1,
+          SPLIT_ENFORCE  = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already
+          SPLIT_TEMPORAL = 2,
+          SPLIT_GEOMID   = 3,
+        };
+
+        /*! construct an invalid split by default */
+        __forceinline BinSplit()
+          : sah(inf), dim(-1), pos(0), data(0) {}
+
+        __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0)
+          : sah(sah), dim(dim), fpos(fpos), data(data) {}
+        
+        /*! constructs specified split */
+        __forceinline BinSplit(float sah, int dim, int pos, const BinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {}
+        
+        /*! tests if this split is valid */
+        __forceinline bool valid() const { return dim != -1; }
+        
+        /*! calculates surface area heuristic for performing the split */
+        __forceinline float splitSAH() const { return sah; }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) {
+          return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}";
+        }
+        
+      public:
+        float sah;                //!< SAH cost of the split
+        int dim;                  //!< split dimension
+        union { int pos; float fpos; };                  //!< bin index for splitting
+        unsigned int data;        //!< extra optional split data
+        BinMapping<BINS> mapping; //!< mapping into bins
+      };
+    
+    /*! stores extended information about the split */
+    template<typename BBox>
+      struct SplitInfoT
+    {
+
+      __forceinline SplitInfoT () {}
+      
+      __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds)
+	: leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {}
+      
+    public:
+      size_t leftCount,rightCount;
+      BBox leftBounds,rightBounds;
+    };
+
+    typedef SplitInfoT<BBox3fa> SplitInfo;
+    typedef SplitInfoT<LBBox3fa> SplitInfo2;
+    
+    /*! stores all binning information */
+    template<size_t BINS, typename PrimRef, typename BBox>
+      struct __aligned(64) BinInfoT
+    {		  
+      typedef BinSplit<BINS> Split;
+      typedef vbool4 vbool;
+      typedef vint4 vint;
+      typedef vfloat4 vfloat;
+      
+      __forceinline BinInfoT() {
+      }
+      
+      __forceinline BinInfoT(EmptyTy) {
+	clear();
+      }
+
+      /*! bin access function */
+      __forceinline BBox &bounds(const size_t binID, const size_t dimID)             { return _bounds[binID][dimID]; }
+      __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; }
+
+      __forceinline unsigned int &counts(const size_t binID, const size_t dimID)             { return _counts[binID][dimID]; }
+      __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; }
+
+      __forceinline vuint4 &counts(const size_t binID)             { return _counts[binID]; }
+      __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; }
+
+      /*! clears the bin info */
+      __forceinline void clear() 
+      {
+	for (size_t i=0; i<BINS; i++) {
+	  bounds(i,0) = bounds(i,1) = bounds(i,2) = empty;
+	  counts(i) = vuint4(zero);
+	}
+      }
+      
+      /*! bins an array of primitives */
+      __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping)
+      {
+	if (unlikely(N == 0)) return;
+	size_t i; 
+	for (i=0; i<N-1; i+=2)
+        {
+          /*! map even and odd primitive to bin */
+          BBox prim0; Vec3fa center0;
+          prims[i+0].binBoundsAndCenter(prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          BBox prim1; Vec3fa center1;
+          prims[i+1].binBoundsAndCenter(prim1,center1); 
+          const vint4 bin1 = (vint4)mapping.bin(center1); 
+          
+          /*! increase bounds for bins for even primitive */
+          const unsigned int b00 = extract<0>(bin0); bounds(b00,0).extend(prim0); 
+          const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0); 
+          const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0); 
+          const unsigned int s0 = (unsigned int)prims[i+0].size();
+          counts(b00,0)+=s0;
+          counts(b01,1)+=s0;
+          counts(b02,2)+=s0;
+
+          /*! increase bounds of bins for odd primitive */
+          const unsigned int b10 = extract<0>(bin1);  bounds(b10,0).extend(prim1); 
+          const unsigned int b11 = extract<1>(bin1);  bounds(b11,1).extend(prim1); 
+          const unsigned int b12 = extract<2>(bin1);  bounds(b12,2).extend(prim1); 
+          const unsigned int s1 = (unsigned int)prims[i+1].size();
+          counts(b10,0)+=s1;
+          counts(b11,1)+=s1;
+          counts(b12,2)+=s1;
+        }
+	/*! for uneven number of primitives */
+	if (i < N)
+        {
+          /*! map primitive to bin */
+          BBox prim0; Vec3fa center0;
+          prims[i].binBoundsAndCenter(prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          /*! increase bounds of bins */
+          const unsigned int s0 = (unsigned int)prims[i].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+        }
+      }
+
+      /*! bins an array of primitives */
+      template<typename BinBoundsAndCenter>
+        __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+      {
+	if (N == 0) return;
+        
+	size_t i; 
+	for (i=0; i<N-1; i+=2)
+        {
+          /*! map even and odd primitive to bin */
+          BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          BBox prim1; Vec3fa center1; binBoundsAndCenter.binBoundsAndCenter(prims[i+1],prim1,center1); 
+          const vint4 bin1 = (vint4)mapping.bin(center1); 
+          
+          /*! increase bounds for bins for even primitive */
+          const unsigned int s0 = prims[i+0].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+          
+          /*! increase bounds of bins for odd primitive */
+          const unsigned int s1 = prims[i+1].size();
+          const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1);
+          const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1);
+          const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1);
+        }
+	
+	/*! for uneven number of primitives */
+	if (i < N)
+        {
+          /*! map primitive to bin */
+          BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          /*! increase bounds of bins */
+          const unsigned int s0 = prims[i+0].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+        }
+      }
+      
+      __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping) {
+	bin(prims+begin,end-begin,mapping);
+      }
+
+      template<typename BinBoundsAndCenter>
+        __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) {
+	bin<BinBoundsAndCenter>(prims+begin,end-begin,mapping,binBoundsAndCenter);
+      }
+
+      /*! merges in other binning information */
+      __forceinline void merge (const BinInfoT& other, size_t numBins)
+      {
+		
+	for (size_t i=0; i<numBins; i++) 
+        {
+          counts(i) += other.counts(i);
+          bounds(i,0).extend(other.bounds(i,0));
+          bounds(i,1).extend(other.bounds(i,1));
+          bounds(i,2).extend(other.bounds(i,2));
+        }
+      }
+
+      /*! reduces binning information */
+      static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b, const size_t numBins = BINS)
+      {
+        BinInfoT c;
+	for (size_t i=0; i<numBins; i++) 
+        {
+          c.counts(i) = a.counts(i)+b.counts(i);
+          c.bounds(i,0) = embree::merge(a.bounds(i,0),b.bounds(i,0));
+          c.bounds(i,1) = embree::merge(a.bounds(i,1),b.bounds(i,1));
+          c.bounds(i,2) = embree::merge(a.bounds(i,2),b.bounds(i,2));
+        }
+        return c;
+      }
+      
+      /*! finds the best split by scanning binning information */
+      __forceinline Split best(const BinMapping<BINS>& mapping, const size_t blocks_shift) const
+      {
+	/* sweep from right to left and compute parallel prefix of merged bounds */
+	vfloat4 rAreas[BINS];
+	vuint4 rCounts[BINS];
+	vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty;
+	for (size_t i=mapping.size()-1; i>0; i--)
+        {
+          count += counts(i);
+          rCounts[i] = count;
+          bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx);
+          by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by);
+          bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz);
+          rAreas[i][3] = 0.0f;
+        }
+	/* sweep from left to right and compute SAH */
+	vuint4 blocks_add = (1 << blocks_shift)-1;
+	vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; 
+	count = 0; bx = empty; by = empty; bz = empty;
+	for (size_t i=1; i<mapping.size(); i++, ii+=1)
+        {
+          count += counts(i-1);
+          bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx);
+          by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by);
+          bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz);
+          const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+          const vfloat4 rArea = rAreas[i];
+          const vuint4 lCount = (count     +blocks_add) >> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions.
+          const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+          const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
+          //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+
+          vbestPos = select(sah < vbestSAH,ii ,vbestPos);
+          vbestSAH = select(sah < vbestSAH,sah,vbestSAH);
+        }
+	
+	/* find best dimension */
+	float bestSAH = inf;
+	int   bestDim = -1;
+	int   bestPos = 0;
+	for (int dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+          
+          /* test if this is a better dimension */
+          if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+            bestDim = dim;
+            bestPos = vbestPos[dim];
+            bestSAH = vbestSAH[dim];
+          }
+        }
+	return Split(bestSAH,bestDim,bestPos,mapping);
+      }
+      
+      /*! calculates extended split information */
+      __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const 
+      {
+	if (split.dim == -1) {
+	  new (&info) SplitInfoT<BBox>(0,empty,0,empty);
+	  return;
+	}
+	
+	size_t leftCount = 0;
+	BBox leftBounds = empty;
+	for (size_t i=0; i<(size_t)split.pos; i++) {
+	  leftCount += counts(i,split.dim);
+	  leftBounds.extend(bounds(i,split.dim));
+	}
+	size_t rightCount = 0;
+	BBox rightBounds = empty;
+	for (size_t i=split.pos; i<mapping.size(); i++) {
+	  rightCount += counts(i,split.dim);
+	  rightBounds.extend(bounds(i,split.dim));
+	}
+	new (&info) SplitInfoT<BBox>(leftCount,leftBounds,rightCount,rightBounds);
+      }
+
+      /*! gets the number of primitives left of the split */
+      __forceinline size_t getLeftCount(const BinMapping<BINS>& mapping, const Split& split) const
+      {
+        if (unlikely(split.dim == -1)) return -1;
+
+        size_t leftCount = 0;
+        for (size_t i = 0; i < (size_t)split.pos; i++) {
+          leftCount += counts(i, split.dim);
+        }
+        return leftCount;
+      }
+
+      /*! gets the number of primitives right of the split */
+      __forceinline size_t getRightCount(const BinMapping<BINS>& mapping, const Split& split) const
+      {
+        if (unlikely(split.dim == -1)) return -1;
+
+        size_t rightCount = 0;
+        for (size_t i = (size_t)split.pos; i<mapping.size(); i++) {
+          rightCount += counts(i, split.dim);
+        }
+        return rightCount;
+      }
+
+    private:
+      BBox _bounds[BINS][3]; //!< geometry bounds for each bin in each dimension
+      vuint4   _counts[BINS];    //!< counts number of primitives that map into the bins
+    };
+  }
+
+  template<typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping)
+  {
+    if (likely(end-begin < parallelThreshold)) {
+      binner.bin(prims,begin,end,mapping);
+    } else {
+      binner = parallel_reduce(begin,end,blockSize,binner,
+                              [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; },
+                              [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+    }
+  }
+
+  template<typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+  {
+    if (likely(end-begin < parallelThreshold)) {
+      binner.bin(prims,begin,end,mapping,binBoundsAndCenter);
+    } else {
+      binner = parallel_reduce(begin,end,blockSize,binner,
+                              [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; },
+                              [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+    }
+  }
+
+  template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping)
+  {
+    if (!parallel) {
+      binner.bin(prims,begin,end,mapping);
+    } else {
+      binner = parallel_reduce(begin,end,blockSize,binner,
+                              [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; },
+                              [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+    }
+  }
+
+  template<bool parallel, typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+  {
+    if (!parallel) {
+      binner.bin(prims,begin,end,mapping,binBoundsAndCenter);
+    } else {
+      binner = parallel_reduce(begin,end,blockSize,binner,
+                              [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; },
+                              [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h
new file mode 100644
index 0000000000..ab3b97efb9
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h
@@ -0,0 +1,200 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct PrimInfoRange : public CentGeomBBox3fa, public range<size_t>
+    {
+      __forceinline PrimInfoRange () {
+      }
+
+      __forceinline PrimInfoRange(const PrimInfo& pinfo)
+        : CentGeomBBox3fa(pinfo), range<size_t>(pinfo.begin,pinfo.end) {}
+
+      __forceinline PrimInfoRange(EmptyTy)
+        : CentGeomBBox3fa(EmptyTy()), range<size_t>(0,0) {}
+
+      __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds)
+        : CentGeomBBox3fa(centGeomBounds), range<size_t>(begin,end) {}
+      
+      __forceinline float leafSAH() const { 
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+    };
+    
+    /*! Performs standard object binning */
+    template<typename PrimRef, size_t BINS>
+      struct HeuristicArrayBinningSAH
+      {
+        typedef BinSplit<BINS> Split;
+        typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+        typedef range<size_t> Set;
+
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        __forceinline HeuristicArrayBinningSAH ()
+          : prims(nullptr) {}
+
+        /*! remember prim array */
+        __forceinline HeuristicArrayBinningSAH (PrimRef* prims)
+          : prims(prims) {}
+
+        /*! finds the best split */
+        __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize)
+        {
+          if (likely(pinfo.size() < PARALLEL_THRESHOLD))
+            return find_template<false>(pinfo,logBlockSize);
+          else
+            return find_template<true>(pinfo,logBlockSize);
+        }
+
+        template<bool parallel>
+        __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize)
+        {
+          Binner binner(empty);
+          const BinMapping<BINS> mapping(pinfo);
+          bin_serial_or_parallel<parallel>(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping);
+          return binner.best(mapping,logBlockSize);
+        }
+
+        /*! array partitioning */
+        __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          if (likely(pinfo.size() < PARALLEL_THRESHOLD))
+            split_template<false>(split,pinfo,linfo,rinfo);
+          else
+            split_template<true>(split,pinfo,linfo,rinfo);
+        }
+
+        template<bool parallel>
+        __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (!split.valid()) {
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+          
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          CentGeomBBox3fa local_left(empty);
+          CentGeomBBox3fa local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          const typename Binner::vint vSplitPos(splitPos);
+          const typename Binner::vbool vSplitMask(splitDimMask);
+          auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+          size_t center = 0;
+          if (!parallel)
+            center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft,
+                                         [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+          else
+            center = parallel_partitioning(
+              prims,begin,end,EmptyTy(),local_left,local_right,isLeft,
+              [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+              [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+              PARALLEL_PARTITION_BLOCK_SIZE);
+          
+          new (&lset) PrimInfoRange(begin,center,local_left);
+          new (&rset) PrimInfoRange(center,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+        }
+
+        void deterministic_order(const PrimInfoRange& pinfo)
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]);
+        }
+
+        void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          const size_t begin = pinfo.begin();
+          const size_t end   = pinfo.end();
+          const size_t center = (begin + end)/2;
+
+          CentGeomBBox3fa left(empty);
+          for (size_t i=begin; i<center; i++)
+            left.extend_center2(prims[i]);
+          new (&linfo) PrimInfoRange(begin,center,left);
+
+          CentGeomBBox3fa right(empty);
+          for (size_t i=center; i<end; i++)
+            right.extend_center2(prims[i]);
+          new (&rinfo) PrimInfoRange(center,end,right);
+        }
+
+        void splitByGeometry(const range<size_t>& range, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          assert(range.size() > 1);
+          CentGeomBBox3fa left(empty);
+          CentGeomBBox3fa right(empty);
+          unsigned int geomID = prims[range.begin()].geomID();
+          size_t center = serial_partitioning(prims,range.begin(),range.end(),left,right,
+                                              [&] ( const PrimRef& prim ) { return prim.geomID() == geomID; },
+                                              [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); });
+
+          new (&linfo) PrimInfoRange(range.begin(),center,left);
+          new (&rinfo) PrimInfoRange(center,range.end(),right);
+        }
+
+      private:
+        PrimRef* const prims;
+      };
+
+    /*! Performs standard object binning */
+    template<typename PrimRefMB, size_t BINS>
+      struct HeuristicArrayBinningMB
+      {
+        typedef BinSplit<BINS> Split;
+        typedef typename PrimRefMB::BBox BBox;
+        typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        /*! finds the best split */
+        const Split find(const SetMB& set, const size_t logBlockSize)
+        {
+          ObjectBinner binner(empty);
+          const BinMapping<BINS> mapping(set.size(),set.centBounds);
+          bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping);
+          Split osplit = binner.best(mapping,logBlockSize);
+          osplit.sah *= set.time_range.size();
+          if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return osplit;
+        }
+        
+        /*! array partitioning */
+        __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfoMB left = empty;
+          PrimInfoMB right = empty;
+          const vint4 vSplitPos(split.pos);
+          const vbool4 vSplitMask(1 << split.dim);
+          auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); };
+          auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+          auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+          size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+          new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range);
+          new (&rset) SetMB(right,set.prims,range<size_t>(center,end  ),set.time_range);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree/kernels/builders/heuristic_binning_array_unaligned.h
new file mode 100644
index 0000000000..34a7f121bb
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/heuristic_binning_array_unaligned.h
@@ -0,0 +1,302 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning */
+    template<typename PrimRef, size_t BINS>
+      struct UnalignedHeuristicArrayBinningSAH
+      {
+        typedef BinSplit<BINS> Split;
+        typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+        typedef range<size_t> Set;
+
+        __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required?
+          : scene(nullptr), prims(nullptr) {}
+        
+        /*! remember prim array */
+        __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims)
+          : scene(scene), prims(prims) {}
+
+        const LinearSpace3fa computeAlignedSpace(const range<size_t>& set)
+        {
+          Vec3fa axis(0,0,1);
+          uint64_t bestGeomPrimID = -1;
+
+          /*! find curve with minimum ID that defines valid direction */
+          for (size_t i=set.begin(); i<set.end(); i++)
+          {
+            const unsigned int geomID = prims[i].geomID();
+            const unsigned int primID = prims[i].primID();
+            const uint64_t geomprimID = prims[i].ID64();
+            if (geomprimID >= bestGeomPrimID) continue;
+            const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID);
+            if (sqr_length(axis1) > 1E-18f) {
+              axis = normalize(axis1);
+              bestGeomPrimID = geomprimID;
+            }
+          }
+          return frame(axis).transposed();
+        }
+        
+        const PrimInfo computePrimInfo(const range<size_t>& set, const LinearSpace3fa& space)
+        {
+          auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa
+            {
+              CentGeomBBox3fa bounds(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                Geometry* mesh = scene->get(prims[i].geomID());
+                bounds.extend(mesh->vbounds(space,prims[i].primID()));
+              }
+              return bounds;
+            };
+          
+          const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096), 
+                                                         CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+          return PrimInfo(set.begin(),set.end(),bounds);
+        }
+
+        struct BinBoundsAndCenter
+        {
+          __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space)
+            : scene(scene), space(space) {}
+          
+            /*! returns center for binning */
+          __forceinline Vec3fa binCenter(const PrimRef& ref) const
+          {
+            Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+            BBox3fa bounds = mesh->vbounds(space,ref.primID());
+            return embree::center2(bounds);
+          }
+          
+          /*! returns bounds and centroid used for binning */
+          __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const
+          {
+            Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+            BBox3fa bounds = mesh->vbounds(space,ref.primID());
+            bounds_o = bounds;
+            center_o = embree::center2(bounds);
+          }
+
+        private:
+          Scene* scene;
+          const LinearSpace3fa space;
+        };
+        
+        /*! finds the best split */
+        __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          if (likely(pinfo.size() < 10000))
+            return find_template<false>(pinfo,logBlockSize,space);
+          else
+            return find_template<true>(pinfo,logBlockSize,space);
+        }
+
+        /*! finds the best split */
+        template<bool parallel>
+        const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          Binner binner(empty);
+          const BinMapping<BINS> mapping(set);
+          BinBoundsAndCenter binBoundsAndCenter(scene,space);
+          bin_serial_or_parallel<parallel>(binner,prims,set.begin(),set.end(),size_t(4096),mapping,binBoundsAndCenter);
+          return binner.best(mapping,logBlockSize);
+        }
+        
+        /*! array partitioning */
+        __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (likely(set.size() < 10000))
+            split_template<false>(split,space,set,lset,rset);
+          else
+            split_template<true>(split,space,set,lset,rset);
+        }
+
+        /*! array partitioning */
+        template<bool parallel>
+        __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (!split.valid()) {
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+          
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          CentGeomBBox3fa local_left(empty);
+          CentGeomBBox3fa local_right(empty);
+          const int splitPos = split.pos;
+          const int splitDim = split.dim;
+          BinBoundsAndCenter binBoundsAndCenter(scene,space);
+
+          size_t center = 0;
+          if (likely(set.size() < 10000))
+            center = serial_partitioning(prims,begin,end,local_left,local_right,
+                                         [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+                                         [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+          else
+            center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right,
+                                           [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+                                           [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+                                           [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+                                           128);
+          
+          new (&lset) PrimInfoRange(begin,center,local_left);
+          new (&rset) PrimInfoRange(center,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+        }
+        
+        void deterministic_order(const range<size_t>& set) 
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims[set.begin()],&prims[set.end()]);
+        }
+        
+        void splitFallback(const range<size_t>& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          const size_t center = (begin + end)/2;
+          
+          CentGeomBBox3fa left(empty);
+          for (size_t i=begin; i<center; i++)
+            left.extend_center2(prims[i]);
+          new (&lset) PrimInfoRange(begin,center,left);
+          
+          CentGeomBBox3fa right(empty);
+          for (size_t i=center; i<end; i++)
+            right.extend_center2(prims[i]);
+          new (&rset) PrimInfoRange(center,end,right);
+        }
+        
+      private:
+        Scene* const scene;
+        PrimRef* const prims;
+      };
+
+    /*! Performs standard object binning */
+    template<typename PrimRefMB, size_t BINS>
+      struct UnalignedHeuristicArrayBinningMB
+      {
+        typedef BinSplit<BINS> Split;
+        typedef typename PrimRefMB::BBox BBox;
+        typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+        
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        UnalignedHeuristicArrayBinningMB(Scene* scene)
+        : scene(scene) {}
+
+        const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set)
+        {
+          Vec3fa axis0(0,0,1);
+          uint64_t bestGeomPrimID = -1;
+
+          /*! find curve with minimum ID that defines valid direction */
+          for (size_t i=set.begin(); i<set.end(); i++)
+          {
+            const PrimRefMB& prim = (*set.prims)[i];
+            const unsigned int geomID = prim.geomID();
+            const unsigned int primID = prim.primID();
+            const uint64_t geomprimID = prim.ID64();
+            if (geomprimID >= bestGeomPrimID) continue;
+            
+            const Geometry* mesh = scene->get(geomID);
+            const range<int> tbounds = mesh->timeSegmentRange(set.time_range);
+            if (tbounds.size() == 0) continue;
+
+            const size_t t = (tbounds.begin()+tbounds.end())/2;
+            const Vec3fa axis1 = mesh->computeDirection(primID,t);
+            if (sqr_length(axis1) > 1E-18f) {
+              axis0 = normalize(axis1);
+              bestGeomPrimID = geomprimID;
+            }
+          }
+
+          return frame(axis0).transposed();
+        }
+
+        struct BinBoundsAndCenter
+        {
+          __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space)
+            : scene(scene), time_range(time_range), space(space) {}
+          
+          /*! returns center for binning */
+          template<typename PrimRef>
+          __forceinline Vec3fa binCenter(const PrimRef& ref) const
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            return center2(lbounds.interpolate(0.5f));
+          }
+
+          /*! returns bounds and centroid used for binning */
+          __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            bounds_o = lbounds.interpolate(0.5f);
+            center_o = center2(bounds_o);
+          }
+
+          /*! returns bounds and centroid used for binning */
+          __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            bounds_o = lbounds;
+            center_o = center2(lbounds.interpolate(0.5f));
+          }
+          
+        private:
+          Scene* scene;
+          BBox1f time_range;
+          const LinearSpace3fa space;
+        };
+
+        /*! finds the best split */
+        const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+          ObjectBinner binner(empty);
+          const BinMapping<BINS> mapping(set.size(),set.centBounds);
+          bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter);
+          Split osplit = binner.best(mapping,logBlockSize);
+          osplit.sah *= set.time_range.size();
+          if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return osplit;
+        }
+        
+        /*! array partitioning */
+        __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfoMB left = empty;
+          PrimInfoMB right = empty;
+          const vint4 vSplitPos(split.pos);
+          const vbool4 vSplitMask(1 << split.dim);
+          auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); };
+          auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+          auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+          size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+          new (&lset) SetMB(left,set.prims,range<size_t>(begin,center),set.time_range);
+          new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range);
+        }
+
+      private:
+        Scene* scene;
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h
new file mode 100644
index 0000000000..4249d16ea1
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h
@@ -0,0 +1,443 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+// TODO: 
+//       - adjust parallel build thresholds
+//       - openNodesBasedOnExtend should consider max extended size
+  
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+/* stop opening of all bref.geomIDs are the same */
+#define EQUAL_GEOMID_STOP_CRITERIA 1
+
+/* 10% spatial extend threshold */
+#define MAX_EXTEND_THRESHOLD   0.1f
+
+/* maximum is 8 children */
+#define MAX_OPENED_CHILD_NODES 8
+
+/* open until all build refs are below threshold size in one step */
+#define USE_LOOP_OPENING 0
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning */
+    template<typename NodeOpenerFunc, typename PrimRef, size_t OBJECT_BINS>
+      struct HeuristicArrayOpenMergeSAH
+      {
+        typedef BinSplit<OBJECT_BINS> Split;
+        typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> Binner;
+        
+        static const size_t PARALLEL_THRESHOLD = 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 512;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        static const size_t MOVE_STEP_SIZE = 64;
+        static const size_t CREATE_SPLITS_STEP_SIZE = 128;
+
+        __forceinline HeuristicArrayOpenMergeSAH ()
+          : prims0(nullptr) {}
+        
+        /*! remember prim array */
+        __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size)
+          : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size) 
+        {
+          assert(max_open_size <= MAX_OPENED_CHILD_NODES);
+        }
+
+        struct OpenHeuristic
+        {
+          __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo )
+          {
+            const Vec3fa diag = pinfo.geomBounds.size();
+            dim = maxDim(diag);
+            assert(diag[dim] > 0.0f);
+            inv_max_extend = 1.0f / diag[dim];
+          }
+
+          __forceinline bool operator () ( PrimRef& prim ) const {
+            return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD;
+          }
+
+        private:
+          size_t dim;
+          float inv_max_extend;
+        };
+
+        /*! compute extended ranges */
+        __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+        {
+          assert(set.ext_range_size() > 0);
+          const float left_factor           = (float)lweight / (lweight + rweight);
+          const size_t ext_range_size       = set.ext_range_size();
+          const size_t left_ext_range_size  = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size);
+          const size_t right_ext_range_size = ext_range_size - left_ext_range_size;
+          lset.set_ext_range(lset.end() + left_ext_range_size);
+          rset.set_ext_range(rset.end() + right_ext_range_size);
+        }
+
+        /*! move ranges */
+        __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t left_ext_range_size = lset.ext_range_size();
+          const size_t right_size = rset.size();
+
+          /* has the left child an extended range? */
+          if (left_ext_range_size > 0)
+          {
+            /* left extended range smaller than right range ? */
+            if (left_ext_range_size < right_size)
+            {
+              /* only move a small part of the beginning of the right range to the end */
+              parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) {                  
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+right_size] = prims0[i];
+                });
+            }
+            else
+            {
+              /* no overlap, move entire right range to new location, can be made fully parallel */
+              parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE,  [&](const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+left_ext_range_size] = prims0[i];
+                });
+            }
+            /* update right range */
+            assert(rset.ext_end() + left_ext_range_size == set.ext_end());
+            rset.move_right(left_ext_range_size);
+          }
+        }
+
+        /* estimates the extra space required when opening, and checks if all primitives are from same geometry */
+        __noinline std::pair<size_t,bool> getProperties(const PrimInfoExtRange& set)
+        {
+          const OpenHeuristic heuristic(set);
+          const unsigned int geomID = prims0[set.begin()].geomID();
+          
+          auto body = [&] (const range<size_t>& r) -> std::pair<size_t,bool> { 
+            bool commonGeomID = true;
+            size_t opens = 0;
+            for (size_t i=r.begin(); i<r.end(); i++) {
+              commonGeomID &= prims0[i].geomID() == geomID; 
+              if (heuristic(prims0[i]))
+                opens += prims0[i].node.getN()-1; // coarse approximation
+            }
+            return std::pair<size_t,bool>(opens,commonGeomID); 
+          };
+          auto reduction = [&] (const std::pair<size_t,bool>& b0, const std::pair<size_t,bool>& b1) -> std::pair<size_t,bool> { 
+            return std::pair<size_t,bool>(b0.first+b1.first,b0.second && b1.second); 
+          };
+          return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,std::pair<size_t,bool>(0,true),body,reduction);
+        }
+
+        // FIXME: should consider maximum available extended size 
+        __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set)
+        {
+          const OpenHeuristic heuristic(set);
+          const size_t ext_range_start = set.end();
+
+          if (false && set.size() < PARALLEL_THRESHOLD) 
+          {
+            size_t extra_elements = 0;
+            for (size_t i=set.begin(); i<set.end(); i++)
+            {
+              if (heuristic(prims0[i]))
+              {
+                PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                assert(extra_elements + n-1 <= set.ext_range_size());
+                for (size_t j=0; j<n; j++)
+                  set.extend_center2(tmp[j]);
+
+                prims0[i] = tmp[0];
+                for (size_t j=1; j<n; j++)
+                  prims0[ext_range_start+extra_elements+j-1] = tmp[j]; 
+                extra_elements += n-1;
+              }
+            }
+            set._end += extra_elements;
+          }
+          else 
+          {
+            std::atomic<size_t> ext_elements;
+            ext_elements.store(0);
+            PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range<size_t>& r) -> PrimInfo {
+                PrimInfo info(empty);
+                for (size_t i=r.begin(); i<r.end(); i++)
+                  if (heuristic(prims0[i]))
+                  {
+                    PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                    const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                    const size_t ID = ext_elements.fetch_add(n-1);
+                    assert(ID + n-1 <= set.ext_range_size());
+
+                    for (size_t j=0; j<n; j++)
+                      info.extend_center2(tmp[j]);
+
+                    prims0[i] = tmp[0];
+                    for (size_t j=1; j<n; j++)
+                      prims0[ext_range_start+ID+j-1] = tmp[j]; 
+                  }
+                return info;
+              }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+            set.centBounds.extend(info.centBounds);
+            assert(ext_elements.load() <= set.ext_range_size());
+            set._end += ext_elements.load();
+          }
+        } 
+
+        __noinline void openNodesBasedOnExtendLoop(PrimInfoExtRange& set, const size_t est_new_elements)
+        {
+          const OpenHeuristic heuristic(set);
+          size_t next_iteration_extra_elements = est_new_elements;          
+          
+          while (next_iteration_extra_elements <= set.ext_range_size()) 
+          {
+            next_iteration_extra_elements = 0;
+            size_t extra_elements = 0;
+            const size_t ext_range_start = set.end();
+
+            for (size_t i=set.begin(); i<set.end(); i++)
+            {
+              if (heuristic(prims0[i]))
+              {
+                PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                assert(extra_elements + n-1 <= set.ext_range_size());
+                for (size_t j=0;j<n;j++)
+                  set.extend_center2(tmp[j]);
+                  
+                prims0[i] = tmp[0];
+                for (size_t j=1;j<n;j++)
+                  prims0[ext_range_start+extra_elements+j-1] = tmp[j]; 
+                extra_elements += n-1;
+
+                for (size_t j=0; j<n; j++)
+                  if (heuristic(tmp[j]))
+                    next_iteration_extra_elements += tmp[j].node.getN()-1; // coarse approximation
+
+              }
+            }
+            assert( extra_elements <= set.ext_range_size());
+            set._end += extra_elements;
+
+            for (size_t i=set.begin();i<set.end();i++)
+              assert(prims0[i].numPrimitives() > 0);
+
+            if (unlikely(next_iteration_extra_elements == 0)) break;
+          }
+        } 
+
+        __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          /* single element */
+          if (set.size() <= 1)
+            return Split();
+
+          /* disable opening if there is no overlap */
+          const size_t D = 4;
+          if (unlikely(set.has_ext_range() && set.size() <= D))
+          {
+            bool disjoint = true;
+            for (size_t j=set.begin(); j<set.end()-1; j++) {
+              for (size_t i=set.begin()+1; i<set.end(); i++) {
+                if (conjoint(prims0[j].bounds(),prims0[i].bounds())) { 
+                  disjoint = false; break; 
+                }
+              }
+            }
+            if (disjoint) set.set_ext_range(set.end()); /* disables opening */
+          }
+
+          std::pair<size_t,bool> p(0,false);
+
+          /* disable opening when all primitives are from same geometry */
+          if (unlikely(set.has_ext_range()))
+          {
+            p =  getProperties(set);
+#if EQUAL_GEOMID_STOP_CRITERIA == 1
+            if (p.second) set.set_ext_range(set.end()); /* disable opening */
+#endif         
+          }
+
+          /* open nodes when we have sufficient space available */
+          if (unlikely(set.has_ext_range()))
+          {
+#if USE_LOOP_OPENING == 1
+            openNodesBasedOnExtendLoop(set,p.first);
+#else
+            if (p.first <= set.ext_range_size())
+              openNodesBasedOnExtend(set);
+#endif
+
+            /* disable opening when unsufficient space for opening a node available */
+            if (set.ext_range_size() < max_open_size-1) 
+              set.set_ext_range(set.end()); /* disable opening */
+          }
+                    
+          /* find best split */
+          return object_find(set,logBlockSize);
+        }
+
+
+        /*! finds the best object split */
+        __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize)
+        {
+          if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize);
+          else                                 return parallel_object_find  (set,logBlockSize);
+        }
+
+        /*! finds the best object split */
+        __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          Binner binner(empty); 
+          const BinMapping<OBJECT_BINS> mapping(set.centBounds);
+          binner.bin(prims0,set.begin(),set.end(),mapping);
+          return binner.best(mapping,logBlockSize);
+        }
+
+        /*! finds the best split */
+        __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          Binner binner(empty);
+          const BinMapping<OBJECT_BINS> mapping(set.centBounds);
+          const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          auto body = [&] (const range<size_t>& r) -> Binner { 
+            Binner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; 
+          };
+          auto reduction = [&] (const Binner& b0, const Binner& b1) -> Binner { 
+            Binner r = b0; r.merge(b1,_mapping.size()); return r; 
+          };
+          binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,body,reduction);
+          return binner.best(mapping,logBlockSize);
+        }
+        
+        /*! array partitioning */
+        __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          PrimInfoExtRange set = set_i;
+
+          /* valid split */
+          if (unlikely(!split.valid())) {
+            deterministic_order(set);
+            splitFallback(set,lset,rset);
+            return;
+          }
+
+          std::pair<size_t,size_t> ext_weights(0,0);
+
+          /* object split */
+          if (likely(set.size() < PARALLEL_THRESHOLD)) 
+            ext_weights = sequential_object_split(split,set,lset,rset);
+          else
+            ext_weights = parallel_object_split(split,set,lset,rset);
+
+          /* if we have an extended range, set extended child ranges and move right split range */
+          if (unlikely(set.has_ext_range())) 
+          {
+            setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+
+        /*! array partitioning */
+        std::pair<size_t,size_t> sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo local_left(empty);
+          PrimInfo local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; 
+
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+
+          size_t center = serial_partitioning(prims0,
+                                              begin,end,local_left,local_right,
+                                              [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); },
+                                              [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); });          
+          
+          new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+          new (&rset) PrimInfoExtRange(center,end,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(local_left.size(),local_right.size());
+        }
+
+        /*! array partitioning */
+        __noinline std::pair<size_t,size_t> parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+          auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+          const size_t center = parallel_partitioning(
+            prims0,begin,end,EmptyTy(),left,right,isLeft,
+            [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); },
+            [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+
+          return std::pair<size_t,size_t>(left.size(),right.size());
+        }
+
+        void deterministic_order(const extended_range<size_t>& set) 
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims0[set.begin()],&prims0[set.end()]);
+        }
+
+        __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          const size_t center = (begin + end)/2;
+
+          PrimInfo left(empty);
+          for (size_t i=begin; i<center; i++)
+            left.add_center2(prims0[i]);
+
+          const size_t lweight = left.end;
+          
+          PrimInfo right(empty);
+          for (size_t i=center; i<end; i++)
+            right.add_center2(prims0[i]);	
+
+          const size_t rweight = right.end;
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          /* if we have an extended range */
+          if (set.has_ext_range()) 
+          {
+            setExtentedRanges(set,lset,rset,lweight,rweight);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+        
+      private:
+        PrimRef* const prims0;
+        const NodeOpenerFunc& nodeOpenerFunc;
+        size_t max_open_size;
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial.h b/thirdparty/embree/kernels/builders/heuristic_spatial.h
new file mode 100644
index 0000000000..a6939ba258
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/heuristic_spatial.h
@@ -0,0 +1,414 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "priminfo.h"
+
+namespace embree
+{
+  static const unsigned int RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS = 5;
+
+  namespace isa
+  {
+
+    /*! mapping into bins */
+    template<size_t BINS>
+      struct SpatialBinMapping
+      {
+      public:
+        __forceinline SpatialBinMapping() {}
+        
+        /*! calculates the mapping */
+        __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo)
+        {
+          const vfloat4 lower = (vfloat4) pinfo.geomBounds.lower;
+          const vfloat4 upper = (vfloat4) pinfo.geomBounds.upper;
+          const vfloat4 eps = 128.0f*vfloat4(ulp)*max(abs(lower),abs(upper));
+          const vfloat4 diag = max(eps,(vfloat4) pinfo.geomBounds.size());
+          scale = select(upper-lower <= eps,vfloat4(0.0f),vfloat4(BINS)/diag);
+          ofs  = (vfloat4) pinfo.geomBounds.lower;
+          inv_scale = 1.0f / scale; 
+        }
+
+        /*! slower but safe binning */
+        __forceinline vint4 bin(const Vec3fa& p) const
+        {
+          const vint4 i = floori((vfloat4(p)-ofs)*scale);
+          return clamp(i,vint4(0),vint4(BINS-1));
+        }
+
+        __forceinline std::pair<vint4,vint4> bin(const BBox3fa& b) const
+        {
+#if defined(__AVX__)
+          const vfloat8 ofs8(ofs);
+          const vfloat8 scale8(scale);
+          const vint8 lu   = floori((vfloat8::loadu(&b)-ofs8)*scale8);
+          const vint8 c_lu = clamp(lu,vint8(zero),vint8(BINS-1));
+          return std::pair<vint4,vint4>(extract4<0>(c_lu),extract4<1>(c_lu));
+#else
+          const vint4 lower = floori((vfloat4(b.lower)-ofs)*scale);
+          const vint4 upper = floori((vfloat4(b.upper)-ofs)*scale);
+          const vint4 c_lower = clamp(lower,vint4(0),vint4(BINS-1));
+          const vint4 c_upper = clamp(upper,vint4(0),vint4(BINS-1));
+          return std::pair<vint4,vint4>(c_lower,c_upper);
+#endif
+        }
+
+        
+        /*! calculates left spatial position of bin */
+        __forceinline float pos(const size_t bin, const size_t dim) const {
+          return madd(float(bin),inv_scale[dim],ofs[dim]);
+        }
+
+        /*! calculates left spatial position of bin */
+        template<size_t N>
+        __forceinline vfloat<N> posN(const vfloat<N> bin, const size_t dim) const {
+          return madd(bin,vfloat<N>(inv_scale[dim]),vfloat<N>(ofs[dim]));
+        }
+        
+        /*! returns true if the mapping is invalid in some dimension */
+        __forceinline bool invalid(const size_t dim) const {
+          return scale[dim] == 0.0f;
+        }
+        
+      public:
+        vfloat4 ofs,scale,inv_scale;  //!< linear function that maps to bin ID
+      };
+
+    /*! stores all information required to perform some split */
+    template<size_t BINS>
+      struct SpatialBinSplit
+      {
+        /*! construct an invalid split by default */
+        __forceinline SpatialBinSplit() 
+          : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {}
+        
+        /*! constructs specified split */
+        __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), left(-1), right(-1), factor(1.0f), mapping(mapping) {}
+
+        /*! constructs specified split */
+        __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {}
+        
+        /*! tests if this split is valid */
+        __forceinline bool valid() const { return dim != -1; }
+        
+        /*! calculates surface area heuristic for performing the split */
+        __forceinline float splitSAH() const { return sah; }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) {
+          return cout << "SpatialBinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << ", left = " << split.left << ", right = " << split.right << ", factor = " << split.factor << "}";
+        }
+        
+      public:
+        float sah;                 //!< SAH cost of the split
+        int   dim;                 //!< split dimension
+        int   pos;                 //!< split position
+        int   left;                //!< number of elements on the left side
+        int   right;               //!< number of elements on the right side
+        float factor;              //!< factor splitting the extended range
+        SpatialBinMapping<BINS> mapping; //!< mapping into bins
+      };    
+    
+    /*! stores all binning information */
+    template<size_t BINS, typename PrimRef>
+      struct __aligned(64) SpatialBinInfo
+    {
+      SpatialBinInfo() {
+      }
+
+      __forceinline SpatialBinInfo(EmptyTy) {
+	clear();
+      }
+
+      /*! clears the bin info */
+      __forceinline void clear() 
+      {
+        for (size_t i=0; i<BINS; i++) { 
+          bounds[i][0] = bounds[i][1] = bounds[i][2] = empty;
+          numBegin[i] = numEnd[i] = 0;
+        }
+      }
+      
+      /*! adds binning data */
+      __forceinline void add(const size_t dim,
+                             const size_t beginID, 
+                             const size_t endID, 
+                             const size_t binID, 
+                             const BBox3fa &b,
+                             const size_t n = 1) 
+      {
+        assert(beginID < BINS);
+        assert(endID < BINS);
+        assert(binID < BINS);
+
+        numBegin[beginID][dim]+=(unsigned int)n;
+        numEnd  [endID][dim]+=(unsigned int)n;
+        bounds  [binID][dim].extend(b);        
+      }
+
+      /*! extends binning bounds */
+      __forceinline void extend(const size_t dim,
+                                const size_t binID, 
+                                const BBox3fa &b) 
+      {
+        assert(binID < BINS);
+        bounds  [binID][dim].extend(b);        
+      }
+      
+      /*! bins an array of triangles */
+      template<typename SplitPrimitive>
+        __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=0; i<N; i++)
+        {
+          const PrimRef prim = prims[i];
+          unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+          if (unlikely(splits == 1))
+          {
+            const vint4 bin = mapping.bin(center(prim.bounds()));
+            for (size_t dim=0; dim<3; dim++) 
+            {
+              assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS);
+              numBegin[bin[dim]][dim]++;
+              numEnd  [bin[dim]][dim]++;
+              bounds  [bin[dim]][dim].extend(prim.bounds());
+            }
+          } 
+          else
+          {
+            const vint4 bin0 = mapping.bin(prim.bounds().lower);
+            const vint4 bin1 = mapping.bin(prim.bounds().upper);
+            
+            for (size_t dim=0; dim<3; dim++) 
+            {
+              size_t bin;
+              PrimRef rest = prim;
+              size_t l = bin0[dim];
+              size_t r = bin1[dim];
+
+              // same bin optimization
+              if (likely(l == r)) 
+              {
+                numBegin[l][dim]++;
+                numEnd  [l][dim]++;
+                bounds  [l][dim].extend(prim.bounds());
+                continue;
+              }
+
+              for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) 
+              {
+                const float pos = mapping.pos(bin+1,dim);
+                
+                PrimRef left,right;
+                splitPrimitive(rest,(int)dim,pos,left,right);
+                if (unlikely(left.bounds().empty())) l++;                
+                bounds[bin][dim].extend(left.bounds());
+                rest = right;
+              }
+              if (unlikely(rest.bounds().empty())) r--;
+              numBegin[l][dim]++;
+              numEnd  [r][dim]++;
+              bounds  [bin][dim].extend(rest.bounds());
+            }
+          }
+        }
+      }
+      
+      /*! bins a range of primitives inside an array */
+      template<typename SplitPrimitive>
+        void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) {
+	bin(splitPrimitive,prims+begin,end-begin,mapping);
+      }
+
+      /*! bins an array of primitives */
+      template<typename PrimitiveSplitterFactory>
+        __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=begin; i<end; i++)
+        {
+          const PrimRef &prim = source[i];
+          const vint4 bin0 = mapping.bin(prim.bounds().lower);
+          const vint4 bin1 = mapping.bin(prim.bounds().upper);
+          
+          for (size_t dim=0; dim<3; dim++) 
+          {
+            if (unlikely(mapping.invalid(dim))) 
+              continue;
+            
+            size_t bin;
+            size_t l = bin0[dim];
+            size_t r = bin1[dim];
+            
+            // same bin optimization
+            if (likely(l == r)) 
+            {
+              add(dim,l,l,l,prim.bounds());
+              continue;
+            }
+            const size_t bin_start = bin0[dim];
+            const size_t bin_end   = bin1[dim];
+            BBox3fa rest = prim.bounds();
+            const auto splitter = splitterFactory(prim);
+            for (bin=bin_start; bin<bin_end; bin++) 
+            {
+              const float pos = mapping.pos(bin+1,dim);
+              BBox3fa left,right;
+              splitter(rest,dim,pos,left,right);
+              if (unlikely(left.empty())) l++;                
+              extend(dim,bin,left);
+              rest = right;
+            }
+            if (unlikely(rest.empty())) r--;
+            add(dim,l,r,bin,rest);
+          }
+        }              
+      }
+
+
+
+      /*! bins an array of primitives */
+      __forceinline void binSubTreeRefs(const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=begin; i<end; i++)
+        {
+          const PrimRef &prim = source[i];
+          const vint4 bin0 = mapping.bin(prim.bounds().lower);
+          const vint4 bin1 = mapping.bin(prim.bounds().upper);
+          
+          for (size_t dim=0; dim<3; dim++) 
+          {
+            if (unlikely(mapping.invalid(dim))) 
+              continue;
+            
+            const size_t l = bin0[dim];
+            const size_t r = bin1[dim];
+
+            const unsigned int n  = prim.primID();
+            
+            // same bin optimization
+            if (likely(l == r)) 
+            {
+              add(dim,l,l,l,prim.bounds(),n);
+              continue;
+            }
+            const size_t bin_start = bin0[dim];
+            const size_t bin_end   = bin1[dim];
+            for (size_t bin=bin_start; bin<bin_end; bin++) 
+              add(dim,l,r,bin,prim.bounds(),n);
+          }
+        }              
+      }
+      
+      /*! merges in other binning information */
+      void merge (const SpatialBinInfo& other)
+      {
+        for (size_t i=0; i<BINS; i++) 
+        {
+          numBegin[i] += other.numBegin[i];
+          numEnd  [i] += other.numEnd  [i];
+          bounds[i][0].extend(other.bounds[i][0]);
+          bounds[i][1].extend(other.bounds[i][1]);
+          bounds[i][2].extend(other.bounds[i][2]);
+        }
+      }
+
+      /*! merges in other binning information */
+      static __forceinline const SpatialBinInfo reduce (const SpatialBinInfo& a, const SpatialBinInfo& b)
+      {
+        SpatialBinInfo c(empty);
+        for (size_t i=0; i<BINS; i++) 
+        {
+          c.numBegin[i] += a.numBegin[i]+b.numBegin[i];
+          c.numEnd  [i] += a.numEnd  [i]+b.numEnd  [i];
+          c.bounds[i][0] = embree::merge(a.bounds[i][0],b.bounds[i][0]);
+          c.bounds[i][1] = embree::merge(a.bounds[i][1],b.bounds[i][1]);
+          c.bounds[i][2] = embree::merge(a.bounds[i][2],b.bounds[i][2]);
+        }
+        return c;
+      }
+      
+      /*! finds the best split by scanning binning information */
+      SpatialBinSplit<BINS> best(const SpatialBinMapping<BINS>& mapping, const size_t blocks_shift) const 
+      {
+        /* sweep from right to left and compute parallel prefix of merged bounds */
+        vfloat4 rAreas[BINS];
+        vuint4 rCounts[BINS];
+        vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty;
+        for (size_t i=BINS-1; i>0; i--)
+        {
+          count += numEnd[i];
+          rCounts[i] = count;
+          bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx);
+          by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by);
+          bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz);
+          rAreas[i][3] = 0.0f;
+        }
+        
+        /* sweep from left to right and compute SAH */
+        vuint4 blocks_add = (1 << blocks_shift)-1;
+        vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0;
+        count = 0; bx = empty; by = empty; bz = empty;
+        for (size_t i=1; i<BINS; i++, ii+=1)
+        {
+          count += numBegin[i-1];
+          bx.extend(bounds[i-1][0]); float Ax = halfArea(bx);
+          by.extend(bounds[i-1][1]); float Ay = halfArea(by);
+          bz.extend(bounds[i-1][2]); float Az = halfArea(bz);
+          const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+          const vfloat4 rArea = rAreas[i];
+          const vuint4 lCount = (count     +blocks_add) >> (unsigned int)(blocks_shift);
+          const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+          const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
+          // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+          const vbool4 mask = sah < vbestSAH;
+          vbestPos      = select(mask,ii ,vbestPos);
+          vbestSAH      = select(mask,sah,vbestSAH);
+          vbestlCount   = select(mask,count,vbestlCount);
+          vbestrCount   = select(mask,rCounts[i],vbestrCount);
+        }
+        
+        /* find best dimension */
+        float bestSAH = inf;
+        int   bestDim = -1;
+        int   bestPos = 0;
+        unsigned int   bestlCount = 0;
+        unsigned int   bestrCount = 0;
+        for (int dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+          
+          /* test if this is a better dimension */
+          if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+            bestDim = dim;
+            bestPos = vbestPos[dim];
+            bestSAH = vbestSAH[dim];
+            bestlCount = vbestlCount[dim];
+            bestrCount = vbestrCount[dim];
+          }
+        }
+        assert(bestSAH >= 0.0f);
+        
+        /* return invalid split if no split found */
+        if (bestDim == -1) 
+          return SpatialBinSplit<BINS>(inf,-1,0,mapping);
+        
+        /* return best found split */
+        return SpatialBinSplit<BINS>(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping);
+      }
+      
+    private:
+      BBox3fa bounds[BINS][3];  //!< geometry bounds for each bin in each dimension
+      vuint4    numBegin[BINS];   //!< number of primitives starting in bin
+      vuint4    numEnd[BINS];     //!< number of primitives ending in bin
+    };
+  }
+}
+
diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h
new file mode 100644
index 0000000000..60d235f48d
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h
@@ -0,0 +1,546 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+#if 0
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f
+#else
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f
+#endif
+
+    struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range<size_t>
+    {
+      __forceinline PrimInfoExtRange() {
+      }
+
+      __forceinline PrimInfoExtRange(EmptyTy)
+        : CentGeomBBox3fa(EmptyTy()), extended_range<size_t>(0,0,0) {}
+
+      __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds) 
+        : CentGeomBBox3fa(centGeomBounds), extended_range<size_t>(begin,end,ext_end) {}
+      
+      __forceinline float leafSAH() const { 
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+    };
+
+    template<typename ObjectSplit, typename SpatialSplit>
+      struct Split2
+      {
+        __forceinline Split2 () {}
+        
+        __forceinline Split2 (const Split2& other) 
+        {
+          spatial = other.spatial;
+          sah = other.sah;
+          if (spatial) spatialSplit() = other.spatialSplit();
+          else         objectSplit()  = other.objectSplit();
+        }
+        
+        __forceinline Split2& operator= (const Split2& other) 
+        {
+          spatial = other.spatial;
+          sah = other.sah;
+          if (spatial) spatialSplit() = other.spatialSplit();
+          else         objectSplit()  = other.objectSplit();
+          return *this;
+        }
+          
+          __forceinline     ObjectSplit&  objectSplit()        { return *(      ObjectSplit*)data; }
+        __forceinline const ObjectSplit&  objectSplit() const  { return *(const ObjectSplit*)data; }
+        
+        __forceinline       SpatialSplit& spatialSplit()       { return *(      SpatialSplit*)data; }
+        __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; }
+        
+        __forceinline Split2 (const ObjectSplit& objectSplit, float sah)
+          : spatial(false), sah(sah) 
+        {
+          new (data) ObjectSplit(objectSplit);
+        }
+        
+        __forceinline Split2 (const SpatialSplit& spatialSplit, float sah)
+          : spatial(true), sah(sah) 
+        {
+          new (data) SpatialSplit(spatialSplit);
+        }
+        
+        __forceinline float splitSAH() const { 
+          return sah; 
+        }
+        
+        __forceinline bool valid() const {
+          return sah < float(inf);
+        }
+        
+      public:
+        __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)];
+        bool spatial;
+        float sah;
+      };
+    
+    /*! Performs standard object binning */
+    template<typename PrimitiveSplitterFactory, typename PrimRef, size_t OBJECT_BINS, size_t SPATIAL_BINS>
+      struct HeuristicArraySpatialSAH
+      {
+        typedef BinSplit<OBJECT_BINS> ObjectSplit;
+        typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> ObjectBinner;
+
+        typedef SpatialBinSplit<SPATIAL_BINS> SpatialSplit;
+        typedef SpatialBinInfo<SPATIAL_BINS,PrimRef> SpatialBinner;
+
+        //typedef extended_range<size_t> Set;
+        typedef Split2<ObjectSplit,SpatialSplit> Split;
+        
+        static const size_t PARALLEL_THRESHOLD = 3*1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        static const size_t MOVE_STEP_SIZE = 64;
+        static const size_t CREATE_SPLITS_STEP_SIZE = 64;
+
+        __forceinline HeuristicArraySpatialSAH ()
+          : prims0(nullptr) {}
+        
+        /*! remember prim array */
+        __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info)
+          : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {}
+
+
+        /*! compute extended ranges */
+        __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+        {
+          assert(set.ext_range_size() > 0);
+          const float left_factor           = (float)lweight / (lweight + rweight);
+          const size_t ext_range_size       = set.ext_range_size();
+          const size_t left_ext_range_size  = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size);
+          const size_t right_ext_range_size = ext_range_size - left_ext_range_size;
+          lset.set_ext_range(lset.end() + left_ext_range_size);
+          rset.set_ext_range(rset.end() + right_ext_range_size);
+        }
+
+        /*! move ranges */
+        __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t left_ext_range_size = lset.ext_range_size();
+          const size_t right_size = rset.size();
+
+          /* has the left child an extended range? */
+          if (left_ext_range_size > 0)
+          {
+            /* left extended range smaller than right range ? */
+            if (left_ext_range_size < right_size)
+            {
+              /* only move a small part of the beginning of the right range to the end */
+              parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) {                  
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+right_size] = prims0[i];
+                });
+            }
+            else
+            {
+              /* no overlap, move entire right range to new location, can be made fully parallel */
+              parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE,  [&](const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+left_ext_range_size] = prims0[i];
+                });
+            }
+            /* update right range */
+            assert(rset.ext_end() + left_ext_range_size == set.ext_end());
+            rset.move_right(left_ext_range_size);
+          }
+        }
+
+        /*! finds the best split */
+        const Split find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          SplitInfo oinfo;
+          const ObjectSplit object_split = object_find(set,logBlockSize,oinfo);
+          const float object_split_sah = object_split.splitSAH();
+
+          if (unlikely(set.has_ext_range()))
+          {
+            const BBox3fa overlap = intersect(oinfo.leftBounds, oinfo.rightBounds);
+            
+            /* do only spatial splits if the child bounds overlap */
+            if (safeArea(overlap) >= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) &&
+                safeArea(overlap) >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds))
+            {              
+              const SpatialSplit spatial_split = spatial_find(set, logBlockSize);
+              const float spatial_split_sah = spatial_split.splitSAH();
+
+              /* valid spatial split, better SAH and number of splits do not exceed extended range */
+              if (spatial_split_sah < SPATIAL_ASPLIT_SAH_THRESHOLD*object_split_sah &&
+                  spatial_split.left + spatial_split.right - set.size() <= set.ext_range_size())
+              {          
+                return Split(spatial_split,spatial_split_sah);
+              }
+            }
+          }
+
+          return Split(object_split,object_split_sah);
+        }
+
+        /*! finds the best object split */
+        __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize,info);
+          else                                 return parallel_object_find  (set,logBlockSize,info);
+        }
+
+        /*! finds the best object split */
+        __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          ObjectBinner binner(empty); 
+          const BinMapping<OBJECT_BINS> mapping(set);
+          binner.bin(prims0,set.begin(),set.end(),mapping);
+          ObjectSplit s = binner.best(mapping,logBlockSize);
+          binner.getSplitInfo(mapping, s, info);
+          return s;
+        }
+
+        /*! finds the best split */
+        __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          ObjectBinner binner(empty);
+          const BinMapping<OBJECT_BINS> mapping(set);
+          const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,
+                                   [&] (const range<size_t>& r) -> ObjectBinner { ObjectBinner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; },
+                                   [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner r = b0; r.merge(b1,_mapping.size()); return r; });
+          ObjectSplit s = binner.best(mapping,logBlockSize);
+          binner.getSplitInfo(mapping, s, info);
+          return s;
+        }
+
+        /*! finds the best spatial split */
+        __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          if (set.size() < PARALLEL_THRESHOLD) return sequential_spatial_find(set, logBlockSize);
+          else                                 return parallel_spatial_find  (set, logBlockSize);
+        }
+
+        /*! finds the best spatial split */
+        __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          SpatialBinner binner(empty); 
+          const SpatialBinMapping<SPATIAL_BINS> mapping(set);
+          binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping);
+          /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+          return binner.best(mapping,logBlockSize); //,set.ext_size());
+        }
+
+        __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          SpatialBinner binner(empty);
+          const SpatialBinMapping<SPATIAL_BINS> mapping(set);
+          const SpatialBinMapping<SPATIAL_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,
+                                   [&] (const range<size_t>& r) -> SpatialBinner { 
+                                     SpatialBinner binner(empty); 
+                                     binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping);
+                                     return binner; },
+                                   [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); });
+          /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+          return binner.best(mapping,logBlockSize); //,set.ext_size());
+        }
+
+
+        /*! subdivides primitives based on a spatial split */
+        __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping<SPATIAL_BINS> &mapping)
+        {
+          assert(set.has_ext_range());
+          const size_t max_ext_range_size = set.ext_range_size();
+          const size_t ext_range_start = set.end();
+
+          /* atomic counter for number of primref splits */
+          std::atomic<size_t> ext_elements;
+          ext_elements.store(0);
+          
+          const float fpos = split.mapping.pos(split.pos,split.dim);
+        
+          const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+
+          parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range<size_t>& r) {
+              for (size_t i=r.begin();i<r.end();i++)
+              {
+                const unsigned int splits = prims0[i].geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+                if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */
+
+                //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim];
+                //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim];
+                //if (unlikely(bin0 < split.pos && bin1 >= split.pos))
+                if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos))
+                {
+                  assert(splits > 1);
+
+                  PrimRef left,right;
+                  const auto splitter = splitterFactory(prims0[i]);
+                  splitter(prims0[i],split.dim,fpos,left,right);
+                
+                  // no empty splits
+                  if (unlikely(left.bounds().empty() || right.bounds().empty())) continue;
+                
+                  left.lower.u  = (left.lower.u  & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+                  right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+
+                  const size_t ID = ext_elements.fetch_add(1);
+
+                  /* break if the number of subdivided elements are greater than the maximum allowed size */
+                  if (unlikely(ID >= max_ext_range_size)) 
+                    break;
+
+                  /* only write within the correct bounds */
+                  assert(ID < max_ext_range_size);
+                  prims0[i] = left;
+                  prims0[ext_range_start+ID] = right;     
+                }
+              }
+            });
+
+          const size_t numExtElements = min(max_ext_range_size,ext_elements.load());          
+          assert(set.end()+numExtElements<=set.ext_end());
+          set._end += numExtElements;
+        }
+        
+        /*! array partitioning */
+        void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          PrimInfoExtRange set = set_i;
+          
+          /* valid split */
+          if (unlikely(!split.valid())) {
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+
+          std::pair<size_t,size_t> ext_weights(0,0);
+
+          if (unlikely(split.spatial))
+          {
+            create_spatial_splits(set,split.spatialSplit(), split.spatialSplit().mapping); 
+
+            /* spatial split */
+            if (likely(set.size() < PARALLEL_THRESHOLD)) 
+              ext_weights = sequential_spatial_split(split.spatialSplit(),set,lset,rset);
+            else
+              ext_weights = parallel_spatial_split(split.spatialSplit(),set,lset,rset);
+          }
+          else
+          {
+            /* object split */
+            if (likely(set.size() < PARALLEL_THRESHOLD)) 
+              ext_weights = sequential_object_split(split.objectSplit(),set,lset,rset);
+            else
+              ext_weights = parallel_object_split(split.objectSplit(),set,lset,rset);
+          }
+
+          /* if we have an extended range, set extended child ranges and move right split range */
+          if (unlikely(set.has_ext_range())) 
+          {
+            setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+
+        /*! array partitioning */
+        std::pair<size_t,size_t> sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo local_left(empty);
+          PrimInfo local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; 
+
+          const typename ObjectBinner::vint vSplitPos(splitPos);
+          const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+          size_t center = serial_partitioning(prims0,
+                                              begin,end,local_left,local_right,
+                                              [&] (const PrimRef& ref) { 
+                                                return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask);
+                                              },
+                                              [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });          
+          const size_t left_weight  = local_left.end;
+          const size_t right_weight = local_right.end;
+
+          new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+          new (&rset) PrimInfoExtRange(center,end,end,local_right);
+
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+
+        /*! array partitioning */
+        __noinline std::pair<size_t,size_t> sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo local_left(empty);
+          PrimInfo local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; 
+
+          /* init spatial mapping */
+          const SpatialBinMapping<SPATIAL_BINS> &mapping = split.mapping;
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+
+          size_t center = serial_partitioning(prims0,
+                                              begin,end,local_left,local_right,
+                                              [&] (const PrimRef& ref) {
+                                                const Vec3fa c = ref.bounds().center();
+                                                return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); 
+                                              },
+                                              [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });
+
+          const size_t left_weight  = local_left.end;
+          const size_t right_weight = local_right.end;
+          
+          new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+          new (&rset) PrimInfoExtRange(center,end,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+
+        
+        /*! array partitioning */
+        __noinline std::pair<size_t,size_t> parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          const typename ObjectBinner::vint vSplitPos(splitPos);
+          const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+          auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+          const size_t center = parallel_partitioning(
+            prims0,begin,end,EmptyTy(),left,right,isLeft,
+            [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); },
+            [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+
+          const size_t left_weight  = left.end;
+          const size_t right_weight = right.end;
+          
+          left.begin  = begin;  left.end  = center; 
+          right.begin = center; right.end = end;
+          
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          assert(area(left.geomBounds) >= 0.0f);
+          assert(area(right.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+        /*! array partitioning */
+        __noinline std::pair<size_t,size_t> parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          /* init spatial mapping */
+          const SpatialBinMapping<SPATIAL_BINS>& mapping = split.mapping;
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+
+          auto isLeft = [&] (const PrimRef &ref) { 
+            const Vec3fa c = ref.bounds().center();
+            return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); };
+
+          const size_t center = parallel_partitioning(
+            prims0,begin,end,EmptyTy(),left,right,isLeft,
+            [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); },
+            [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+
+          const size_t left_weight  = left.end;
+          const size_t right_weight = right.end;
+          
+          left.begin  = begin;  left.end  = center; 
+          right.begin = center; right.end = end;
+          
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          assert(area(left.geomBounds) >= 0.0f);
+          assert(area(right.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+        void deterministic_order(const PrimInfoExtRange& set) 
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims0[set.begin()],&prims0[set.end()]);
+        }
+
+        void splitFallback(const PrimInfoExtRange& set, 
+                           PrimInfoExtRange& lset, 
+                           PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          const size_t center = (begin + end)/2;
+
+          PrimInfo left(empty);
+          for (size_t i=begin; i<center; i++) {
+            left.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+          }
+          const size_t lweight = left.end;
+          
+          PrimInfo right(empty);
+          for (size_t i=center; i<end; i++) {
+            right.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));	
+          }
+          const size_t rweight = right.end;
+
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          /* if we have an extended range */
+          if (set.has_ext_range()) {
+            setExtentedRanges(set,lset,rset,lweight,rweight);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+        
+      private:
+        PrimRef* const prims0;
+        const PrimitiveSplitterFactory& splitterFactory;
+        const CentGeomBBox3fa& root_info;
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/heuristic_strand_array.h b/thirdparty/embree/kernels/builders/heuristic_strand_array.h
new file mode 100644
index 0000000000..19c7fcdaa8
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/heuristic_strand_array.h
@@ -0,0 +1,188 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning */
+    struct HeuristicStrandSplit
+    {
+      typedef range<size_t> Set;
+  
+      static const size_t PARALLEL_THRESHOLD = 10000;
+      static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096;
+      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64;
+
+      /*! stores all information to perform some split */
+      struct Split
+      {    
+	/*! construct an invalid split by default */
+	__forceinline Split()
+	  : sah(inf), axis0(zero), axis1(zero) {}
+	
+	/*! constructs specified split */
+	__forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1)
+	  : sah(sah), axis0(axis0), axis1(axis1) {}
+	
+	/*! calculates standard surface area heuristic for the split */
+	__forceinline float splitSAH() const { return sah; }
+
+        /*! test if this split is valid */
+        __forceinline bool valid() const { return sah != float(inf); }
+		
+      public:
+	float sah;             //!< SAH cost of the split
+	Vec3fa axis0, axis1;   //!< axis the two strands are aligned into
+      };
+
+      __forceinline HeuristicStrandSplit () // FIXME: required?
+        : scene(nullptr), prims(nullptr) {}
+      
+      /*! remember prim array */
+      __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims)
+        : scene(scene), prims(prims) {}
+      
+      __forceinline const Vec3fa direction(const PrimRef& prim) {
+        return scene->get(prim.geomID())->computeDirection(prim.primID());
+      }
+      
+      __forceinline const BBox3fa bounds(const PrimRef& prim) {
+        return scene->get(prim.geomID())->vbounds(prim.primID());
+      }
+
+      __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim) {
+        return scene->get(prim.geomID())->vbounds(space,prim.primID());
+      }
+
+      /*! finds the best split */
+      const Split find(const range<size_t>& set, size_t logBlockSize)
+      {
+        Vec3fa axis0(0,0,1);
+        uint64_t bestGeomPrimID = -1;
+
+        /* curve with minimum ID determines first axis */
+        for (size_t i=set.begin(); i<set.end(); i++)
+        {
+          const uint64_t geomprimID = prims[i].ID64();
+          if (geomprimID >= bestGeomPrimID) continue;
+          const Vec3fa axis = direction(prims[i]);
+          if (sqr_length(axis) > 1E-18f) {
+            axis0 = normalize(axis);
+            bestGeomPrimID = geomprimID;
+          }
+        }
+      
+        /* find 2nd axis that is most misaligned with first axis and has minimum ID */
+        float bestCos = 1.0f;
+        Vec3fa axis1 = axis0;
+        bestGeomPrimID = -1;
+        for (size_t i=set.begin(); i<set.end(); i++) 
+        {
+          const uint64_t geomprimID = prims[i].ID64();
+          Vec3fa axisi = direction(prims[i]);
+          float leni = length(axisi);
+          if (leni == 0.0f) continue;
+          axisi /= leni;
+          float cos = abs(dot(axisi,axis0));
+          if ((cos == bestCos && (geomprimID < bestGeomPrimID)) || cos < bestCos) {
+            bestCos = cos; axis1 = axisi;
+            bestGeomPrimID = geomprimID;
+          }
+        }
+      
+        /* partition the two strands */
+        size_t lnum = 0, rnum = 0;
+        BBox3fa lbounds = empty, rbounds = empty;
+        const LinearSpace3fa space0 = frame(axis0).transposed();
+        const LinearSpace3fa space1 = frame(axis1).transposed();
+        
+        for (size_t i=set.begin(); i<set.end(); i++)
+        {
+          PrimRef& prim = prims[i];
+          const Vec3fa axisi = normalize(direction(prim));
+          const float cos0 = abs(dot(axisi,axis0));
+          const float cos1 = abs(dot(axisi,axis1));
+          
+          if (cos0 > cos1) { lnum++; lbounds.extend(bounds(space0,prim)); }
+          else             { rnum++; rbounds.extend(bounds(space1,prim)); }
+        }
+      
+        /*! return an invalid split if we do not partition */
+        if (lnum == 0 || rnum == 0) 
+          return Split(inf,axis0,axis1);
+      
+        /*! calculate sah for the split */
+        const size_t lblocks = (lnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+        const size_t rblocks = (rnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+        const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds));
+        return Split(sah,axis0,axis1);
+      }
+
+      /*! array partitioning */
+      void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) 
+      {
+        if (!split.valid()) {
+          deterministic_order(set);
+          return splitFallback(set,lset,rset);
+        }
+        
+        const size_t begin = set.begin();
+        const size_t end   = set.end();
+        CentGeomBBox3fa local_left(empty);
+        CentGeomBBox3fa local_right(empty);
+
+        auto primOnLeftSide = [&] (const PrimRef& prim) -> bool { 
+          const Vec3fa axisi = normalize(direction(prim));
+          const float cos0 = abs(dot(axisi,split.axis0));
+          const float cos1 = abs(dot(axisi,split.axis1));
+          return cos0 > cos1;
+        };
+
+        auto mergePrimBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { 
+          pinfo.extend(bounds(ref)); 
+        };
+        
+        size_t center = serial_partitioning(prims,begin,end,local_left,local_right,primOnLeftSide,mergePrimBounds);
+        
+        new (&lset) PrimInfoRange(begin,center,local_left);
+        new (&rset) PrimInfoRange(center,end,local_right);
+        assert(area(lset.geomBounds) >= 0.0f);
+        assert(area(rset.geomBounds) >= 0.0f);
+      }
+
+      void deterministic_order(const Set& set) 
+      {
+        /* required as parallel partition destroys original primitive order */
+        std::sort(&prims[set.begin()],&prims[set.end()]);
+      }
+      
+      void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+      {
+        const size_t begin = set.begin();
+        const size_t end   = set.end();
+        const size_t center = (begin + end)/2;
+        
+        CentGeomBBox3fa left(empty);
+        for (size_t i=begin; i<center; i++)
+          left.extend(bounds(prims[i]));
+        new (&lset) PrimInfoRange(begin,center,left);
+        
+        CentGeomBBox3fa right(empty);
+        for (size_t i=center; i<end; i++)
+          right.extend(bounds(prims[i]));	
+        new (&rset) PrimInfoRange(center,end,right);
+      }
+      
+    private:
+      Scene* const scene;
+      PrimRef* const prims;
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h b/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h
new file mode 100644
index 0000000000..b968e01c90
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h
@@ -0,0 +1,237 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/primref_mb.h"
+#include "../../common/algorithms/parallel_filter.h"
+
+#define MBLUR_TIME_SPLIT_THRESHOLD 1.25f
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning */
+    template<typename PrimRefMB, typename RecalculatePrimRef, size_t BINS>
+      struct HeuristicMBlurTemporalSplit
+      {
+        typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+        typedef mvector<PrimRefMB>* PrimRefVector;
+        typedef typename PrimRefMB::BBox BBox; 
+
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device, const RecalculatePrimRef& recalculatePrimRef)
+          : device(device), recalculatePrimRef(recalculatePrimRef) {}
+
+        struct TemporalBinInfo
+        {
+          __forceinline TemporalBinInfo () {
+          }
+
+          __forceinline TemporalBinInfo (EmptyTy)
+          {
+            for (size_t i=0; i<BINS-1; i++)
+            {
+              count0[i] = count1[i] = 0;
+              bounds0[i] = bounds1[i] = empty;
+            }
+          }
+          
+          void bin(const PrimRefMB* prims, size_t begin, size_t end, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef)
+          {
+            for (int b=0; b<BINS-1; b++)
+            {
+              const float t = float(b+1)/float(BINS);
+              const float ct = lerp(time_range.lower,time_range.upper,t);
+              const float center_time = set.align_time(ct);
+              if (center_time <= time_range.lower) continue;
+              if (center_time >= time_range.upper) continue;
+              const BBox1f dt0(time_range.lower,center_time);
+              const BBox1f dt1(center_time,time_range.upper);
+              
+              /* find linear bounds for both time segments */
+              for (size_t i=begin; i<end; i++) 
+              {
+                if (prims[i].time_range_overlap(dt0))
+                {
+                  const LBBox3fa bn0 = recalculatePrimRef.linearBounds(prims[i],dt0);
+#if MBLUR_BIN_LBBOX
+                  bounds0[b].extend(bn0);
+#else
+                  bounds0[b].extend(bn0.interpolate(0.5f));
+#endif
+                  count0[b] += prims[i].timeSegmentRange(dt0).size();
+                }
+
+                if (prims[i].time_range_overlap(dt1))
+                {
+                  const LBBox3fa bn1 = recalculatePrimRef.linearBounds(prims[i],dt1);
+#if MBLUR_BIN_LBBOX
+                  bounds1[b].extend(bn1);
+#else
+                  bounds1[b].extend(bn1.interpolate(0.5f));
+#endif
+                  count1[b] += prims[i].timeSegmentRange(dt1).size();
+                }
+              }
+            }
+          }
+
+          __forceinline void bin_parallel(const PrimRefMB* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) 
+          {
+            if (likely(end-begin < parallelThreshold)) {
+              bin(prims,begin,end,time_range,set,recalculatePrimRef);
+            } 
+            else 
+            {
+              auto bin = [&](const range<size_t>& r) -> TemporalBinInfo { 
+                TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner; 
+              };
+              *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2);
+            }
+          }
+          
+          /*! merges in other binning information */
+          __forceinline void merge (const TemporalBinInfo& other)
+          {
+            for (size_t i=0; i<BINS-1; i++) 
+            {
+              count0[i] += other.count0[i];
+              count1[i] += other.count1[i];
+              bounds0[i].extend(other.bounds0[i]);
+              bounds1[i].extend(other.bounds1[i]);
+            }
+          }
+
+          static __forceinline const TemporalBinInfo merge2(const TemporalBinInfo& a, const TemporalBinInfo& b) {
+            TemporalBinInfo r = a; r.merge(b); return r;
+          }
+                    
+          Split best(int logBlockSize, BBox1f time_range, const SetMB& set)
+          {
+            float bestSAH = inf;
+            float bestPos = 0.0f;
+            for (int b=0; b<BINS-1; b++)
+            {
+              float t = float(b+1)/float(BINS);
+              float ct = lerp(time_range.lower,time_range.upper,t);
+              const float center_time = set.align_time(ct);
+              if (center_time <= time_range.lower) continue;
+              if (center_time >= time_range.upper) continue;
+              const BBox1f dt0(time_range.lower,center_time);
+              const BBox1f dt1(center_time,time_range.upper);
+              
+              /* calculate sah */
+              const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+              const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+              float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size();
+              float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size();
+              if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time
+              if (unlikely(rCount == 0)) sah1 = 0.0f;
+              const float sah = sah0+sah1;
+              if (sah < bestSAH) {
+                bestSAH = sah;
+                bestPos = center_time;
+              }
+            }
+            return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos);
+          }
+          
+        public:
+          size_t count0[BINS-1];
+          size_t count1[BINS-1];
+          BBox bounds0[BINS-1];
+          BBox bounds1[BINS-1];
+        };
+        
+        /*! finds the best split */
+        const Split find(const SetMB& set, const size_t logBlockSize)
+        {
+          assert(set.size() > 0);
+          TemporalBinInfo binner(empty);
+          binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef);
+          Split tsplit = binner.best((int)logBlockSize,set.time_range,set);
+          if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return tsplit;
+        }
+
+        __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          assert(tsplit.sah != float(inf));
+          assert(tsplit.fpos > set.time_range.lower);
+          assert(tsplit.fpos < set.time_range.upper);
+
+          float center_time = tsplit.fpos;
+          const BBox1f time_range0(set.time_range.lower,center_time);
+          const BBox1f time_range1(center_time,set.time_range.upper);
+          mvector<PrimRefMB>& prims = *set.prims;
+          
+          /* calculate primrefs for first time range */
+          std::unique_ptr<mvector<PrimRefMB>> new_vector(new mvector<PrimRefMB>(device, set.size()));
+          PrimRefVector lprims = new_vector.get();
+          
+          auto reduction_func0 = [&] (const range<size_t>& r) {
+            PrimInfoMB pinfo = empty;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+            {
+              if (likely(prims[i].time_range_overlap(time_range0)))
+              {
+                const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range0);
+                (*lprims)[i-set.begin()] = prim;
+                pinfo.add_primref(prim);
+              }
+              else
+              {
+                (*lprims)[i-set.begin()] = prims[i];
+              }
+            }
+            return pinfo;
+          };        
+          PrimInfoMB linfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func0,PrimInfoMB::merge2);
+
+          /* primrefs for first time range are in lprims[0 .. set.size()) */
+          /* some primitives may need to be filtered out */
+          if (linfo.size() != set.size())
+            linfo.object_range._end = parallel_filter(lprims->data(), size_t(0), set.size(), size_t(1024),
+                                                      [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); });
+                      
+          lset = SetMB(linfo,lprims,time_range0);
+
+          /* calculate primrefs for second time range */
+          auto reduction_func1 = [&] (const range<size_t>& r) {
+            PrimInfoMB pinfo = empty;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+            {
+              if (likely(prims[i].time_range_overlap(time_range1)))
+              {
+                const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range1);
+                prims[i] = prim;
+                pinfo.add_primref(prim);
+              }
+            }
+            return pinfo;
+          };        
+          PrimInfoMB rinfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func1,PrimInfoMB::merge2);
+          rinfo.object_range = range<size_t>(set.begin(), set.begin() + rinfo.size());
+
+          /* primrefs for second time range are in prims[set.begin() .. set.end()) */
+          /* some primitives may need to be filtered out */
+          if (rinfo.size() != set.size())
+            rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024),
+                                                      [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); });
+        
+          rset = SetMB(rinfo,&prims,time_range1);
+
+          return new_vector;
+        }
+
+      private:
+        MemoryMonitorInterface* device;              // device to report memory usage to
+        const RecalculatePrimRef recalculatePrimRef;
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/priminfo.h b/thirdparty/embree/kernels/builders/priminfo.h
new file mode 100644
index 0000000000..fee515247a
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/priminfo.h
@@ -0,0 +1,362 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+
+namespace embree
+{
+  // FIXME: maybe there's a better place for this util fct
+  __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2)
+  {
+    const Vec3fa e0 = v1-v0;
+    const Vec3fa e1 = v2-v0;
+    const Vec3fa d = cross(e0,e1);
+    return fabs(d.x) + fabs(d.y) + fabs(d.z);
+  }
+
+  //namespace isa
+  //{
+    template<typename BBox>
+      class CentGeom
+    {
+    public:
+      __forceinline CentGeom () {}
+
+      __forceinline CentGeom (EmptyTy) 
+	: geomBounds(empty), centBounds(empty) {}
+      
+      __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds) 
+	: geomBounds(geomBounds), centBounds(centBounds) {}
+      
+      template<typename PrimRef> 
+        __forceinline void extend_primref(const PrimRef& prim) 
+      {
+        BBox bounds; Vec3fa center;
+        prim.binBoundsAndCenter(bounds,center);
+        geomBounds.extend(bounds);
+        centBounds.extend(center);
+      }
+
+       template<typename PrimRef> 
+         __forceinline void extend_center2(const PrimRef& prim) 
+       {
+         BBox3fa bounds = prim.bounds();
+         geomBounds.extend(bounds);
+         centBounds.extend(bounds.center2());
+       }
+       
+      __forceinline void extend(const BBox& geomBounds_) {
+	geomBounds.extend(geomBounds_);
+	centBounds.extend(center2(geomBounds_));
+      }
+
+      __forceinline void merge(const CentGeom& other) 
+      {
+	geomBounds.extend(other.geomBounds);
+	centBounds.extend(other.centBounds);
+      }
+
+      static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) {
+        CentGeom r = a; r.merge(b); return r;
+      }
+
+    public:
+      BBox geomBounds;   //!< geometry bounds of primitives
+      BBox3fa centBounds;   //!< centroid bounds of primitives
+    };
+
+    typedef CentGeom<BBox3fa> CentGeomBBox3fa;
+
+    /*! stores bounding information for a set of primitives */
+    template<typename BBox>
+      class PrimInfoT : public CentGeom<BBox>
+    {
+    public:
+      using CentGeom<BBox>::geomBounds;
+      using CentGeom<BBox>::centBounds;
+
+      __forceinline PrimInfoT () {}
+
+      __forceinline PrimInfoT (EmptyTy) 
+	: CentGeom<BBox>(empty), begin(0), end(0) {}
+
+      __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) 
+        : CentGeom<BBox>(centGeomBounds), begin(begin), end(end) {}
+
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) 
+      {
+        CentGeom<BBox>::extend_primref(prim);
+        end++;
+      }
+
+       template<typename PrimRef> 
+         __forceinline void add_center2(const PrimRef& prim) {
+         CentGeom<BBox>::extend_center2(prim);
+         end++;
+       }
+
+        template<typename PrimRef> 
+          __forceinline void add_center2(const PrimRef& prim, const size_t i) {
+          CentGeom<BBox>::extend_center2(prim);
+          end+=i;
+        }
+
+      /*__forceinline void add(const BBox& geomBounds_) {
+	CentGeom<BBox>::extend(geomBounds_);
+	end++;
+      }
+
+      __forceinline void add(const BBox& geomBounds_, const size_t i) {
+	CentGeom<BBox>::extend(geomBounds_);
+	end+=i;
+        }*/
+
+      __forceinline void merge(const PrimInfoT& other) 
+      {
+	CentGeom<BBox>::merge(other);
+        begin += other.begin;
+	end += other.end;
+      }
+
+      static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) {
+        PrimInfoT r = a; r.merge(b); return r;
+      }
+      
+      /*! returns the number of primitives */
+      __forceinline size_t size() const { 
+	return end-begin; 
+      }
+
+      __forceinline float halfArea() {
+        return expectedApproxHalfArea(geomBounds);
+      }
+
+      __forceinline float leafSAH() const { 
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+	//return halfArea(geomBounds)*blocks(num); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+	//return halfArea(geomBounds)*float((num+3) >> 2);
+	//return halfArea(geomBounds)*blocks(num); 
+      }
+      
+      /*! stream output */
+      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) {
+	return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}";
+      }
+      
+    public:
+      size_t begin,end;          //!< number of primitives
+    };
+
+    typedef PrimInfoT<BBox3fa> PrimInfo;
+    //typedef PrimInfoT<LBBox3fa> PrimInfoMB;
+
+    /*! stores bounding information for a set of primitives */
+    template<typename BBox>
+      class PrimInfoMBT : public CentGeom<BBox>
+    {
+    public:
+      using CentGeom<BBox>::geomBounds;
+      using CentGeom<BBox>::centBounds;
+
+      __forceinline PrimInfoMBT () {
+      } 
+
+      __forceinline PrimInfoMBT (EmptyTy)
+        : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+      __forceinline PrimInfoMBT (size_t begin, size_t end)
+        : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) 
+      {
+        CentGeom<BBox>::extend_primref(prim);
+        time_range.extend(prim.time_range);
+        object_range._end++;
+        num_time_segments += prim.size();
+        if (max_num_time_segments < prim.totalTimeSegments()) {
+          max_num_time_segments = prim.totalTimeSegments();
+          max_time_range = prim.time_range;
+        }
+      }
+
+      __forceinline void merge(const PrimInfoMBT& other)
+      {
+        CentGeom<BBox>::merge(other);
+        time_range.extend(other.time_range);
+        object_range._begin += other.object_range.begin();
+        object_range._end += other.object_range.end();
+        num_time_segments += other.num_time_segments;
+        if (max_num_time_segments < other.max_num_time_segments) {
+          max_num_time_segments = other.max_num_time_segments;
+          max_time_range = other.max_time_range;
+        }
+      }
+
+      static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) {
+        PrimInfoMBT r = a; r.merge(b); return r;
+      }
+
+      __forceinline size_t begin() const {
+        return object_range.begin();
+      }
+
+      __forceinline size_t end() const {
+        return object_range.end();
+      }
+      
+      /*! returns the number of primitives */
+      __forceinline size_t size() const { 
+	return object_range.size(); 
+      }
+
+      __forceinline float halfArea() const {
+        return time_range.size()*expectedApproxHalfArea(geomBounds);
+      }
+
+      __forceinline float leafSAH() const { 
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+
+      __forceinline float align_time(float ct) const
+      {
+        //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments);
+        float t0 = (ct-max_time_range.lower)/max_time_range.size();
+        float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments);
+        return t1*max_time_range.size()+max_time_range.lower;
+      }
+      
+      /*! stream output */
+      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) 
+      {
+	return cout << "PrimInfo { " << 
+          "object_range = " << pinfo.object_range << 
+          ", time_range = " << pinfo.time_range << 
+          ", time_segments = " << pinfo.num_time_segments << 
+          ", geomBounds = " << pinfo.geomBounds << 
+          ", centBounds = " << pinfo.centBounds << 
+          "}";
+      }
+      
+    public:
+      range<size_t> object_range; //!< primitive range
+      size_t num_time_segments;  //!< total number of time segments of all added primrefs
+      size_t max_num_time_segments; //!< maximum number of time segments of a primitive
+      BBox1f max_time_range; //!< time range of primitive with max_num_time_segments
+      BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB
+    };
+
+    typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB;
+
+    struct SetMB : public PrimInfoMB
+    {
+      static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+      static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+      typedef mvector<PrimRefMB>* PrimRefVector;
+
+      __forceinline SetMB() {}
+
+       __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims)
+         : PrimInfoMB(pinfo_i), prims(prims) {}
+
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in)
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        object_range = object_range_in;
+        time_range = intersect(time_range,time_range_in);
+      }
+      
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in)
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        time_range = intersect(time_range,time_range_in);
+      }
+
+      void deterministic_order() const 
+      {
+        /* required as parallel partition destroys original primitive order */
+        PrimRefMB* prim = prims->data();
+        std::sort(&prim[object_range.begin()],&prim[object_range.end()]);
+      }
+
+      template<typename RecalculatePrimRef>
+      __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const
+      {
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      template<typename RecalculatePrimRef>
+        __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+      {
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      template<typename RecalculatePrimRef>
+        const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+      {
+        auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB
+        {
+          PrimInfoMB pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space);
+            pinfo.add_primref(ref1);
+          };
+          return pinfo;
+        };
+        
+        const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, 
+                                                 PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2);
+
+        return SetMB(pinfo,prims,object_range,time_range);
+      }
+      
+    public:
+      PrimRefVector prims;
+    };
+//}
+}
diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp
new file mode 100644
index 0000000000..d279dc4993
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/primrefgen.cpp
@@ -0,0 +1,312 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primrefgen.h"
+#include "primrefgen_presplit.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelPrefixSumState<PrimInfo> pstate;
+      
+      /* first try */
+      progressMonitor(0);
+      PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+          return geometry->createPrimRefArray(prims,r,r.begin(),geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+      {
+        progressMonitor(0);
+        pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+          return geometry->createPrimRefArray(prims,r,base.size(),geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+      
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+          return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+            return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+          }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+
+    PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime)
+    {
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,true);
+      
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+          return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+            return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID);
+          }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, const size_t numPrimRefs, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1)
+    {
+      ParallelForForPrefixSumState<PrimInfoMB> pstate;
+      Scene::Iterator2 iter(scene,types,true);
+      
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfoMB {
+          return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID);
+      }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+            return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID);
+        }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      }
+
+      /* the BVH starts with that time range, even though primitives might have smaller/larger time range */
+      pinfo.time_range = t0t1;
+      return pinfo;
+    }
+
+    template<typename Mesh>
+    size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor)
+    {
+      size_t numPrimitives = morton.size();
+
+      /* compute scene bounds */
+      std::pair<size_t,BBox3fa> cb_empty(0,empty);
+      auto cb = parallel_reduce 
+        ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range<size_t>& r) -> std::pair<size_t,BBox3fa>
+          {
+            size_t num = 0;
+            BBox3fa bounds = empty;
+            
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa prim_bounds = empty;
+              if (unlikely(!mesh->buildBounds(j,&prim_bounds))) continue;
+              bounds.extend(center2(prim_bounds));
+              num++;
+            }
+            return std::make_pair(num,bounds);
+          }, [] (const std::pair<size_t,BBox3fa>& a, const std::pair<size_t,BBox3fa>& b) {
+          return std::make_pair(a.first + b.first,merge(a.second,b.second)); 
+        });
+      
+      
+      size_t numPrimitivesGen = cb.first;
+      const BBox3fa centBounds = cb.second;
+      
+      /* compute morton codes */
+      if (likely(numPrimitivesGen == numPrimitives))
+      {
+        /* fast path if all primitives were valid */
+        BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+        parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range<size_t>& r) -> void {
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+              generator(mesh->bounds(j),unsigned(j));
+          });
+      }
+      else
+      {
+        /* slow path, fallback in case some primitives were invalid */
+        ParallelPrefixSumState<size_t> pstate;
+        BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+        parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t {
+            size_t num = 0;
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa bounds = empty;
+              if (unlikely(!mesh->buildBounds(j,&bounds))) continue;
+              generator(bounds,unsigned(j));
+              num++;
+            }
+            return num;
+          }, std::plus<size_t>());
+        
+        parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t {
+            size_t num = 0;
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa bounds = empty;
+              if (!mesh->buildBounds(j,&bounds)) continue;
+              generator(bounds,unsigned(j));
+              num++;
+            }
+            return num;
+          }, std::plus<size_t>());          
+      }
+      return numPrimitivesGen;
+    }
+
+    // ====================================================================================================
+    // ====================================================================================================
+    // ====================================================================================================
+
+    // special variants for grid meshes
+
+// -- GODOT start --
+#if defined(EMBREE_GEOMETRY_GRID)
+// -- GODOT end --
+    PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids)
+    {
+      PrimInfo pinfo(empty);
+      size_t numPrimitives = 0;
+      
+      /* first run to get #primitives */
+
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator<GridMesh,false> iter(scene);
+
+      pstate.init(iter,size_t(1024));
+
+      /* iterate over all meshes in the scene */
+      pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+          PrimInfo pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            if (!mesh->valid(j)) continue;
+            BBox3fa bounds = empty;
+            const PrimRef prim(bounds,(unsigned)geomID,(unsigned)j);
+            if (!mesh->valid(j)) continue;
+            pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+          }
+          return pinfo;
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      numPrimitives = pinfo.size();
+          
+      /* resize arrays */
+      sgrids.resize(numPrimitives); 
+      prims.resize(numPrimitives); 
+
+      /* second run to fill primrefs and SubGridBuildData arrays */
+      pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+          k = base.size();
+          size_t p_index = k;
+          PrimInfo pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            if (!mesh->valid(j)) continue;
+            const GridMesh::Grid &g = mesh->grid(j);
+            for (unsigned int y=0; y<g.resY-1u; y+=2)
+              for (unsigned int x=0; x<g.resX-1u; x+=2)
+              {
+                BBox3fa bounds = empty;
+                if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
+                const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index);
+                pinfo.add_center2(prim);
+                sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                prims[p_index++] = prim;                
+              }
+          }
+          return pinfo;
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      assert(pinfo.size() == numPrimitives);
+      return pinfo;
+    }
+
+    PrimInfo createPrimRefArrayGrids(GridMesh* mesh, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids)
+    {
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max ();
+
+      PrimInfo pinfo(empty);
+      size_t numPrimitives = 0;
+      
+      ParallelPrefixSumState<PrimInfo> pstate;
+      /* iterate over all grids in a single mesh */
+      pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo
+                                   {
+                                     PrimInfo pinfo(empty);
+                                     for (size_t j=r.begin(); j<r.end(); j++)
+                                     {
+                                       if (!mesh->valid(j)) continue;
+                                       BBox3fa bounds = empty;
+                                       const PrimRef prim(bounds,geomID_,unsigned(j));
+                                       pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+                                     }
+                                     return pinfo;
+                                   }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      numPrimitives = pinfo.size();
+      /* resize arrays */
+      sgrids.resize(numPrimitives); 
+      prims.resize(numPrimitives); 
+
+      /* second run to fill primrefs and SubGridBuildData arrays */
+      pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo
+                                   {
+
+                                     size_t p_index = base.size();
+                                     PrimInfo pinfo(empty);
+                                     for (size_t j=r.begin(); j<r.end(); j++)
+                                     {
+                                       if (!mesh->valid(j)) continue;
+                                       const GridMesh::Grid &g = mesh->grid(j);
+                                       for (unsigned int y=0; y<g.resY-1u; y+=2)
+                                         for (unsigned int x=0; x<g.resX-1u; x+=2)
+                                         {
+                                           BBox3fa bounds = empty;
+                                           if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
+                                           const PrimRef prim(bounds,geomID_,unsigned(p_index));
+                                           pinfo.add_center2(prim);
+                                           sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                                           prims[p_index++] = prim;                
+                                         }
+                                     }
+                                     return pinfo;
+                                   }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+      return pinfo;
+    }
+// -- GODOT start --
+#endif
+// -- GODOT end --
+    
+    // ====================================================================================================
+    // ====================================================================================================
+    // ====================================================================================================
+
+    IF_ENABLED_TRIS (template size_t createMortonCodeArray<TriangleMesh>(TriangleMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_QUADS(template size_t createMortonCodeArray<QuadMesh>(QuadMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_USER (template size_t createMortonCodeArray<UserGeometry>(UserGeometry* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_INSTANCE (template size_t createMortonCodeArray<Instance>(Instance* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/primrefgen.h b/thirdparty/embree/kernels/builders/primrefgen.h
new file mode 100644
index 0000000000..c09a848ba3
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/primrefgen.h
@@ -0,0 +1,34 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+#include "priminfo.h"
+#include "bvh_builder_morton.h"
+
+namespace embree
+{ 
+  namespace isa
+  {
+    PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
+   
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
+   
+    PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0);
+
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, size_t numPrimitives, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f));
+
+    template<typename Mesh>
+      size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor);
+
+    /* special variants for grids */
+    PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids);
+
+    PrimInfo createPrimRefArrayGrids(GridMesh* mesh, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids);
+    
+  }
+}
+
diff --git a/thirdparty/embree/kernels/builders/primrefgen_presplit.h b/thirdparty/embree/kernels/builders/primrefgen_presplit.h
new file mode 100644
index 0000000000..8cd251ddd2
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/primrefgen_presplit.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../builders/primrefgen.h"
+#include "../builders/heuristic_spatial.h"
+#include "../builders/splitter.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+#define DBG_PRESPLIT(x)   
+#define CHECK_PRESPLIT(x) 
+
+#define GRID_SIZE 1024
+#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5
+#define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG)
+#define PRIORITY_CUTOFF_THRESHOLD 1.0f
+#define PRIORITY_SPLIT_POS_WEIGHT 1.5f
+
+namespace embree
+{  
+  namespace isa
+  {
+
+    struct PresplitItem
+    {
+      union {
+        float priority;    
+        unsigned int data;
+      };
+      unsigned int index;
+      
+      __forceinline operator unsigned() const
+      {
+	return reinterpret_cast<const unsigned&>(priority);
+      }
+      __forceinline bool operator < (const PresplitItem& item) const
+      {
+	return (priority < item.priority);
+      }
+
+      template<typename Mesh>
+      __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc)
+      {
+	const unsigned int geomID = ref.geomID();
+	const unsigned int primID = ref.primID();
+	const float area_aabb  = area(ref.bounds());
+	const float area_prim  = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID);
+        const unsigned int diff = 31 - lzcnt(mc.x^mc.y);
+        assert(area_prim <= area_aabb);
+        //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f);   
+        const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) ));
+        assert(priority >= 0.0f && priority < FLT_LARGE);
+	return priority;      
+      }
+
+    
+    };
+
+    inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) {
+      return cout << "index " << item.index << " priority " << item.priority;    
+    };
+
+    template<typename SplitterFactory>    
+      void splitPrimitive(SplitterFactory &Splitter,
+                          const PrimRef &prim,
+                          const unsigned int geomID,
+                          const unsigned int primID,
+                          const unsigned int split_level,
+                          const Vec3fa &grid_base, 
+                          const float grid_scale,
+                          const float grid_extend,
+                          PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE],
+                          unsigned int& numSubPrims)
+    {
+      assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+      if (split_level == 0)
+      {
+        assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+        subPrims[numSubPrims++] = prim;
+      }
+      else
+      {
+        const Vec3fa lower = prim.lower;
+        const Vec3fa upper = prim.upper;
+        const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f);
+        const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
+        Vec3ia ilower(floor(glower));
+        Vec3ia iupper(floor(gupper));
+
+        /* this ignores dimensions that are empty */
+        iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)));
+
+        /* compute a morton code for the lower and upper grid coordinates. */
+        const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+        const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+			
+        /* if all bits are equal then we cannot split */
+        if(unlikely(lower_code == upper_code))
+        {
+          assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+          subPrims[numSubPrims++] = prim;
+          return;
+        }
+		    
+        /* compute octree level and dimension to perform the split in */
+        const unsigned int diff = 31 - lzcnt(lower_code^upper_code);
+        const unsigned int level = diff / 3;
+        const unsigned int dim   = diff % 3;
+      
+        /* now we compute the grid position of the split */
+        const unsigned int isplit = iupper[dim] & ~((1<<level)-1);
+			    
+        /* compute world space position of split */
+        const float inv_grid_size = 1.0f / GRID_SIZE;
+        const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend;
+
+        assert(prim.lower[dim] <= fsplit &&
+               prim.upper[dim] >= fsplit);
+		
+        /* split primitive */
+        const auto splitter = Splitter(prim);
+        BBox3fa left,right;
+        splitter(prim.bounds(),dim,fsplit,left,right);
+        assert(!left.empty());
+        assert(!right.empty());
+
+			    
+        splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+        splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+      }
+    }
+    
+    
+    template<typename Mesh, typename SplitterFactory>    
+      PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelPrefixSumState<PrimInfo> pstate;
+      
+      /* first try */
+      progressMonitor(0);
+      PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+	  return geometry->createPrimRefArray(prims,r,r.begin(),geomID);
+	}, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+	{
+	  progressMonitor(0);
+	  pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+	      return geometry->createPrimRefArray(prims,r,base.size(),geomID);
+	    }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+	}
+      return pinfo;	
+    }
+    
+    __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref)
+    {
+      const Vec3fa lower = ref.lower;
+      const Vec3fa upper = ref.upper;
+      const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f);
+      const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
+      Vec3ia ilower(floor(glower));
+      Vec3ia iupper(floor(gupper));
+      
+      /* this ignores dimensions that are empty */
+      iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper));
+
+      /* compute a morton code for the lower and upper grid coordinates. */
+      const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+      const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+      return Vec2i(lower_code,upper_code);
+    }
+
+    template<typename Mesh, typename SplitterFactory>    
+      PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {	
+      static const size_t MIN_STEP_SIZE = 128;
+
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+	  return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+	}, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+	{
+	  progressMonitor(0);
+	  pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+	      return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+	    }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+	}
+
+      /* use correct number of primitives */
+      size_t numPrimitives = pinfo.size();
+      const size_t alloc_numPrimitives = prims.size(); 
+      const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives;
+
+      /* set up primitive splitter */
+      SplitterFactory Splitter(scene);
+
+
+      DBG_PRESPLIT(
+        const size_t org_numPrimitives = pinfo.size();
+        PRINT(numPrimitives);		
+        PRINT(alloc_numPrimitives);		
+        PRINT(numSplitPrimitivesBudget);
+        );
+
+      /* allocate double buffer presplit items */
+      const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives;
+      PresplitItem *presplitItem     = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+      PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+
+      /* compute grid */
+      const Vec3fa grid_base    = pinfo.geomBounds.lower;
+      const Vec3fa grid_diag    = pinfo.geomBounds.size();
+      const float grid_extend   = max(grid_diag.x,max(grid_diag.y,grid_diag.z));		
+      const float grid_scale    = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend;
+
+      /* init presplit items and get total sum */
+      const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float {
+          float sum = 0.0f;
+          for (size_t i=r.begin(); i<r.end(); i++)
+          {		
+            presplitItem[i].index = (unsigned int)i;
+            const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]);
+            /* if all bits are equal then we cannot split */
+            presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f;    
+            /* FIXME: sum undeterministic */
+            sum += presplitItem[i].priority;
+          }
+          return sum;
+        },[](const float& a, const float& b) -> float { return a+b; });
+
+      /* compute number of splits per primitive */
+      const float inv_psum = 1.0f / psum;
+      parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+          for (size_t i=r.begin(); i<r.end(); i++)
+          {
+            if (presplitItem[i].priority > 0.0f)
+            {
+              const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum;
+              if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims
+              {
+                presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f);
+                //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+                assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+              }
+              else
+                presplitItem[i].priority = 0.0f;
+            }
+          }
+        });
+
+      auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; };        
+      size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024);
+
+      /* anything to split ? */
+      if (center < numPrimitives)
+      {
+        const size_t numPrimitivesToSplit = numPrimitives - center;
+        assert(presplitItem[center].priority >= 1.0f);
+
+        /* sort presplit items in ascending order */
+        radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024);
+
+        CHECK_PRESPLIT(
+          parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+              for (size_t i=r.begin(); i<r.end(); i++)
+                assert(presplitItem[i-1].priority <= presplitItem[i].priority);
+            });
+          );
+
+        unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem;
+        unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
+
+        /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
+        const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
+            size_t sum = 0;
+            for (size_t i=t.begin(); i<t.end(); i++)
+            {	
+              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];	
+              assert(presplitItem[i].priority >= 1.0f);
+              const unsigned int  primrefID = presplitItem[i].index;	
+              const float prio              = presplitItem[i].priority;
+              const unsigned int   geomID   = prims[primrefID].geomID();
+              const unsigned int   primID   = prims[primrefID].primID();
+              const unsigned int split_levels = (unsigned int)prio;
+              unsigned int numSubPrims = 0;
+              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+              assert(numSubPrims);
+              numSubPrims--; // can reuse slot 
+              sum+=numSubPrims;
+              presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels;
+              primOffset0[i-center] = numSubPrims;
+            }
+            return sum;
+          },[](const size_t& a, const size_t& b) -> size_t { return a+b; });
+        
+        /* if we are over budget, need to shrink the range */
+        if (totalNumSubPrims > numSplitPrimitivesBudget) 
+        {
+          size_t new_center = numPrimitives-1;
+          size_t sum = 0;
+          for (;new_center>=center;new_center--)
+          {
+            const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG;
+            if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break;
+            sum += numSubPrims;
+          }
+          new_center++;
+          center = new_center;
+        }
+
+        /* parallel prefix sum to compute offsets for storing sub-primitives */
+        const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
+
+        /* iterate over range, and split primitives into sub primitives and append them to prims array */		    
+        parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
+            for (size_t j=rn.begin(); j<rn.end(); j++)		    
+            {
+              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];
+              const unsigned int  primrefID = presplitItem[j].index;	
+              const unsigned int   geomID   = prims[primrefID].geomID();
+              const unsigned int   primID   = prims[primrefID].primID();
+              const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1);
+
+              assert(split_levels);
+              assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+              unsigned int numSubPrims = 0;
+              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+              const size_t newID = numPrimitives + primOffset1[j-center];              
+              assert(newID+numSubPrims <= alloc_numPrimitives);
+              prims[primrefID] = subPrims[0];
+              for (size_t i=1;i<numSubPrims;i++)
+                prims[newID+i-1] = subPrims[i];
+            }
+          });
+
+        numPrimitives += offset;
+        DBG_PRESPLIT(
+          PRINT(pinfo.size());
+          PRINT(numPrimitives);
+          PRINT((float)numPrimitives/org_numPrimitives));                
+      }
+                
+      /* recompute centroid bounding boxes */
+      pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo {
+          PrimInfo p(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+            p.add_center2(prims[j]);
+          return p;
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+  
+      assert(pinfo.size() == numPrimitives);
+      
+      /* free double buffer presplit items */
+      alignedFree(tmp_presplitItem);		
+      alignedFree(presplitItem);
+      return pinfo;	
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/builders/splitter.h b/thirdparty/embree/kernels/builders/splitter.h
new file mode 100644
index 0000000000..f7720bd284
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/splitter.h
@@ -0,0 +1,191 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<size_t N>
+    __forceinline void splitPolygon(const BBox3fa& bounds, 
+                                    const size_t dim, 
+                                    const float pos, 
+                                    const Vec3fa (&v)[N+1],
+                                    const Vec3fa (&inv_length)[N],
+                                    BBox3fa& left_o, 
+                                    BBox3fa& right_o)
+    {
+      BBox3fa left = empty, right = empty;
+      /* clip triangle to left and right box by processing all edges */
+      for (size_t i=0; i<N; i++)
+      {
+        const Vec3fa &v0 = v[i]; 
+        const Vec3fa &v1 = v[i+1]; 
+        const float v0d = v0[dim];
+        const float v1d = v1[dim];
+        
+        if (v0d <= pos) left. extend(v0); // this point is on left side
+        if (v0d >= pos) right.extend(v0); // this point is on right side
+        
+        if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location
+        {
+          assert((v1d-v0d) != 0.0f);
+          const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length[i][dim]),v1-v0,v0);
+          left.extend(c);
+          right.extend(c);
+        }
+      }
+      
+      /* clip against current bounds */
+      left_o  = intersect(left,bounds);
+      right_o = intersect(right,bounds);
+    }
+    
+    template<size_t N>
+      __forceinline void splitPolygon(const PrimRef& prim, 
+                                      const size_t dim, 
+                                      const float pos, 
+                                      const Vec3fa (&v)[N+1],
+                                      PrimRef& left_o, 
+                                      PrimRef& right_o)
+    {
+      BBox3fa left = empty, right = empty;
+      for (size_t i=0; i<N; i++)
+      {
+        const Vec3fa &v0 = v[i]; 
+        const Vec3fa &v1 = v[i+1]; 
+        const float v0d = v0[dim];
+        const float v1d = v1[dim];
+        
+        if (v0d <= pos) left. extend(v0); // this point is on left side
+        if (v0d >= pos) right.extend(v0); // this point is on right side
+        
+        if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location
+        {
+          assert((v1d-v0d) != 0.0f);
+          const float inv_length = 1.0f/(v1d-v0d);
+          const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0);
+          left.extend(c);
+          right.extend(c);
+        }
+      }
+      
+      /* clip against current bounds */
+      new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID());
+      new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID());
+    }
+    
+    struct TriangleSplitter
+    {
+      __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim)
+      {
+        const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+        const TriangleMesh* mesh = (const TriangleMesh*) scene->get(prim.geomID() & mask );  
+        TriangleMesh::Triangle tri = mesh->triangle(prim.primID());
+        v[0] = mesh->vertex(tri.v[0]);
+        v[1] = mesh->vertex(tri.v[1]);
+        v[2] = mesh->vertex(tri.v[2]);
+        v[3] = mesh->vertex(tri.v[0]);
+        inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]);
+        inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
+        inv_length[2] = Vec3fa(1.0f) / (v[0]-v[2]);
+      }
+      
+      __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+        splitPolygon<3>(prim,dim,pos,v,left_o,right_o);
+      }
+      
+      __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+        splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o);
+      }
+      
+    private:
+      Vec3fa v[4];
+      Vec3fa inv_length[3];
+    };
+    
+    struct TriangleSplitterFactory
+    {
+      __forceinline TriangleSplitterFactory(const Scene* scene)
+        : scene(scene) {}
+      
+      __forceinline TriangleSplitter operator() (const PrimRef& prim) const {
+        return TriangleSplitter(scene,prim);
+      }
+      
+    private:
+      const Scene* scene;
+    };
+    
+    struct QuadSplitter
+    {
+      __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim)
+      {
+        const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+        const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask );  
+        QuadMesh::Quad quad = mesh->quad(prim.primID());
+        v[0] = mesh->vertex(quad.v[0]);
+        v[1] = mesh->vertex(quad.v[1]);
+        v[2] = mesh->vertex(quad.v[2]);
+        v[3] = mesh->vertex(quad.v[3]);
+        v[4] = mesh->vertex(quad.v[0]);
+        inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]);
+        inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
+        inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]);
+        inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]);
+      }
+      
+      __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+        splitPolygon<4>(prim,dim,pos,v,left_o,right_o);
+      }
+      
+      __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+        splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o);
+      }
+      
+    private:
+      Vec3fa v[5];
+      Vec3fa inv_length[4];
+    };
+    
+    struct QuadSplitterFactory
+    {
+      __forceinline QuadSplitterFactory(const Scene* scene)
+        : scene(scene) {}
+      
+      __forceinline QuadSplitter operator() (const PrimRef& prim) const {
+        return QuadSplitter(scene,prim);
+      }
+      
+    private:
+      const Scene* scene;
+    };
+
+
+    struct DummySplitter
+    {
+      __forceinline DummySplitter(const Scene* scene, const PrimRef& prim)
+      {
+      }
+    };
+    
+    struct DummySplitterFactory
+    {
+      __forceinline DummySplitterFactory(const Scene* scene)
+        : scene(scene) {}
+      
+      __forceinline DummySplitter operator() (const PrimRef& prim) const {
+        return DummySplitter(scene,prim);
+      }
+      
+    private:
+      const Scene* scene;
+    };
+    
+  }
+}
+
diff --git a/thirdparty/embree/kernels/bvh/bvh.cpp b/thirdparty/embree/kernels/bvh/bvh.cpp
new file mode 100644
index 0000000000..a84295f0da
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh.cpp
@@ -0,0 +1,190 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_statistics.h"
+
+namespace embree
+{
+  template<int N>
+  BVHN<N>::BVHN (const PrimitiveType& primTy, Scene* scene)
+    : AccelData((N==4) ? AccelData::TY_BVH4 : (N==8) ? AccelData::TY_BVH8 : AccelData::TY_UNKNOWN),
+      primTy(&primTy), device(scene->device), scene(scene),
+      root(emptyNode), alloc(scene->device,scene->isStaticAccel()), numPrimitives(0), numVertices(0)
+  {
+  }
+
+  template<int N>
+  BVHN<N>::~BVHN ()
+  {
+    for (size_t i=0; i<objects.size(); i++) 
+      delete objects[i];
+  }
+
+  template<int N>
+  void BVHN<N>::clear()
+  {
+    set(BVHN::emptyNode,empty,0);
+    alloc.clear();
+  }
+
+  template<int N>
+  void BVHN<N>::set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives)
+  {
+    this->root = root;
+    this->bounds = bounds;
+    this->numPrimitives = numPrimitives;
+  }	
+
+  template<int N>
+  void BVHN<N>::clearBarrier(NodeRef& node)
+  {
+    if (node.isBarrier())
+      node.clearBarrier();
+    else if (!node.isLeaf()) {
+      BaseNode* n = node.baseNode(); // FIXME: flags should be stored in BVH
+      for (size_t c=0; c<N; c++)
+        clearBarrier(n->child(c));
+    }
+  }
+
+  template<int N>
+  void BVHN<N>::layoutLargeNodes(size_t num)
+  {
+#if defined(__64BIT__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
+    struct NodeArea 
+    {
+      __forceinline NodeArea() {}
+
+      __forceinline NodeArea(NodeRef& node, const BBox3fa& bounds)
+        : node(&node), A(node.isLeaf() ? float(neg_inf) : area(bounds)) {}
+
+      __forceinline bool operator< (const NodeArea& other) const {
+        return this->A < other.A;
+      }
+
+      NodeRef* node;
+      float A;
+    };
+    std::vector<NodeArea> lst;
+    lst.reserve(num);
+    lst.push_back(NodeArea(root,empty));
+
+    while (lst.size() < num)
+    {
+      std::pop_heap(lst.begin(), lst.end());
+      NodeArea n = lst.back(); lst.pop_back();
+      if (!n.node->isAABBNode()) break;
+      AABBNode* node = n.node->getAABBNode();
+      for (size_t i=0; i<N; i++) {
+        if (node->child(i) == BVHN::emptyNode) continue;
+        lst.push_back(NodeArea(node->child(i),node->bounds(i)));
+        std::push_heap(lst.begin(), lst.end());
+      }
+    }
+
+    for (size_t i=0; i<lst.size(); i++)
+      lst[i].node->setBarrier();
+      
+    root = layoutLargeNodesRecursion(root,alloc.getCachedAllocator());
+#endif
+  }
+  
+  template<int N>
+  typename BVHN<N>::NodeRef BVHN<N>::layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator)
+  {
+    if (node.isBarrier()) {
+      node.clearBarrier();
+      return node;
+    }
+    else if (node.isAABBNode()) 
+    {
+      AABBNode* oldnode = node.getAABBNode();
+      AABBNode* newnode = (BVHN::AABBNode*) allocator.malloc0(sizeof(BVHN::AABBNode),byteNodeAlignment);
+      *newnode = *oldnode;
+      for (size_t c=0; c<N; c++)
+        newnode->child(c) = layoutLargeNodesRecursion(oldnode->child(c),allocator);
+      return encodeNode(newnode);
+    }
+    else return node;
+  }
+
+  template<int N>
+  double BVHN<N>::preBuild(const std::string& builderName)
+  {
+    if (builderName == "") 
+      return inf;
+
+    if (device->verbosity(2))
+    {
+      Lock<MutexSys> lock(g_printMutex);
+      std::cout << "building BVH" << N << (builderName.find("MBlur") != std::string::npos ? "MB" : "") << "<" << primTy->name() << "> using " << builderName << " ..." << std::endl << std::flush;
+    }
+
+    double t0 = 0.0;
+    if (device->benchmark || device->verbosity(2)) t0 = getSeconds();
+    return t0;
+  }
+
+  template<int N>
+  void BVHN<N>::postBuild(double t0)
+  {
+    if (t0 == double(inf))
+      return;
+    
+    double dt = 0.0;
+    if (device->benchmark || device->verbosity(2)) 
+      dt = getSeconds()-t0;
+
+    std::unique_ptr<BVHNStatistics<N>> stat;
+
+    /* print statistics */
+    if (device->verbosity(2))
+    {
+      if (!stat) stat.reset(new BVHNStatistics<N>(this));
+      const size_t usedBytes = alloc.getUsedBytes();
+      Lock<MutexSys> lock(g_printMutex);
+      std::cout << "finished BVH" << N << "<" << primTy->name() << "> : " << 1000.0f*dt << "ms, " << 1E-6*double(numPrimitives)/dt << " Mprim/s, " << 1E-9*double(usedBytes)/dt << " GB/s" << std::endl;
+    
+      if (device->verbosity(2))
+        std::cout << stat->str();
+
+      if (device->verbosity(2))
+      {
+        FastAllocator::AllStatistics stat(&alloc);
+        for (size_t i=0; i<objects.size(); i++)
+          if (objects[i])
+            stat = stat + FastAllocator::AllStatistics(&objects[i]->alloc);
+
+        stat.print(numPrimitives);
+      }
+
+      if (device->verbosity(3))
+      {
+        alloc.print_blocks();
+        for (size_t i=0; i<objects.size(); i++)
+          if (objects[i]) 
+            objects[i]->alloc.print_blocks();
+      }
+
+      std::cout << std::flush;
+    }
+
+    /* benchmark mode */
+    if (device->benchmark)
+    {
+      if (!stat) stat.reset(new BVHNStatistics<N>(this));
+      Lock<MutexSys> lock(g_printMutex);
+      std::cout << "BENCHMARK_BUILD " << dt << " " << double(numPrimitives)/dt << " " << stat->sah() << " " << stat->bytesUsed() << " BVH" << N << "<" << primTy->name() << ">" << std::endl << std::flush;
+    }
+  }
+
+#if defined(__AVX__)
+  template class BVHN<8>;
+#endif
+
+#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
+  template class BVHN<4>;
+#endif
+}
+
diff --git a/thirdparty/embree/kernels/bvh/bvh.h b/thirdparty/embree/kernels/bvh/bvh.h
new file mode 100644
index 0000000000..565eec5a58
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh.h
@@ -0,0 +1,235 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+/* include all node types */
+#include "bvh_node_aabb.h"
+#include "bvh_node_aabb_mb.h"
+#include "bvh_node_aabb_mb4d.h"
+#include "bvh_node_obb.h"
+#include "bvh_node_obb_mb.h"
+#include "bvh_node_qaabb.h"
+
+namespace embree
+{
+  /*! flags used to enable specific node types in intersectors */
+  enum BVHNodeFlags
+  {
+    BVH_FLAG_ALIGNED_NODE = 0x00001,
+    BVH_FLAG_ALIGNED_NODE_MB = 0x00010,
+    BVH_FLAG_UNALIGNED_NODE = 0x00100,
+    BVH_FLAG_UNALIGNED_NODE_MB = 0x01000,
+    BVH_FLAG_QUANTIZED_NODE = 0x100000,
+    BVH_FLAG_ALIGNED_NODE_MB4D = 0x1000000,
+    
+    /* short versions */
+    BVH_AN1 = BVH_FLAG_ALIGNED_NODE,
+    BVH_AN2 = BVH_FLAG_ALIGNED_NODE_MB,
+    BVH_AN2_AN4D = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D,
+    BVH_UN1 = BVH_FLAG_UNALIGNED_NODE,
+    BVH_UN2 = BVH_FLAG_UNALIGNED_NODE_MB,
+    BVH_MB = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D,
+    BVH_AN1_UN1 = BVH_FLAG_ALIGNED_NODE | BVH_FLAG_UNALIGNED_NODE,
+    BVH_AN2_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB,
+    BVH_AN2_AN4D_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D | BVH_FLAG_UNALIGNED_NODE_MB,
+    BVH_QN1 = BVH_FLAG_QUANTIZED_NODE
+  };
+  
+  /*! Multi BVH with N children. Each node stores the bounding box of
+   * it's N children as well as N child references. */
+  template<int N>
+    class BVHN : public AccelData
+  {
+    ALIGNED_CLASS_(16);
+  public:
+    
+    /*! forward declaration of node ref type */
+    typedef NodeRefPtr<N> NodeRef;
+    typedef BaseNode_t<NodeRef,N> BaseNode;
+    typedef AABBNode_t<NodeRef,N> AABBNode;
+    typedef AABBNodeMB_t<NodeRef,N> AABBNodeMB;
+    typedef AABBNodeMB4D_t<NodeRef,N> AABBNodeMB4D;
+    typedef OBBNode_t<NodeRef,N> OBBNode;
+    typedef OBBNodeMB_t<NodeRef,N> OBBNodeMB;
+    typedef QuantizedBaseNode_t<N> QuantizedBaseNode;
+    typedef QuantizedBaseNodeMB_t<N> QuantizedBaseNodeMB;
+    typedef QuantizedNode_t<NodeRef,N> QuantizedNode;
+    
+    /*! Number of bytes the nodes and primitives are minimally aligned to.*/
+    static const size_t byteAlignment = 16;
+    static const size_t byteNodeAlignment = 4*N;
+    
+    /*! Empty node */
+    static const size_t emptyNode = NodeRef::emptyNode;
+    
+    /*! Invalid node, used as marker in traversal */
+    static const size_t invalidNode = NodeRef::invalidNode;
+    static const size_t popRay      = NodeRef::popRay;
+    
+    /*! Maximum depth of the BVH. */
+    static const size_t maxBuildDepth = 32;
+    static const size_t maxBuildDepthLeaf = maxBuildDepth+8;
+    static const size_t maxDepth = 2*maxBuildDepthLeaf; // 2x because of two level builder
+    
+    /*! Maximum number of primitive blocks in a leaf. */
+    static const size_t maxLeafBlocks = NodeRef::maxLeafBlocks;
+    
+  public:
+    
+    /*! Builder interface to create allocator */
+    struct CreateAlloc : public FastAllocator::Create {
+      __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {}
+    };
+    
+    typedef BVHNodeRecord<NodeRef>     NodeRecord;
+    typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
+    typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+    
+  public:
+    
+    /*! BVHN default constructor. */
+    BVHN (const PrimitiveType& primTy, Scene* scene);
+    
+    /*! BVHN destruction */
+    ~BVHN ();
+    
+    /*! clears the acceleration structure */
+    void clear();
+    
+    /*! sets BVH members after build */
+    void set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives);
+    
+    /*! Clears the barrier bits of a subtree. */
+    void clearBarrier(NodeRef& node);
+    
+    /*! lays out num large nodes of the BVH */
+    void layoutLargeNodes(size_t num);
+    NodeRef layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator);
+    
+    /*! called by all builders before build starts */
+    double preBuild(const std::string& builderName);
+    
+    /*! called by all builders after build ended */
+    void postBuild(double t0);
+    
+    /*! allocator class */
+    struct Allocator {
+      BVHN* bvh;
+      Allocator (BVHN* bvh) : bvh(bvh) {}
+      __forceinline void* operator() (size_t bytes) const { 
+        return bvh->alloc._threadLocal()->malloc(&bvh->alloc,bytes); 
+      }
+    };
+    
+    /*! post build cleanup */
+    void cleanup() {
+      alloc.cleanup();
+    }
+    
+  public:
+    
+    /*! Encodes a node */
+    static __forceinline NodeRef encodeNode(AABBNode* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(AABBNodeMB* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(AABBNodeMB4D* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(OBBNode* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(OBBNodeMB* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeLeaf(void* tri, size_t num) { return NodeRef::encodeLeaf(tri,num); }
+    static __forceinline NodeRef encodeTypedLeaf(void* ptr, size_t ty) { return NodeRef::encodeTypedLeaf(ptr,ty); }
+    
+  public:
+    
+    /*! Prefetches the node this reference points to */
+    __forceinline static void prefetch(const NodeRef ref, int types=0)
+    {
+#if defined(__AVX512PF__) // MIC
+      if (types != BVH_FLAG_QUANTIZED_NODE) {
+        prefetchL2(((char*)ref.ptr)+0*64);
+        prefetchL2(((char*)ref.ptr)+1*64);
+        if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+          prefetchL2(((char*)ref.ptr)+2*64);
+          prefetchL2(((char*)ref.ptr)+3*64);
+        }
+        if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+          /* KNL still needs L2 prefetches for large nodes */
+          prefetchL2(((char*)ref.ptr)+4*64);
+          prefetchL2(((char*)ref.ptr)+5*64);
+          prefetchL2(((char*)ref.ptr)+6*64);
+          prefetchL2(((char*)ref.ptr)+7*64);
+        }
+      }
+      else
+      {
+        /* todo: reduce if 32bit offsets are enabled */
+        prefetchL2(((char*)ref.ptr)+0*64);
+        prefetchL2(((char*)ref.ptr)+1*64);
+        prefetchL2(((char*)ref.ptr)+2*64);
+      }
+#else
+      if (types != BVH_FLAG_QUANTIZED_NODE) {
+        prefetchL1(((char*)ref.ptr)+0*64);
+        prefetchL1(((char*)ref.ptr)+1*64);
+        if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+          prefetchL1(((char*)ref.ptr)+2*64);
+          prefetchL1(((char*)ref.ptr)+3*64);
+        }
+        if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+          /* deactivate for large nodes on Xeon, as it introduces regressions */
+          //prefetchL1(((char*)ref.ptr)+4*64);
+          //prefetchL1(((char*)ref.ptr)+5*64);
+          //prefetchL1(((char*)ref.ptr)+6*64);
+          //prefetchL1(((char*)ref.ptr)+7*64);
+        }
+      }
+      else
+      {
+        /* todo: reduce if 32bit offsets are enabled */
+        prefetchL1(((char*)ref.ptr)+0*64);
+        prefetchL1(((char*)ref.ptr)+1*64);
+        prefetchL1(((char*)ref.ptr)+2*64);
+      }
+#endif
+    }
+    
+    __forceinline static void prefetchW(const NodeRef ref, int types=0)
+    {
+      embree::prefetchEX(((char*)ref.ptr)+0*64);
+      embree::prefetchEX(((char*)ref.ptr)+1*64);
+      if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+        embree::prefetchEX(((char*)ref.ptr)+2*64);
+        embree::prefetchEX(((char*)ref.ptr)+3*64);
+      }
+      if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+        embree::prefetchEX(((char*)ref.ptr)+4*64);
+        embree::prefetchEX(((char*)ref.ptr)+5*64);
+        embree::prefetchEX(((char*)ref.ptr)+6*64);
+        embree::prefetchEX(((char*)ref.ptr)+7*64);
+      }
+    }
+    
+    /*! bvh type information */
+  public:
+    const PrimitiveType* primTy;       //!< primitive type stored in the BVH
+    
+    /*! bvh data */
+  public:
+    Device* device;                    //!< device pointer
+    Scene* scene;                      //!< scene pointer
+    NodeRef root;                      //!< root node
+    FastAllocator alloc;               //!< allocator used to allocate nodes
+    
+    /*! statistics data */
+  public:
+    size_t numPrimitives;              //!< number of primitives the BVH is build over
+    size_t numVertices;                //!< number of vertices the BVH references
+    
+    /*! data arrays for special builders */
+  public:
+    std::vector<BVHN*> objects;
+    vector_t<char,aligned_allocator<char,32>> subdiv_patches;
+  };
+  
+  typedef BVHN<4> BVH4;
+  typedef BVHN<8> BVH8;
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree/kernels/bvh/bvh4_factory.cpp
new file mode 100644
index 0000000000..890d5e7b7c
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh4_factory.cpp
@@ -0,0 +1,1325 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh4_factory.h"
+#include "../bvh/bvh.h"
+
+#include "../geometry/curveNv.h"
+#include "../geometry/curveNi.h"
+#include "../geometry/curveNi_mb.h"
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/subdivpatch1.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+#include "../common/accelinstance.h"
+
+namespace embree
+{
+  DECLARE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom);
+
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4i,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8i,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4v,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4iMB,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void);
+    
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1);
+  
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4);
+  
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8);
+  
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16);
+  
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  
+  DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+
+  BVH4Factory::BVH4Factory(int bfeatures, int ifeatures)
+  {
+    SELECT_SYMBOL_DEFAULT_AVX_AVX2(ifeatures,BVH4ColliderUserGeom);
+
+    selectBuilders(bfeatures);
+    selectIntersectors(ifeatures);
+  }
+
+  void BVH4Factory::selectBuilders(int features)
+  {
+    IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelTriangle4MeshSAH));
+    IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelTriangle4iMeshSAH));
+    IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelTriangle4vMeshSAH));
+    IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelQuadMeshSAH));
+    IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelVirtualSAH));
+    IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelInstanceSAH));
+
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4OBBCurve4iMBBuilder_OBB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4Curve8iBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4OBBCurve8iMBBuilder_OBB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedTriangle4iSceneBuilderSAH));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iMBSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedQuad4iSceneBuilderSAH));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderFastSpatialSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderFastSpatialSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderFastSpatialSAH));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderFastSpatialSAH));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualSceneBuilderSAH));
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualMBSceneBuilderSAH));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceSceneBuilderSAH));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH));
+    
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4SubdivPatch1BuilderSAH));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4SubdivPatch1MBBuilderSAH));
+  }
+
+  void BVH4Factory::selectIntersectors(int features)
+  {
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,VirtualCurveIntersector4i));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8i));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,VirtualCurveIntersector4v));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8v));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,VirtualCurveIntersector4iMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8iMB));
+    
+    /* select intersectors1 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector1MB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust1MB));
+    
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,BVH4Triangle4iIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,BVH4Triangle4vIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,BVH4Triangle4iIntersector1Pluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector1Moeller));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,QBVH4Triangle4iIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,QBVH4Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector1));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector1));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersector1));
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector1));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersector1));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector1));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector1Moeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridMBIntersector1Moeller))
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector1Pluecker));
+
+#if defined (EMBREE_RAY_PACKETS)
+
+    /* select intersectors4 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector4HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust4HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector4HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector4HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector4));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector4));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersector4Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector4Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersector4Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector4Chunk));
+    
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoeller));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector4HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridMBIntersector4HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector4HybridPluecker));
+
+    /* select intersectors8 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector8HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust8HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector8HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector8HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector8HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector8HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector8));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector8));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4VirtualIntersector8Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector8Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceIntersector8Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector8Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridIntersector8HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridMBIntersector8HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridIntersector8HybridPluecker));
+
+    /* select intersectors16 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersector16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersector16HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust16HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4Intersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4Intersector16HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4vIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iIntersector16HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4vMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4vMBIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4vIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4vIntersector16HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4vIntersector16HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iMBIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512(features,BVH4SubdivPatch1Intersector16));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512(features,BVH4SubdivPatch1MBIntersector16));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH4VirtualIntersector16Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH4VirtualMBIntersector16Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceIntersector16Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceMBIntersector16Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridIntersector16HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridMBIntersector16HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridIntersector16HybridPluecker));
+
+    /* select stream intersectors */
+    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4IntersectorStreamPacketFallback);
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4IntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4IntersectorStreamMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersectorStreamPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersectorStreamPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersectorStreamPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersectorStream));
+    
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersectorStream));
+
+#endif
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.leafIntersector = leafIntersector;
+      intersectors.intersector1  = BVH4OBBVirtualCurveIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4OBBVirtualCurveIntersector4Hybrid();
+      intersectors.intersector8  = BVH4OBBVirtualCurveIntersector8Hybrid();
+      intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16Hybrid();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.leafIntersector = leafIntersector;
+      intersectors.intersector1  = BVH4OBBVirtualCurveIntersectorRobust1();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4OBBVirtualCurveIntersectorRobust4Hybrid();
+      intersectors.intersector8  = BVH4OBBVirtualCurveIntersectorRobust8Hybrid();
+      intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16Hybrid();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    default: assert(false);
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.leafIntersector = leafIntersector;
+      intersectors.intersector1  = BVH4OBBVirtualCurveIntersector1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4OBBVirtualCurveIntersector4HybridMB();
+      intersectors.intersector8  = BVH4OBBVirtualCurveIntersector8HybridMB();
+      intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16HybridMB();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.leafIntersector = leafIntersector;
+      intersectors.intersector1  = BVH4OBBVirtualCurveIntersectorRobust1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4OBBVirtualCurveIntersectorRobust4HybridMB();
+      intersectors.intersector8  = BVH4OBBVirtualCurveIntersectorRobust8HybridMB();
+      intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16HybridMB();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    default: assert(false);
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    assert(ivariant == IntersectVariant::FAST);
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1           = BVH4Triangle4Intersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4_filter    = BVH4Triangle4Intersector4HybridMoeller();
+    intersectors.intersector4_nofilter  = BVH4Triangle4Intersector4HybridMoellerNoFilter();
+    intersectors.intersector8_filter    = BVH4Triangle4Intersector8HybridMoeller();
+    intersectors.intersector8_nofilter  = BVH4Triangle4Intersector8HybridMoellerNoFilter();
+    intersectors.intersector16_filter   = BVH4Triangle4Intersector16HybridMoeller();
+    intersectors.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter();
+    intersectors.intersectorN_filter    = BVH4Triangle4IntersectorStreamMoeller();
+    intersectors.intersectorN_nofilter  = BVH4Triangle4IntersectorStreamMoellerNoFilter();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    assert(ivariant == IntersectVariant::ROBUST);
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH4Triangle4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4Triangle4vIntersector4HybridPluecker();
+    intersectors.intersector8  = BVH4Triangle4vIntersector8HybridPluecker();
+    intersectors.intersector16 = BVH4Triangle4vIntersector16HybridPluecker();
+    intersectors.intersectorN  = BVH4Triangle4vIntersectorStreamPluecker();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH4Triangle4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4Triangle4iIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH4Triangle4iIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH4Triangle4iIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH4Triangle4iIntersectorStreamMoeller();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH4Triangle4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4Triangle4iIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH4Triangle4iIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH4Triangle4iIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH4Triangle4iIntersectorStreamPluecker();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH4Triangle4vMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4Triangle4vMBIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH4Triangle4vMBIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH4Triangle4vMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4Triangle4vMBIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH4Triangle4vMBIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH4Triangle4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4Triangle4iMBIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH4Triangle4iMBIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH4Triangle4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4Triangle4iMBIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH4Triangle4iMBIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1           = BVH4Quad4vIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4_filter    = BVH4Quad4vIntersector4HybridMoeller();
+      intersectors.intersector4_nofilter  = BVH4Quad4vIntersector4HybridMoellerNoFilter();
+      intersectors.intersector8_filter    = BVH4Quad4vIntersector8HybridMoeller();
+      intersectors.intersector8_nofilter  = BVH4Quad4vIntersector8HybridMoellerNoFilter();
+      intersectors.intersector16_filter   = BVH4Quad4vIntersector16HybridMoeller();
+      intersectors.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter();
+      intersectors.intersectorN_filter    = BVH4Quad4vIntersectorStreamMoeller();
+      intersectors.intersectorN_nofilter  = BVH4Quad4vIntersectorStreamMoellerNoFilter();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH4Quad4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4Quad4vIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH4Quad4vIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH4Quad4vIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH4Quad4vIntersectorStreamPluecker();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1 = BVH4Quad4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4 = BVH4Quad4iIntersector4HybridMoeller();
+      intersectors.intersector8 = BVH4Quad4iIntersector8HybridMoeller();
+      intersectors.intersector16= BVH4Quad4iIntersector16HybridMoeller();
+      intersectors.intersectorN = BVH4Quad4iIntersectorStreamMoeller();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1 = BVH4Quad4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4 = BVH4Quad4iIntersector4HybridPluecker();
+      intersectors.intersector8 = BVH4Quad4iIntersector8HybridPluecker();
+      intersectors.intersector16= BVH4Quad4iIntersector16HybridPluecker();
+      intersectors.intersectorN = BVH4Quad4iIntersectorStreamPluecker();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1 = BVH4Quad4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridMoeller();
+      intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridMoeller();
+      intersectors.intersector16= BVH4Quad4iMBIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1 = BVH4Quad4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridPluecker();
+      intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridPluecker();
+      intersectors.intersector16= BVH4Quad4iMBIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH4Factory::QBVH4Triangle4iIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1 = QBVH4Triangle4iIntersector1Pluecker();
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::QBVH4Quad4iIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1 = QBVH4Quad4iIntersector1Pluecker();
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4UserGeometryIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH4VirtualIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4VirtualIntersector4Chunk();
+    intersectors.intersector8  = BVH4VirtualIntersector8Chunk();
+    intersectors.intersector16 = BVH4VirtualIntersector16Chunk();
+    intersectors.intersectorN  = BVH4VirtualIntersectorStream();
+#endif
+    intersectors.collider      = BVH4ColliderUserGeom();
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4UserGeometryMBIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH4VirtualMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4VirtualMBIntersector4Chunk();
+    intersectors.intersector8  = BVH4VirtualMBIntersector8Chunk();
+    intersectors.intersector16 = BVH4VirtualMBIntersector16Chunk();
+    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4InstanceIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH4InstanceIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4InstanceIntersector4Chunk();
+    intersectors.intersector8  = BVH4InstanceIntersector8Chunk();
+    intersectors.intersector16 = BVH4InstanceIntersector16Chunk();
+    intersectors.intersectorN  = BVH4InstanceIntersectorStream();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4InstanceMBIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH4InstanceMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4InstanceMBIntersector4Chunk();
+    intersectors.intersector8  = BVH4InstanceMBIntersector8Chunk();
+    intersectors.intersector16 = BVH4InstanceMBIntersector16Chunk();
+    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return intersectors;
+  }
+  
+  Accel::Intersectors BVH4Factory::BVH4SubdivPatch1Intersectors(BVH4* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH4SubdivPatch1Intersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4SubdivPatch1Intersector4();
+    intersectors.intersector8  = BVH4SubdivPatch1Intersector8();
+    intersectors.intersector16 = BVH4SubdivPatch1Intersector16();
+    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4SubdivPatch1MBIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH4SubdivPatch1MBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4SubdivPatch1MBIntersector4();
+    intersectors.intersector8  = BVH4SubdivPatch1MBIntersector8();
+    intersectors.intersector16 = BVH4SubdivPatch1MBIntersector16();
+    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return intersectors;
+  }
+
+  Accel* BVH4Factory::BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Curve4i::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4i(),ivariant);
+
+    Builder* builder = nullptr;
+    if      (scene->device->hair_builder == "default"     ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0);
+    else if (scene->device->hair_builder == "sah"         ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4i>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+#if defined(EMBREE_TARGET_SIMD8)
+  Accel* BVH4Factory::BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Curve8i::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8i(),ivariant);
+
+    Builder* builder = nullptr;
+    if      (scene->device->hair_builder == "default"     ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0);
+    else if (scene->device->hair_builder == "sah"         ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8i>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+#endif
+
+  Accel* BVH4Factory::BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Curve4v::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4v(),ivariant);
+
+    Builder* builder = nullptr;
+    if      (scene->device->hair_builder == "default"     ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0);
+    else if (scene->device->hair_builder == "sah"         ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4v>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Curve4iMB::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector4iMB(),ivariant);
+
+    Builder* builder = nullptr;
+    if      (scene->device->hair_builder == "default"     ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0);
+    else if (scene->device->hair_builder == "sah"         ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4iMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+#if defined(EMBREE_TARGET_SIMD8)
+  Accel* BVH4Factory::BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Curve8iMB::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(), ivariant);
+
+    Builder* builder = nullptr;
+    if      (scene->device->hair_builder == "default"     ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0);
+    else if (scene->device->hair_builder == "sah"         ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8iMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+#endif
+  
+  Accel* BVH4Factory::BVH4Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Triangle4::type,scene);
+
+    Accel::Intersectors intersectors;
+    if      (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4Intersectors(accel,ivariant);
+    else if (scene->device->tri_traverser == "fast"   ) intersectors = BVH4Triangle4Intersectors(accel,IntersectVariant::FAST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4>");
+
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break;
+      }
+    }
+    else if (scene->device->tri_builder == "sah"         ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+    else if (scene->device->tri_builder == "dynamic"     ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false);
+    else if (scene->device->tri_builder == "morton"      ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,true);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Triangle4v::type,scene);
+
+    Accel::Intersectors intersectors;
+    if      (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4vIntersectors(accel,ivariant);
+    else if (scene->device->tri_traverser == "fast"   ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::FAST);
+    else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::ROBUST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4>");
+
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+      }
+    }
+    else if (scene->device->tri_builder == "sah"         ) builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+    else if (scene->device->tri_builder == "dynamic"     ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false);
+    else if (scene->device->tri_builder == "morton"      ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,true);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4v>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Triangle4i::type,scene);
+
+    Accel::Intersectors intersectors;
+    if      (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4iIntersectors(accel,ivariant);
+    else if (scene->device->tri_traverser == "fast"   ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::FAST);
+    else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::ROBUST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4i>");
+
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder == "default"     ) {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); break;
+      }
+    }
+    else if (scene->device->tri_builder == "sah"         ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+    else if (scene->device->tri_builder == "dynamic"     ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false);
+    else if (scene->device->tri_builder == "morton"      ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,true);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4i>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Triangle4i::type,scene);
+
+    Accel::Intersectors intersectors;
+    if      (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4iMBIntersectors(accel,ivariant);
+    else if (scene->device->tri_traverser_mb == "fast"   ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::FAST);
+    else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::ROBUST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4iMB>");
+
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder_mb == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else  if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4iMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Triangle4vMB::type,scene);
+
+    Accel::Intersectors intersectors;
+    if      (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4vMBIntersectors(accel,ivariant);
+    else if (scene->device->tri_traverser_mb == "fast"   ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::FAST);
+    else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::ROBUST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4vMB>");
+
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder_mb == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else  if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4vMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Quad4v::type,scene);
+    Accel::Intersectors intersectors = BVH4Quad4vIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->quad_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+      }
+    }
+    else if (scene->device->quad_builder == "sah"              ) builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0);
+    else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0);
+    else if (scene->device->quad_builder == "dynamic"          ) builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4v>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Quad4i::type,scene);
+    Accel::Intersectors intersectors = BVH4Quad4iIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->quad_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement
+      }
+    }
+    else if (scene->device->quad_builder == "sah") builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4i>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Quad4i::type,scene);
+    Accel::Intersectors intersectors = BVH4Quad4iMBIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->quad_builder_mb == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (scene->device->quad_builder_mb == "sah") builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH4<Quad4iMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4QuantizedQuad4i(Scene* scene)
+  {
+    BVH4* accel = new BVH4(Quad4i::type,scene);
+    Builder* builder = BVH4QuantizedQuad4iSceneBuilderSAH(accel,scene,0);
+    Accel::Intersectors intersectors = QBVH4Quad4iIntersectors(accel);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4QuantizedTriangle4i(Scene* scene)
+  {
+    BVH4* accel = new BVH4(Triangle4i::type,scene);
+    Builder* builder = BVH4QuantizedTriangle4iSceneBuilderSAH(accel,scene,0);
+    Accel::Intersectors intersectors = QBVH4Triangle4iIntersectors(accel);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4SubdivPatch1(Scene* scene)
+  {
+    BVH4* accel = new BVH4(SubdivPatch1::type,scene);
+    Accel::Intersectors intersectors = BVH4SubdivPatch1Intersectors(accel);
+    Builder* builder = BVH4SubdivPatch1BuilderSAH(accel,scene,0);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4SubdivPatch1MB(Scene* scene)
+  {
+    BVH4* accel = new BVH4(SubdivPatch1::type,scene);
+    Accel::Intersectors intersectors = BVH4SubdivPatch1MBIntersectors(accel);
+    Builder* builder = BVH4SubdivPatch1MBBuilderSAH(accel,scene,0);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4UserGeometry(Scene* scene, BuildVariant bvariant)
+  {
+    BVH4* accel = new BVH4(Object::type,scene);
+    Accel::Intersectors intersectors = BVH4UserGeometryIntersectors(accel);
+
+    Builder* builder = nullptr;
+    if (scene->device->object_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (scene->device->object_builder == "sah") builder = BVH4VirtualSceneBuilderSAH(accel,scene,0);
+    else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4UserGeometryMB(Scene* scene)
+  {
+    BVH4* accel = new BVH4(Object::type,scene);
+    Accel::Intersectors intersectors = BVH4UserGeometryMBIntersectors(accel);
+    Builder* builder = BVH4VirtualMBSceneBuilderSAH(accel,scene,0);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant)
+  {
+    BVH4* accel = new BVH4(InstancePrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH4InstanceIntersectors(accel);
+    auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP;
+    // Builder* builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype);
+
+    Builder* builder = nullptr;
+    if (scene->device->object_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (scene->device->object_builder == "sah") builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype);
+    else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4InstanceMB(Scene* scene, bool isExpensive)
+  {
+    BVH4* accel = new BVH4(InstancePrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH4InstanceMBIntersectors(accel);
+    auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP;
+    Builder* builder = BVH4InstanceMBSceneBuilderSAH(accel,scene,gtype);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    if (ivariant == IntersectVariant::FAST)
+    {
+      intersectors.intersector1  = BVH4GridIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4GridIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH4GridIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH4GridIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    }
+    else /* if (ivariant == IntersectVariant::ROBUST) */
+    {
+      intersectors.intersector1  = BVH4GridIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH4GridIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH4GridIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH4GridIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif      
+    }
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH4GridMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4GridMBIntersector4HybridMoeller();
+    intersectors.intersector8  = BVH4GridMBIntersector8HybridMoeller();
+    intersectors.intersector16 = BVH4GridMBIntersector16HybridMoeller();
+    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return intersectors;
+  }
+
+  Accel* BVH4Factory::BVH4Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(SubGridQBVH4::type,scene);
+    Accel::Intersectors intersectors = BVH4GridIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->object_builder == "default") {
+      builder = BVH4GridSceneBuilderSAH(accel,scene,0);
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4<GridMesh>");
+    
+    return new AccelInstance(accel,builder,intersectors);    
+  }
+
+  Accel* BVH4Factory::BVH4GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(SubGridQBVH4::type,scene);
+    Accel::Intersectors intersectors = BVH4GridMBIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->object_builder == "default") {
+      builder = BVH4GridMBSceneBuilderSAH(accel,scene,0);
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4MB<GridMesh>");
+    return new AccelInstance(accel,builder,intersectors);        
+  }
+
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh4_factory.h b/thirdparty/embree/kernels/bvh/bvh4_factory.h
new file mode 100644
index 0000000000..30973971a4
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh4_factory.h
@@ -0,0 +1,316 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_factory.h"
+
+namespace embree
+{
+  /*! BVH4 instantiations */
+  class BVH4Factory : public BVHFactory
+  {
+  public:
+    BVH4Factory(int bfeatures, int ifeatures);
+
+  public:
+    Accel* BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4i);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8i);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4v);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4iMB);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB);
+        
+    Accel* BVH4Triangle4   (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Triangle4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::ROBUST);
+    Accel* BVH4Triangle4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH4Quad4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Quad4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH4QuantizedTriangle4i(Scene* scene);
+    Accel* BVH4QuantizedQuad4i(Scene* scene);
+ 
+    Accel* BVH4SubdivPatch1(Scene* scene);
+    Accel* BVH4SubdivPatch1MB(Scene* scene);
+
+    Accel* BVH4UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH4UserGeometryMB(Scene* scene);
+
+    Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH4InstanceMB(Scene* scene, bool isExpensive);
+
+    Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+  private:
+    void selectBuilders(int features);
+    void selectIntersectors(int features);
+    
+  private:
+    Accel::Intersectors BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    Accel::Intersectors BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    
+    Accel::Intersectors BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors QBVH4Quad4iIntersectors(BVH4* bvh);
+    Accel::Intersectors QBVH4Triangle4iIntersectors(BVH4* bvh);
+
+    Accel::Intersectors BVH4UserGeometryIntersectors(BVH4* bvh);
+    Accel::Intersectors BVH4UserGeometryMBIntersectors(BVH4* bvh);
+
+    Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh);
+    Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh);
+    
+    Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh);
+    Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh);
+
+    Accel::Intersectors BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    
+  private:
+
+    DEFINE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
+        
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
+
+    // ==============
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
+
+    // ==============
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
+
+    // ==============
+
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
+    
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
+       
+    // SAH scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    // spatial scene builder
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    
+    // twolevel scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree/kernels/bvh/bvh8_factory.cpp
new file mode 100644
index 0000000000..d4521af241
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh8_factory.cpp
@@ -0,0 +1,1165 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../common/isa.h" // to define EMBREE_TARGET_SIMD8
+
+#if defined (EMBREE_TARGET_SIMD8)
+
+#include "bvh8_factory.h"
+#include "../bvh/bvh.h"
+
+#include "../geometry/curveNv.h"
+#include "../geometry/curveNi.h"
+#include "../geometry/curveNi_mb.h"
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/subdivpatch1.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+#include "../common/accelinstance.h"
+
+namespace embree
+{
+  DECLARE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom);
+  
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void);
+  
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+
+  BVH8Factory::BVH8Factory(int bfeatures, int ifeatures)
+  {
+    SELECT_SYMBOL_INIT_AVX(ifeatures,BVH8ColliderUserGeom);
+    
+    selectBuilders(bfeatures);
+    selectIntersectors(ifeatures);
+  }
+
+  void BVH8Factory::selectBuilders(int features)
+  {
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8Curve8vBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8OBBCurve8iMBBuilder_OBB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4SceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4vSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4iSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4iMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4vMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4iSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4SceneBuilderSAH));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4vSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4iSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4iMBSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedQuad4iSceneBuilderSAH));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualSceneBuilderSAH));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualMBSceneBuilderSAH));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH));
+    
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4SceneBuilderFastSpatialSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4vSceneBuilderFastSpatialSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4vSceneBuilderFastSpatialSAH));
+
+    IF_ENABLED_TRIS  (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelTriangle4MeshSAH));
+    IF_ENABLED_TRIS  (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelTriangle4vMeshSAH));
+    IF_ENABLED_TRIS  (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelTriangle4iMeshSAH));
+    IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelQuadMeshSAH));
+    IF_ENABLED_USER  (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelVirtualSAH));
+    IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelInstanceSAH));
+  }
+
+  void BVH8Factory::selectIntersectors(int features)
+  {
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8v));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8iMB));
+    
+    /* select intersectors1 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector1MB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust1MB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector1Pluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector1Woop));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iMBIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iMBIntersector1Pluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,QBVH8Triangle4iIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,QBVH8Triangle4Intersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,QBVH8Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersector1));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualMBIntersector1));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector1));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector1));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector1Moeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridMBIntersector1Moeller))
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector1Pluecker));
+
+#if defined (EMBREE_RAY_PACKETS)
+
+    /* select intersectors4 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector4HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust4HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector4HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector4HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector4HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector4HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersector4Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualMBIntersector4Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector4Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector4Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector4HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector4HybridPluecker));
+
+    /* select intersectors8 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector8HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust8HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector8HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector8HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector8HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector8HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersector8Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualMBIntersector8Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector8Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector8Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector8HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector8HybridPluecker));
+
+    /* select intersectors16 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersector16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersector16HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust16HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4Intersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4Intersector16HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4vIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iIntersector16HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4vMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4vMBIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4vIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4vIntersector16HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4vIntersector16HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iMBIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH8VirtualIntersector16Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH8VirtualMBIntersector16Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceIntersector16Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceMBIntersector16Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH8GridIntersector16HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH8GridIntersector16HybridPluecker));
+
+    /* select stream intersectors */
+
+    SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8IntersectorStreamPacketFallback);
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4IntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4IntersectorStreamMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersectorStreamPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersectorStreamPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersectorStreamPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersectorStream));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersectorStream));
+
+#endif
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.leafIntersector = leafIntersector;
+      intersectors.intersector1  = BVH8OBBVirtualCurveIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8OBBVirtualCurveIntersector4Hybrid();
+      intersectors.intersector8  = BVH8OBBVirtualCurveIntersector8Hybrid();
+      intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.leafIntersector = leafIntersector;
+      intersectors.intersector1  = BVH8OBBVirtualCurveIntersectorRobust1();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8OBBVirtualCurveIntersectorRobust4Hybrid();
+      intersectors.intersector8  = BVH8OBBVirtualCurveIntersectorRobust8Hybrid();
+      intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    default: assert(false);
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.leafIntersector = leafIntersector;
+      intersectors.intersector1  = BVH8OBBVirtualCurveIntersector1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8OBBVirtualCurveIntersector4HybridMB();
+      intersectors.intersector8  = BVH8OBBVirtualCurveIntersector8HybridMB();
+      intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.leafIntersector = leafIntersector;
+      intersectors.intersector1  = BVH8OBBVirtualCurveIntersectorRobust1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8OBBVirtualCurveIntersectorRobust4HybridMB();
+      intersectors.intersector8  = BVH8OBBVirtualCurveIntersectorRobust8HybridMB();
+      intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    default: assert(false);
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    assert(ivariant == IntersectVariant::FAST);
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1           = BVH8Triangle4Intersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4_filter    = BVH8Triangle4Intersector4HybridMoeller();
+    intersectors.intersector4_nofilter  = BVH8Triangle4Intersector4HybridMoellerNoFilter();
+    intersectors.intersector8_filter    = BVH8Triangle4Intersector8HybridMoeller();
+    intersectors.intersector8_nofilter  = BVH8Triangle4Intersector8HybridMoellerNoFilter();
+    intersectors.intersector16_filter   = BVH8Triangle4Intersector16HybridMoeller();
+    intersectors.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter();
+    intersectors.intersectorN_filter    = BVH8Triangle4IntersectorStreamMoeller();
+    intersectors.intersectorN_nofilter  = BVH8Triangle4IntersectorStreamMoellerNoFilter();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+#define ENABLE_WOOP_TEST 0
+#if ENABLE_WOOP_TEST == 0
+    //assert(ivariant == IntersectVariant::ROBUST);
+    intersectors.intersector1    = BVH8Triangle4vIntersector1Pluecker();
+#else
+    intersectors.intersector1    = BVH8Triangle4vIntersector1Woop();
+#endif
+
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4    = BVH8Triangle4vIntersector4HybridPluecker();
+    intersectors.intersector8    = BVH8Triangle4vIntersector8HybridPluecker();
+    intersectors.intersector16   = BVH8Triangle4vIntersector16HybridPluecker();
+    intersectors.intersectorN    = BVH8Triangle4vIntersectorStreamPluecker();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Triangle4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Triangle4iIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH8Triangle4iIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH8Triangle4iIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH8Triangle4iIntersectorStreamMoeller();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Triangle4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Triangle4iIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH8Triangle4iIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH8Triangle4iIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH8Triangle4iIntersectorStreamPluecker();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Triangle4vMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Triangle4vMBIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH8Triangle4vMBIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Triangle4vMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Triangle4vMBIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH8Triangle4vMBIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Triangle4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Triangle4iMBIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH8Triangle4iMBIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Triangle4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Triangle4iMBIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH8Triangle4iMBIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1           = BVH8Quad4vIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4_filter    = BVH8Quad4vIntersector4HybridMoeller();
+      intersectors.intersector4_nofilter  = BVH8Quad4vIntersector4HybridMoellerNoFilter();
+      intersectors.intersector8_filter    = BVH8Quad4vIntersector8HybridMoeller();
+      intersectors.intersector8_nofilter  = BVH8Quad4vIntersector8HybridMoellerNoFilter();
+      intersectors.intersector16_filter   = BVH8Quad4vIntersector16HybridMoeller();
+      intersectors.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter();
+      intersectors.intersectorN_filter    = BVH8Quad4vIntersectorStreamMoeller();
+      intersectors.intersectorN_nofilter  = BVH8Quad4vIntersectorStreamMoellerNoFilter();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Quad4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Quad4vIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH8Quad4vIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH8Quad4vIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH8Quad4vIntersectorStreamPluecker();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Quad4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Quad4iIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH8Quad4iIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH8Quad4iIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH8Quad4iIntersectorStreamMoeller();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Quad4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Quad4iIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH8Quad4iIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH8Quad4iIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH8Quad4iIntersectorStreamPluecker();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    switch (ivariant) {
+    case IntersectVariant::FAST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Quad4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Quad4iMBIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH8Quad4iMBIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    case IntersectVariant::ROBUST:
+    {
+      Accel::Intersectors intersectors;
+      intersectors.ptr = bvh;
+      intersectors.intersector1  = BVH8Quad4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8Quad4iMBIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH8Quad4iMBIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return intersectors;
+    }
+    }
+    return Accel::Intersectors();
+  }
+
+  Accel::Intersectors BVH8Factory::QBVH8Triangle4iIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1 = QBVH8Triangle4iIntersector1Pluecker();
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::QBVH8Triangle4Intersectors(BVH8* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1 = QBVH8Triangle4Intersector1Moeller();
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::QBVH8Quad4iIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1 = QBVH8Quad4iIntersector1Pluecker();
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8UserGeometryIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH8VirtualIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH8VirtualIntersector4Chunk();
+    intersectors.intersector8  = BVH8VirtualIntersector8Chunk();
+    intersectors.intersector16 = BVH8VirtualIntersector16Chunk();
+    intersectors.intersectorN  = BVH8VirtualIntersectorStream();
+#endif
+    intersectors.collider      = BVH8ColliderUserGeom();
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8UserGeometryMBIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH8VirtualMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH8VirtualMBIntersector4Chunk();
+    intersectors.intersector8  = BVH8VirtualMBIntersector8Chunk();
+    intersectors.intersector16 = BVH8VirtualMBIntersector16Chunk();
+    intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8InstanceIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH8InstanceIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH8InstanceIntersector4Chunk();
+    intersectors.intersector8  = BVH8InstanceIntersector8Chunk();
+    intersectors.intersector16 = BVH8InstanceIntersector16Chunk();
+    intersectors.intersectorN  = BVH8InstanceIntersectorStream();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8InstanceMBIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH8InstanceMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH8InstanceMBIntersector4Chunk();
+    intersectors.intersector8  = BVH8InstanceMBIntersector8Chunk();
+    intersectors.intersector16 = BVH8InstanceMBIntersector16Chunk();
+    intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    return intersectors;
+  }
+
+  Accel* BVH8Factory::BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Curve8v::type,scene);
+    Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8v(),ivariant);
+    Builder* builder = BVH8Curve8vBuilder_OBB_New(accel,scene,0);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Curve8iMB::type,scene);
+    Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(),ivariant);
+    Builder* builder = BVH8OBBCurve8iMBBuilder_OBB(accel,scene,0);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Triangle4::type,scene);
+    Accel::Intersectors intersectors= BVH8Triangle4Intersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder == "default")  {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break;
+      }
+    }
+    else if (scene->device->tri_builder == "sah"         )  builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_fast_spatial")  builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_presplit")     builder = BVH8Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+    else if (scene->device->tri_builder == "dynamic"     ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false);
+    else if (scene->device->tri_builder == "morton"     ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,true);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Triangle4v::type,scene);
+    Accel::Intersectors intersectors= BVH8Triangle4vIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder == "default")  {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Triangle4vSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+      }
+    }
+    else if (scene->device->tri_builder == "sah_fast_spatial")  builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4v>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Triangle4i::type,scene);
+    Accel::Intersectors intersectors = BVH8Triangle4iIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Triangle4iSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement
+      }
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4i>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Triangle4i::type,scene);
+    Accel::Intersectors intersectors = BVH8Triangle4iMBIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder_mb == "default") { // FIXME: implement
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (scene->device->tri_builder_mb == "internal_time_splits")  builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4iMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Triangle4vMB::type,scene);
+    Accel::Intersectors intersectors= BVH8Triangle4vMBIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder_mb == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (scene->device->tri_builder_mb == "internal_time_splits")  builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4vMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8QuantizedTriangle4i(Scene* scene)
+  {
+    BVH8* accel = new BVH8(Triangle4i::type,scene);
+    Accel::Intersectors intersectors = QBVH8Triangle4iIntersectors(accel);
+    Builder* builder = BVH8QuantizedTriangle4iSceneBuilderSAH(accel,scene,0);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8QuantizedTriangle4(Scene* scene)
+  {
+    BVH8* accel = new BVH8(Triangle4::type,scene);
+    Accel::Intersectors intersectors = QBVH8Triangle4Intersectors(accel);
+    Builder* builder = BVH8QuantizedTriangle4SceneBuilderSAH(accel,scene,0);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Quad4v::type,scene);
+    Accel::Intersectors intersectors = BVH8Quad4vIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->quad_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Quad4vSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+      }
+    }
+    else if (scene->device->quad_builder == "dynamic"      ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false);
+    else if (scene->device->quad_builder == "morton"       ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,true);
+    else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4v>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Quad4i::type,scene);
+    Accel::Intersectors intersectors = BVH8Quad4iIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->quad_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Quad4iSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement
+      }
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4i>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Quad4i::type,scene);
+    Accel::Intersectors intersectors = BVH8Quad4iMBIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->quad_builder_mb == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Quad4iMBSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH8<Quad4i>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8QuantizedQuad4i(Scene* scene)
+  {
+    BVH8* accel = new BVH8(Quad4i::type,scene);
+    Accel::Intersectors intersectors = QBVH8Quad4iIntersectors(accel);
+    Builder* builder = nullptr;
+    if      (scene->device->quad_builder == "default"     ) builder = BVH8QuantizedQuad4iSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for QBVH8<Quad4i>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8UserGeometry(Scene* scene, BuildVariant bvariant)
+  {
+    BVH8* accel = new BVH8(Object::type,scene);
+    Accel::Intersectors intersectors = BVH8UserGeometryIntersectors(accel);
+
+    Builder* builder = nullptr;
+    if (scene->device->object_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (scene->device->object_builder == "sah") builder = BVH8VirtualSceneBuilderSAH(accel,scene,0);
+    else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8UserGeometryMB(Scene* scene)
+  {
+    BVH8* accel = new BVH8(Object::type,scene);
+    Accel::Intersectors intersectors = BVH8UserGeometryMBIntersectors(accel);
+    Builder* builder = BVH8VirtualMBSceneBuilderSAH(accel,scene,0);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant)
+  {
+    BVH8* accel = new BVH8(InstancePrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH8InstanceIntersectors(accel);
+    auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; 
+    // Builder* builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);
+
+    Builder* builder = nullptr;
+    if (scene->device->object_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);; break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (scene->device->object_builder == "sah") builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);
+    else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive)
+  {
+    BVH8* accel = new BVH8(InstancePrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH8InstanceMBIntersectors(accel);
+    auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; 
+    Builder* builder = BVH8InstanceMBSceneBuilderSAH(accel,scene,gtype);
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    if (ivariant == IntersectVariant::FAST)
+    {
+      intersectors.intersector1  = BVH8GridIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8GridIntersector4HybridMoeller();
+      intersectors.intersector8  = BVH8GridIntersector8HybridMoeller();
+      intersectors.intersector16 = BVH8GridIntersector16HybridMoeller();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    }
+    else /* if (ivariant == IntersectVariant::ROBUST) */
+    {
+      intersectors.intersector1  = BVH8GridIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      intersectors.intersector4  = BVH8GridIntersector4HybridPluecker();
+      intersectors.intersector8  = BVH8GridIntersector8HybridPluecker();
+      intersectors.intersector16 = BVH8GridIntersector16HybridPluecker();
+      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif            
+    }
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH8GridMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = nullptr;
+    intersectors.intersector8  = nullptr;
+    intersectors.intersector16 = nullptr;
+    intersectors.intersectorN  = nullptr;
+#endif
+    return intersectors;
+  }
+
+  Accel* BVH8Factory::BVH8Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(SubGridQBVH8::type,scene);
+    Accel::Intersectors intersectors = BVH8GridIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->grid_builder == "default") {
+      builder = BVH8GridSceneBuilderSAH(accel,scene,0);
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<GridMesh>");
+
+    return new AccelInstance(accel,builder,intersectors);    
+  }
+
+  Accel* BVH8Factory::BVH8GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(SubGridQBVH8::type,scene);
+    Accel::Intersectors intersectors = BVH8GridMBIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->grid_builder_mb == "default") {
+      builder = BVH8GridMBSceneBuilderSAH(accel,scene,0);
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8MB<GridMesh>");
+    return new AccelInstance(accel,builder,intersectors);        
+  }
+}
+
+#endif
diff --git a/thirdparty/embree/kernels/bvh/bvh8_factory.h b/thirdparty/embree/kernels/bvh/bvh8_factory.h
new file mode 100644
index 0000000000..198d6f1df0
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh8_factory.h
@@ -0,0 +1,280 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_factory.h"
+
+namespace embree
+{
+  /*! BVH8 instantiations */
+  class BVH8Factory : public BVHFactory
+  {
+  public:
+    BVH8Factory(int bfeatures, int ifeatures);
+
+  public:
+    Accel* BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB);
+    
+    Accel* BVH8Triangle4   (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH8Quad4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Quad4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH8QuantizedTriangle4i(Scene* scene);
+    Accel* BVH8QuantizedTriangle4(Scene* scene);
+    Accel* BVH8QuantizedQuad4i(Scene* scene);
+
+    Accel* BVH8UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH8UserGeometryMB(Scene* scene);
+
+    Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH8InstanceMB(Scene* scene, bool isExpensive);
+
+    Accel* BVH8Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+  private:
+    void selectBuilders(int features);
+    void selectIntersectors(int features);
+
+  private:
+    Accel::Intersectors BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    Accel::Intersectors BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    
+    Accel::Intersectors BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors QBVH8Triangle4iIntersectors(BVH8* bvh);
+    Accel::Intersectors QBVH8Triangle4Intersectors(BVH8* bvh);
+    Accel::Intersectors QBVH8Quad4iIntersectors(BVH8* bvh);
+
+    Accel::Intersectors BVH8UserGeometryIntersectors(BVH8* bvh);
+    Accel::Intersectors BVH8UserGeometryMBIntersectors(BVH8* bvh);
+
+    Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh);
+    Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh);
+
+    Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+  private:
+    DEFINE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
+    
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
+    
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
+   
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
+   
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
+    
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
+
+    // SAH scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+ 
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ 
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    // SAH spatial scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+    // twolevel scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder.cpp b/thirdparty/embree/kernels/bvh/bvh_builder.cpp
new file mode 100644
index 0000000000..161d01bb5c
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_builder.cpp
@@ -0,0 +1,60 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_builder.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N>
+    typename BVHN<N>::NodeRef BVHNBuilderVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings)
+    {
+      auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef {
+        return createLeaf(prims,set,alloc);
+      };
+      
+      settings.branchingFactor = N;
+      settings.maxDepth = BVH::maxBuildDepthLeaf;
+      return BVHBuilderBinnedSAH::build<NodeRef>
+        (FastAllocator::Create(allocator),typename BVH::AABBNode::Create2(),typename BVH::AABBNode::Set3(allocator,prims),createLeafFunc,progressFunc,prims,pinfo,settings);
+    }
+
+
+    template<int N>
+    typename BVHN<N>::NodeRef BVHNBuilderQuantizedVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings)
+    {
+      auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef {
+        return createLeaf(prims,set,alloc);
+      };
+            
+      settings.branchingFactor = N;
+      settings.maxDepth = BVH::maxBuildDepthLeaf;
+      return BVHBuilderBinnedSAH::build<NodeRef>
+        (FastAllocator::Create(allocator),typename BVH::QuantizedNode::Create2(),typename BVH::QuantizedNode::Set2(),createLeafFunc,progressFunc,prims,pinfo,settings);
+    }
+
+    template<int N>
+    typename BVHN<N>::NodeRecordMB BVHNBuilderMblurVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange)
+    {
+      auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRecordMB {
+        return createLeaf(prims,set,alloc);
+      };
+
+      settings.branchingFactor = N;
+      settings.maxDepth = BVH::maxBuildDepthLeaf;
+      return BVHBuilderBinnedSAH::build<NodeRecordMB>
+        (FastAllocator::Create(allocator),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::SetTimeRange(timeRange),createLeafFunc,progressFunc,prims,pinfo,settings);
+    }
+
+    template struct BVHNBuilderVirtual<4>;
+    template struct BVHNBuilderQuantizedVirtual<4>;
+    template struct BVHNBuilderMblurVirtual<4>;    
+
+#if defined(__AVX__)
+    template struct BVHNBuilderVirtual<8>;
+    template struct BVHNBuilderQuantizedVirtual<8>;
+    template struct BVHNBuilderMblurVirtual<8>;
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder.h b/thirdparty/embree/kernels/bvh/bvh_builder.h
new file mode 100644
index 0000000000..e35d052a62
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_builder.h
@@ -0,0 +1,115 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/bvh_builder_msmblur.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    template<int N>
+      struct BVHNBuilderVirtual
+      {
+        typedef BVHN<N> BVH;
+        typedef typename BVH::NodeRef NodeRef;
+        typedef FastAllocator::CachedAllocator Allocator;
+      
+        struct BVHNBuilderV {
+          NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings);
+          virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+        };
+
+        template<typename CreateLeafFunc>
+        struct BVHNBuilderT : public BVHNBuilderV
+        {
+          BVHNBuilderT (CreateLeafFunc createLeafFunc)
+            : createLeafFunc(createLeafFunc) {}
+
+          NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) {
+            return createLeafFunc(prims,set,alloc);
+          }
+
+        private:
+          CreateLeafFunc createLeafFunc;
+        };
+
+        template<typename CreateLeafFunc>
+        static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) {
+          return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings);
+        }
+      };
+
+    template<int N>
+      struct BVHNBuilderQuantizedVirtual
+      {
+        typedef BVHN<N> BVH;
+        typedef typename BVH::NodeRef NodeRef;
+        typedef FastAllocator::CachedAllocator Allocator;
+      
+        struct BVHNBuilderV {
+          NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings);
+          virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+        };
+
+        template<typename CreateLeafFunc>
+        struct BVHNBuilderT : public BVHNBuilderV
+        {
+          BVHNBuilderT (CreateLeafFunc createLeafFunc)
+            : createLeafFunc(createLeafFunc) {}
+
+          NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) {
+            return createLeafFunc(prims,set,alloc);
+          }
+
+        private:
+          CreateLeafFunc createLeafFunc;
+        };
+
+        template<typename CreateLeafFunc>
+        static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) {
+          return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings);
+        }
+      };
+
+    template<int N>
+      struct BVHNBuilderMblurVirtual
+      {
+        typedef BVHN<N> BVH;
+        typedef typename BVH::AABBNodeMB AABBNodeMB;
+        typedef typename BVH::NodeRef NodeRef;
+        typedef typename BVH::NodeRecordMB NodeRecordMB;
+        typedef FastAllocator::CachedAllocator Allocator;
+      
+        struct BVHNBuilderV {
+          NodeRecordMB build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange);
+          virtual NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+        };
+
+        template<typename CreateLeafFunc>
+        struct BVHNBuilderT : public BVHNBuilderV
+        {
+          BVHNBuilderT (CreateLeafFunc createLeafFunc)
+            : createLeafFunc(createLeafFunc) {}
+
+          NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) {
+            return createLeafFunc(prims,set,alloc);
+          }
+
+        private:
+          CreateLeafFunc createLeafFunc;
+        };
+
+        template<typename CreateLeafFunc>
+        static NodeRecordMB build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) {
+          return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings,timeRange);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp
new file mode 100644
index 0000000000..4a4d8d71df
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp
@@ -0,0 +1,531 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_statistics.h"
+#include "bvh_rotate.h"
+#include "../common/profile.h"
+#include "../../common/algorithms/parallel_prefix_sum.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/bvh_builder_morton.h"
+
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+#if defined(__64BIT__)
+#  define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform
+#else
+#  define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
+#endif
+
+namespace embree 
+{
+  namespace isa
+  {
+    template<int N>
+    struct SetBVHNBounds
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+      typedef typename BVH::AABBNode AABBNode;
+
+      BVH* bvh;
+      __forceinline SetBVHNBounds (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRecord operator() (NodeRef ref, const NodeRecord* children, size_t num)
+      {
+        AABBNode* node = ref.getAABBNode();
+
+        BBox3fa res = empty;
+        for (size_t i=0; i<num; i++) {
+          const BBox3fa b = children[i].bounds;
+          res.extend(b);
+          node->setRef(i,children[i].ref);
+          node->setBounds(i,b);
+        }
+
+        BBox3fx result = (BBox3fx&)res;
+#if ROTATE_TREE
+        if (N == 4)
+        {
+          size_t n = 0;
+          for (size_t i=0; i<num; i++)
+            n += children[i].bounds.lower.a;
+
+          if (n >= 4096) {
+            for (size_t i=0; i<num; i++) {
+              if (children[i].bounds.lower.a < 4096) {
+                for (int j=0; j<ROTATE_TREE; j++)
+                  BVHNRotate<N>::rotate(node->child(i));
+                node->child(i).setBarrier();
+              }
+            }
+          }
+          result.lower.a = unsigned(n);
+        }
+#endif
+
+        return NodeRecord(ref,result);
+      }
+    };
+
+    template<int N, typename Primitive>
+    struct CreateMortonLeaf;
+
+    template<int N>
+    struct CreateMortonLeaf<N,Triangle4>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 lower(pos_inf);
+        vfloat4 upper(neg_inf);
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items<=4);
+        
+        /* allocate leaf node */
+        Triangle4* accel = (Triangle4*) alloc.malloc1(sizeof(Triangle4),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,1);
+        vuint4 vgeomID = -1, vprimID = -1;
+        Vec3vf4 v0 = zero, v1 = zero, v2 = zero;
+        const TriangleMesh* __restrict__ const mesh = this->mesh;
+
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int primID = morton[start+i].index;
+          const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+          const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+          const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+          const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+          lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          vgeomID [i] = geomID_;
+          vprimID [i] = primID;
+          v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+          v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+          v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        }
+
+        Triangle4::store_nt(accel,Triangle4(v0,v1,v2,vgeomID,vprimID));
+        BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = unsigned(current.size());
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    
+    private:
+      TriangleMesh* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+    
+    template<int N>
+    struct CreateMortonLeaf<N,Triangle4v>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 lower(pos_inf);
+        vfloat4 upper(neg_inf);
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items<=4);
+        
+        /* allocate leaf node */
+        Triangle4v* accel = (Triangle4v*) alloc.malloc1(sizeof(Triangle4v),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,1);       
+        vuint4 vgeomID = -1, vprimID = -1;
+        Vec3vf4 v0 = zero, v1 = zero, v2 = zero;
+        const TriangleMesh* __restrict__ mesh = this->mesh;
+
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int primID = morton[start+i].index;
+          const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+          const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+          const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+          const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+          lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          vgeomID [i] = geomID_;
+          vprimID [i] = primID;
+          v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+          v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+          v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        }
+        Triangle4v::store_nt(accel,Triangle4v(v0,v1,v2,vgeomID,vprimID));
+        BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size();
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      TriangleMesh* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,Triangle4i>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 lower(pos_inf);
+        vfloat4 upper(neg_inf);
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items<=4);
+        
+        /* allocate leaf node */
+        Triangle4i* accel = (Triangle4i*) alloc.malloc1(sizeof(Triangle4i),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,1);
+        
+        vuint4 v0 = zero, v1 = zero, v2 = zero;
+        vuint4 vgeomID = -1, vprimID = -1;
+        const TriangleMesh* __restrict__ const mesh = this->mesh;
+        
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int primID = morton[start+i].index;
+          const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+          const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+          const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+          const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+          lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          vgeomID[i] = geomID_;
+          vprimID[i] = primID;
+          unsigned int int_stride = mesh->vertices0.getStride()/4;
+          v0[i] = tri.v[0] * int_stride; 
+          v1[i] = tri.v[1] * int_stride;
+          v2[i] = tri.v[2] * int_stride;
+        }
+        
+        for (size_t i=items; i<4; i++)
+        {
+          vgeomID[i] = vgeomID[0];
+          vprimID[i] = -1;
+          v0[i] = 0;
+          v1[i] = 0; 
+          v2[i] = 0;
+        }
+        Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID));
+        BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size();
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      TriangleMesh* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,Quad4v>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (QuadMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 lower(pos_inf);
+        vfloat4 upper(neg_inf);
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items<=4);
+        
+        /* allocate leaf node */
+        Quad4v* accel = (Quad4v*) alloc.malloc1(sizeof(Quad4v),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,1);
+        
+        vuint4 vgeomID = -1, vprimID = -1;
+        Vec3vf4 v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+        const QuadMesh* __restrict__ mesh = this->mesh;
+
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int primID = morton[start+i].index;
+          const QuadMesh::Quad& tri = mesh->quad(primID);
+          const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+          const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+          const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+          const Vec3fa& p3 = mesh->vertex(tri.v[3]);
+          lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3);
+          upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3);
+          vgeomID [i] = geomID_;
+          vprimID [i] = primID;
+          v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+          v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+          v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+          v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+        }
+        Quad4v::store_nt(accel,Quad4v(v0,v1,v2,v3,vgeomID,vprimID));
+        BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size();
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      QuadMesh* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,Object>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (UserGeometry* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 lower(pos_inf);
+        vfloat4 upper(neg_inf);
+        size_t items = current.size();
+        size_t start = current.begin();
+        
+        /* allocate leaf node */
+        Object* accel = (Object*) alloc.malloc1(items*sizeof(Object),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,items);
+        const UserGeometry* mesh = this->mesh;
+        
+        BBox3fa bounds = empty;
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int index = morton[start+i].index;
+          const unsigned int primID = index; 
+          bounds.extend(mesh->bounds(primID));
+          new (&accel[i]) Object(geomID_,primID);
+        }
+
+        BBox3fx box_o = (BBox3fx&)bounds;
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size();
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      UserGeometry* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,InstancePrimitive>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (Instance* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 lower(pos_inf);
+        vfloat4 upper(neg_inf);
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items <= 1);
+        
+        /* allocate leaf node */
+        InstancePrimitive* accel = (InstancePrimitive*) alloc.malloc1(items*sizeof(InstancePrimitive),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,items);
+        const Instance* instance = this->mesh;
+        
+        BBox3fa bounds = empty;
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int primID = morton[start+i].index; 
+          bounds.extend(instance->bounds(primID));
+          new (&accel[i]) InstancePrimitive(instance, geomID_);
+        }
+
+        BBox3fx box_o = (BBox3fx&)bounds;
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size();
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      Instance* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
+    template<typename Mesh>
+    struct CalculateMeshBounds
+    {
+      __forceinline CalculateMeshBounds (Mesh* mesh)
+        : mesh(mesh) {}
+      
+      __forceinline const BBox3fa operator() (const BVHBuilderMorton::BuildPrim& morton) {
+        return mesh->bounds(morton.index);
+      }
+      
+    private:
+      Mesh* mesh;
+    };        
+
+    template<int N, typename Mesh, typename Primitive>
+    class BVHNMeshBuilderMorton : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+    public:
+      
+      BVHNMeshBuilderMorton (BVH* bvh, Mesh* mesh, unsigned int geomID, const size_t minLeafSize, const size_t maxLeafSize, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD)
+        : bvh(bvh), mesh(mesh), morton(bvh->device,0), settings(N,BVH::maxBuildDepth,minLeafSize,min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks),singleThreadThreshold), geomID_(geomID) {}
+      
+      /* build function */
+      void build() 
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+          morton.clear();
+        }
+        size_t numPrimitives = mesh->size();
+        numPreviousPrimitives = numPrimitives;
+        
+        /* skip build for empty scene */
+        if (numPrimitives == 0) {
+          bvh->set(BVH::emptyNode,empty,0);
+          return;
+        }
+        
+        /* preallocate arrays */
+        morton.resize(numPrimitives);
+        size_t bytesEstimated = numPrimitives*sizeof(AABBNode)/(4*N) + size_t(1.2f*Primitive::blocks(numPrimitives)*sizeof(Primitive));
+        size_t bytesMortonCodes = numPrimitives*sizeof(BVHBuilderMorton::BuildPrim);
+        bytesEstimated = max(bytesEstimated,bytesMortonCodes); // the first allocation block is reused to sort the morton codes
+        bvh->alloc.init(bytesMortonCodes,bytesMortonCodes,bytesEstimated);
+
+        /* create morton code array */
+        BVHBuilderMorton::BuildPrim* dest = (BVHBuilderMorton::BuildPrim*) bvh->alloc.specialAlloc(bytesMortonCodes);
+        size_t numPrimitivesGen = createMortonCodeArray<Mesh>(mesh,morton,bvh->scene->progressInterface);
+
+        /* create BVH */
+        SetBVHNBounds<N> setBounds(bvh);
+        CreateMortonLeaf<N,Primitive> createLeaf(mesh,geomID_,morton.data());
+        CalculateMeshBounds<Mesh> calculateBounds(mesh);
+        auto root = BVHBuilderMorton::build<NodeRecord>(
+          typename BVH::CreateAlloc(bvh), 
+          typename BVH::AABBNode::Create(),
+          setBounds,createLeaf,calculateBounds,bvh->scene->progressInterface,
+          morton.data(),dest,numPrimitivesGen,settings);
+        
+        bvh->set(root.ref,LBBox3fa(root.bounds),numPrimitives);
+        
+#if ROTATE_TREE
+        if (N == 4)
+        {
+          for (int i=0; i<ROTATE_TREE; i++)
+            BVHNRotate<N>::rotate(bvh->root);
+          bvh->clearBarrier(bvh->root);
+        }
+#endif
+
+        /* clear temporary data for static geometry */
+        if (bvh->scene->isStaticAccel()) {
+          morton.clear();
+        }
+        bvh->cleanup();
+      }
+      
+      void clear() {
+        morton.clear();
+      }
+      
+    private:
+      BVH* bvh;
+      Mesh* mesh;
+      mvector<BVHBuilderMorton::BuildPrim> morton;
+      BVHBuilderMorton::Settings settings;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+    };
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH4Triangle4MeshBuilderMortonGeneral  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4> ((BVH4*)bvh,mesh,geomID,4,4); }
+    Builder* BVH4Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4v>((BVH4*)bvh,mesh,geomID,4,4); }
+    Builder* BVH4Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4i>((BVH4*)bvh,mesh,geomID,4,4); }
+#if defined(__AVX__)
+    Builder* BVH8Triangle4MeshBuilderMortonGeneral  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4> ((BVH8*)bvh,mesh,geomID,4,4); }
+    Builder* BVH8Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4v>((BVH8*)bvh,mesh,geomID,4,4); }
+    Builder* BVH8Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4i>((BVH8*)bvh,mesh,geomID,4,4); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH4Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,QuadMesh,Quad4v>((BVH4*)bvh,mesh,geomID,4,4); }
+#if defined(__AVX__)
+    Builder* BVH8Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,QuadMesh,Quad4v>((BVH8*)bvh,mesh,geomID,4,4); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH4VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,UserGeometry,Object>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); }
+#if defined(__AVX__)
+    Builder* BVH8VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,UserGeometry,Object>((BVH8*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); }    
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); }
+#if defined(__AVX__)
+    Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); }    
+#endif
+#endif
+
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp
new file mode 100644
index 0000000000..fad02fcc04
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp
@@ -0,0 +1,543 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+#include "../builders/primrefgen.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+#define PROFILE 0
+#define PROFILE_RUNS 20
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Primitive>
+    struct CreateLeaf
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeaf (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t n = set.size();
+        size_t items = Primitive::blocks(n);
+        size_t start = set.begin();
+        Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+        typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items);
+        for (size_t i=0; i<items; i++) {
+          accel[i].fill(prims,start,set.end(),bvh->scene);
+        }
+        return node;
+      }
+
+      BVH* bvh;
+    };
+
+
+    template<int N, typename Primitive>
+    struct CreateLeafQuantized
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeafQuantized (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t n = set.size();
+        size_t items = Primitive::blocks(n);
+        size_t start = set.begin();
+        Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+        typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items);
+        for (size_t i=0; i<items; i++) {
+          accel[i].fill(prims,start,set.end(),bvh->scene);
+        }
+        return node;
+      }
+
+      BVH* bvh;
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    template<int N, typename Primitive>
+    struct BVHNBuilderSAH : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+
+      BVH* bvh;
+      Scene* scene;
+      Geometry* mesh;
+      mvector<PrimRef> prims;
+      GeneralBVHBuilder::Settings settings;
+      Geometry::GTypeMask gtype_;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max ();
+      bool primrefarrayalloc;
+      unsigned int numPreviousPrimitives = 0;
+
+      BVHNBuilderSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize,
+                      const Geometry::GTypeMask gtype, bool primrefarrayalloc = false)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0),
+          settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), primrefarrayalloc(primrefarrayalloc) {}
+
+      BVHNBuilderSAH (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID), primrefarrayalloc(false) {}
+
+      // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+      void build()
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+        /* if we use the primrefarray for allocations we have to take it back from the BVH */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.unshare(prims);
+
+	/* skip build for empty scene */
+        const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false);
+        numPreviousPrimitives = numPrimitives;
+        if (numPrimitives == 0) {
+          bvh->clear();
+          prims.clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH");
+
+#if PROFILE
+        profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+
+            /* create primref array */
+            if (primrefarrayalloc) {
+              settings.primrefarrayalloc = numPrimitives/1000;
+              if (settings.primrefarrayalloc < 1000)
+                settings.primrefarrayalloc = inf;
+            }
+
+            /* enable os_malloc for two level build */
+            if (mesh)
+              bvh->alloc.setOSallocation(true);
+
+            /* initialize allocator */
+            const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N);
+            const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive));
+            bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+            settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+            prims.resize(numPrimitives); 
+
+            PrimInfo pinfo = mesh ?
+              createPrimRefArray(mesh,geomID_,numPrimitives,prims,bvh->scene->progressInterface) :
+              createPrimRefArray(scene,gtype_,false,numPrimitives,prims,bvh->scene->progressInterface);
+
+            /* pinfo might has zero size due to invalid geometry */
+            if (unlikely(pinfo.size() == 0))
+            {
+              bvh->clear();
+              prims.clear();
+              return;
+            }
+
+            /* call BVH builder */
+            NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeaf<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+            bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+            bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+#if PROFILE
+          });
+#endif
+
+        /* if we allocated using the primrefarray we have to keep it alive */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.share(prims);
+
+        /* for static geometries we can do some cleanups */
+        else if (scene && scene->isStaticAccel()) {
+          prims.clear();
+        }
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() {
+        prims.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    template<int N, typename Primitive>
+    struct BVHNBuilderSAHQuantized : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+
+      BVH* bvh;
+      Scene* scene;
+      Geometry* mesh;
+      mvector<PrimRef> prims;
+      GeneralBVHBuilder::Settings settings;
+      Geometry::GTypeMask gtype_;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+
+      BVHNBuilderSAHQuantized (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype) {}
+
+      BVHNBuilderSAHQuantized (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID) {}
+
+      // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+      void build()
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+	/* skip build for empty scene */
+        const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false);
+        numPreviousPrimitives = numPrimitives;
+        if (numPrimitives == 0) {
+          prims.clear();
+          bvh->clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::QBVH" + toString(N) + "BuilderSAH");
+
+#if PROFILE
+        profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+            /* create primref array */
+            prims.resize(numPrimitives);
+            PrimInfo pinfo = mesh ?
+              createPrimRefArray(mesh,geomID_,numPrimitives,prims,bvh->scene->progressInterface) :
+	      createPrimRefArray(scene,gtype_,false,numPrimitives,prims,bvh->scene->progressInterface);
+
+            /* enable os_malloc for two level build */
+            if (mesh)
+              bvh->alloc.setOSallocation(true);
+
+            /* call BVH builder */
+            const size_t node_bytes = numPrimitives*sizeof(typename BVH::QuantizedNode)/(4*N);
+            const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive));
+            bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+            settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+            NodeRef root = BVHNBuilderQuantizedVirtual<N>::build(&bvh->alloc,CreateLeafQuantized<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+            bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+            //bvh->layoutLargeNodes(pinfo.size()*0.005f); // FIXME: COPY LAYOUT FOR LARGE NODES !!!
+#if PROFILE
+          });
+#endif
+
+	/* clear temporary data for static geometry */
+	if (scene && scene->isStaticAccel()) {
+          prims.clear();
+        }
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() {
+        prims.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+
+    template<int N, typename Primitive>
+    struct CreateLeafGrid
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeafGrid (BVH* bvh, const SubGridBuildData * const sgrids) : bvh(bvh),sgrids(sgrids) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const size_t items = set.size(); //Primitive::blocks(n);
+        const size_t start = set.begin();
+
+        /* collect all subsets with unique geomIDs */
+        assert(items <= N);
+        unsigned int geomIDs[N];
+        unsigned int num_geomIDs = 1;
+        geomIDs[0] = prims[start].geomID();
+
+        for (size_t i=1;i<items;i++)
+        {
+          bool found = false;
+          const unsigned int new_geomID = prims[start+i].geomID();
+          for (size_t j=0;j<num_geomIDs;j++)
+            if (new_geomID == geomIDs[j])
+            { found = true; break; }
+          if (!found) 
+            geomIDs[num_geomIDs++] = new_geomID;
+        }
+
+        /* allocate all leaf memory in one single block */
+        SubGridQBVHN<N>* accel = (SubGridQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridQBVHN<N>),BVH::byteAlignment);
+        typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,num_geomIDs);
+
+        for (size_t g=0;g<num_geomIDs;g++)
+        {
+          unsigned int x[N];
+          unsigned int y[N];
+          unsigned int primID[N];
+          BBox3fa bounds[N];
+          unsigned int pos = 0;
+          for (size_t i=0;i<items;i++)
+          {
+            if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue;
+
+            const SubGridBuildData& sgrid_bd = sgrids[prims[start+i].primID()];
+            x[pos] = sgrid_bd.sx;
+            y[pos] = sgrid_bd.sy;
+            primID[pos] = sgrid_bd.primID;
+            bounds[pos] = prims[start+i].bounds();
+            pos++;
+          }
+          assert(pos <= N);
+          new (&accel[g]) SubGridQBVHN<N>(x,y,primID,bounds,geomIDs[g],pos);
+        }
+
+        return node;
+      }
+
+      BVH* bvh;
+      const SubGridBuildData * const sgrids;
+    };
+
+
+    template<int N>
+    struct BVHNBuilderSAHGrid : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      
+      BVH* bvh;
+      Scene* scene;
+      GridMesh* mesh;
+      mvector<PrimRef> prims;
+      mvector<SubGridBuildData> sgrids;
+      GeneralBVHBuilder::Settings settings;
+      const unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+
+      BVHNBuilderSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD) {}
+
+      BVHNBuilderSAHGrid (BVH* bvh, GridMesh* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), geomID_(geomID) {}
+
+      void build()
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+        
+        /* if we use the primrefarray for allocations we have to take it back from the BVH */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.unshare(prims);
+
+        const size_t numGridPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(GridMesh::geom_type,false);
+        numPreviousPrimitives = numGridPrimitives;
+
+
+        PrimInfo pinfo = mesh ? createPrimRefArrayGrids(mesh,prims,sgrids) : createPrimRefArrayGrids(scene,prims,sgrids);
+        const size_t numPrimitives = pinfo.size();
+        /* no primitives */
+        if (numPrimitives == 0) {
+          bvh->clear();
+          prims.clear();
+          sgrids.clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH");
+
+        /* create primref array */
+        settings.primrefarrayalloc = numPrimitives/1000;
+        if (settings.primrefarrayalloc < 1000)
+          settings.primrefarrayalloc = inf;
+
+        /* enable os_malloc for two level build */
+        if (mesh)
+          bvh->alloc.setOSallocation(true);
+
+        /* initialize allocator */
+        const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N);
+        const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+
+        /* pinfo might has zero size due to invalid geometry */
+        if (unlikely(pinfo.size() == 0))
+        {
+          bvh->clear();
+          sgrids.clear();
+          prims.clear();
+          return;
+        }
+
+        /* call BVH builder */
+        NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafGrid<N,SubGridQBVHN<N>>(bvh,sgrids.data()),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+        bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+        bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+        /* clear temporary array */
+        sgrids.clear();
+
+        /* if we allocated using the primrefarray we have to keep it alive */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.share(prims);
+
+        /* for static geometries we can do some cleanups */
+        else if (scene && scene->isStaticAccel()) {
+          prims.clear();
+        }
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() {
+        prims.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH4Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+    Builder* BVH4Triangle4SceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); }
+
+    Builder* BVH4QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+#if defined(__AVX__)
+    Builder* BVH8Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+    Builder* BVH8Triangle4SceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4vSceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); }
+    Builder* BVH8QuantizedTriangle4iSceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8QuantizedTriangle4SceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+    
+
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH4Quad4vMeshBuilderSAH     (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode)     { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4Quad4iMeshBuilderSAH     (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode)     { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4Quad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4Quad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); }
+    Builder* BVH4QuantizedQuad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4QuantizedQuad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+
+#if defined(__AVX__)
+    Builder* BVH8Quad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH8Quad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); }
+    Builder* BVH8QuantizedQuad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH8QuantizedQuad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH8Quad4vMeshBuilderSAH     (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode)     { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+
+    Builder* BVH4VirtualSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      int minLeafSize = scene->device->object_accel_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_max_leaf_size;
+      return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type);
+    }
+
+    Builder* BVH4VirtualMeshBuilderSAH    (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,UserGeometry::geom_type);
+    }
+#if defined(__AVX__)
+
+    Builder* BVH8VirtualSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      int minLeafSize = scene->device->object_accel_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_max_leaf_size;
+      return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type);
+    }
+
+    Builder* BVH8VirtualMeshBuilderSAH    (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,UserGeometry::geom_type);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); }
+    Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,gtype);
+    }
+#if defined(__AVX__)
+    Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); }
+    Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,gtype);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_GRID)
+    Builder* BVH4GridMeshBuilderSAH  (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,4,mode); }
+    Builder* BVH4GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode)   { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4,mode); } // FIXME: check whether cost factors are correct
+
+#if defined(__AVX__)
+    Builder* BVH8GridMeshBuilderSAH  (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,mesh,geomID,8,1.0f,8,8,mode); }
+    Builder* BVH8GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode)   { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8,mode); } // FIXME: check whether cost factors are correct
+#endif
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp
new file mode 100644
index 0000000000..d163a80ab1
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp
@@ -0,0 +1,705 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+#include "../builders/bvh_builder_msmblur.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+
+// FIXME: remove after removing BVHNBuilderMBlurRootTimeSplitsSAH
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+
+namespace embree
+{
+  namespace isa
+  {
+
+#if 0
+    template<int N, typename Primitive>
+    struct CreateMBlurLeaf
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB NodeRecordMB;
+
+      __forceinline CreateMBlurLeaf (BVH* bvh, PrimRef* prims, size_t time) : bvh(bvh), prims(prims), time(time) {}
+
+      __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t items = Primitive::blocks(set.size());
+        size_t start = set.begin();
+        for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical
+        Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+        NodeRef node = bvh->encodeLeaf((char*)accel,items);
+
+        LBBox3fa allBounds = empty;
+        for (size_t i=0; i<items; i++)
+          allBounds.extend(accel[i].fillMB(prims, start, set.end(), bvh->scene, time));
+
+        return NodeRecordMB(node,allBounds);
+      }
+
+      BVH* bvh;
+      PrimRef* prims;
+      size_t time;
+    };
+#endif
+
+    template<int N, typename Mesh, typename Primitive>
+    struct CreateMSMBlurLeaf
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB4D NodeRecordMB4D;
+
+      __forceinline CreateMSMBlurLeaf (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t items = Primitive::blocks(current.prims.size());
+        size_t start = current.prims.begin();
+        size_t end   = current.prims.end();
+        for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical
+        Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteNodeAlignment);
+        NodeRef node = bvh->encodeLeaf((char*)accel,items);
+        LBBox3fa allBounds = empty;
+        for (size_t i=0; i<items; i++)
+          allBounds.extend(accel[i].fillMB(current.prims.prims->data(), start, current.prims.end(), bvh->scene, current.prims.time_range));
+        return NodeRecordMB4D(node,allBounds,current.prims.time_range);
+      }
+
+      BVH* bvh;
+    };
+
+    /* Motion blur BVH with 4D nodes and internal time splits */
+    template<int N, typename Mesh, typename Primitive>
+    struct BVHNBuilderMBlurSAH : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      typedef typename BVHN<N>::NodeRecordMB NodeRecordMB;
+      typedef typename BVHN<N>::AABBNodeMB AABBNodeMB;
+
+      BVH* bvh;
+      Scene* scene;
+      const size_t sahBlockSize;
+      const float intCost;
+      const size_t minLeafSize;
+      const size_t maxLeafSize;
+      const Geometry::GTypeMask gtype_;
+
+      BVHNBuilderMBlurSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks)), gtype_(gtype) {}
+
+      void build()
+      {
+	/* skip build for empty scene */
+        const size_t numPrimitives = scene->getNumPrimitives(gtype_,true);
+        if (numPrimitives == 0) { bvh->clear(); return; }
+
+        double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAH");
+
+#if PROFILE
+        profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+
+            //const size_t numTimeSteps = scene->getNumTimeSteps<typename Mesh::type_t,true>();
+            //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1);
+
+            /*if (numTimeSegments == 1)
+              buildSingleSegment(numPrimitives);
+              else*/
+              buildMultiSegment(numPrimitives);
+
+#if PROFILE
+          });
+#endif
+
+	/* clear temporary data for static geometry */
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+#if 0 // No longer compatible when time_ranges are present for geometries. Would have to create temporal nodes sometimes, and put only a single geometry into leaf.
+      void buildSingleSegment(size_t numPrimitives)
+      {
+        /* create primref array */
+        mvector<PrimRef> prims(scene->device,numPrimitives);
+	const PrimInfo pinfo = createPrimRefArrayMBlur(scene,gtype_,numPrimitives,prims,bvh->scene->progressInterface,0);
+        /* early out if no valid primitives */
+        if (pinfo.size() == 0) { bvh->clear(); return; }
+        /* estimate acceleration structure size */
+        const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N);
+        const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+        /* settings for BVH build */
+        GeneralBVHBuilder::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxBuildDepthLeaf;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+        /* build hierarchy */
+        auto root = BVHBuilderBinnedSAH::build<NodeRecordMB>
+          (typename BVH::CreateAlloc(bvh),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::Set(),
+           CreateMBlurLeaf<N,Primitive>(bvh,prims.data(),0),bvh->scene->progressInterface,
+           prims.data(),pinfo,settings);
+
+        bvh->set(root.ref,root.lbounds,pinfo.size());
+      }
+#endif
+
+      void buildMultiSegment(size_t numPrimitives)
+      {
+        /* create primref array */
+        mvector<PrimRefMB> prims(scene->device,numPrimitives);
+	PrimInfoMB pinfo = createPrimRefArrayMSMBlur(scene,gtype_,numPrimitives,prims,bvh->scene->progressInterface);
+
+        /* early out if no valid primitives */
+        if (pinfo.size() == 0) { bvh->clear(); return; }
+
+        /* estimate acceleration structure size */
+        const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N);
+        const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(Primitive));
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+        /* settings for BVH build */
+        BVHBuilderMSMBlur::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxDepth;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleLeafTimeSegment = Primitive::singleTimeSegment;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+        
+        /* build hierarchy */
+        auto root =
+          BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device,
+                                            RecalculatePrimRef<Mesh>(scene),
+                                            typename BVH::CreateAlloc(bvh),
+                                            typename BVH::AABBNodeMB4D::Create(),
+                                            typename BVH::AABBNodeMB4D::Set(),
+                                            CreateMSMBlurLeaf<N,Mesh,Primitive>(bvh),
+                                            bvh->scene->progressInterface,
+                                            settings);
+
+        bvh->set(root.ref,root.lbounds,pinfo.num_time_segments);
+      }
+
+      void clear() {
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    struct GridRecalculatePrimRef
+    {
+      Scene* scene;
+      const SubGridBuildData * const sgrids;
+
+      __forceinline GridRecalculatePrimRef (Scene* scene, const SubGridBuildData * const sgrids)
+        : scene(scene), sgrids(sgrids) {}
+
+        __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+        {
+          const unsigned int geomID  = prim.geomID();
+          const GridMesh* mesh = scene->get<GridMesh>(geomID);
+          const unsigned int buildID = prim.primID();
+          const SubGridBuildData &subgrid = sgrids[buildID];                      
+          const unsigned int primID = subgrid.primID;
+          const size_t x = subgrid.x();
+          const size_t y = subgrid.y();
+          const LBBox3fa lbounds = mesh->linearBounds(mesh->grid(primID),x,y,time_range);
+          const unsigned num_time_segments = mesh->numTimeSegments();
+          const range<int> tbounds = mesh->timeSegmentRange(time_range);
+          return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, num_time_segments, geomID, buildID);
+        }
+
+        __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+          const unsigned int geomID  = prim.geomID();
+          const GridMesh* mesh = scene->get<GridMesh>(geomID);
+          const unsigned int buildID = prim.primID();
+          const SubGridBuildData &subgrid = sgrids[buildID];                      
+          const unsigned int primID = subgrid.primID;
+          const size_t x = subgrid.x();
+          const size_t y = subgrid.y();
+          return mesh->linearBounds(mesh->grid(primID),x,y,time_range);
+        }
+
+    };
+
+    template<int N>
+    struct CreateMSMBlurLeafGrid
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB4D NodeRecordMB4D;
+
+      __forceinline CreateMSMBlurLeafGrid (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) : scene(scene), bvh(bvh), sgrids(sgrids) {}
+
+      __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const size_t items = current.prims.size(); 
+        const size_t start = current.prims.begin();
+
+        const PrimRefMB* prims = current.prims.prims->data();
+        /* collect all subsets with unique geomIDs */
+        assert(items <= N);
+        unsigned int geomIDs[N];
+        unsigned int num_geomIDs = 1;
+        geomIDs[0] = prims[start].geomID();
+
+        for (size_t i=1;i<items;i++)
+        {
+          bool found = false;
+          const unsigned int new_geomID = prims[start+i].geomID();
+          for (size_t j=0;j<num_geomIDs;j++)
+            if (new_geomID == geomIDs[j])
+            { found = true; break; }
+          if (!found) 
+            geomIDs[num_geomIDs++] = new_geomID;
+        }
+
+        /* allocate all leaf memory in one single block */
+        SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment);
+        typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs);
+
+        LBBox3fa allBounds = empty;
+
+        for (size_t g=0;g<num_geomIDs;g++)
+        {
+          const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]);
+          unsigned int x[N];
+          unsigned int y[N];
+          unsigned int primID[N];
+          BBox3fa bounds0[N];
+          BBox3fa bounds1[N];
+          unsigned int pos = 0;
+          for (size_t i=0;i<items;i++)
+          {
+            if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue;
+
+            const SubGridBuildData  &sgrid_bd = sgrids[prims[start+i].primID()];                      
+            x[pos] = sgrid_bd.sx;
+            y[pos] = sgrid_bd.sy;
+            primID[pos] = sgrid_bd.primID;
+            const size_t x = sgrid_bd.x();
+            const size_t y = sgrid_bd.y();
+            LBBox3fa newBounds = mesh->linearBounds(mesh->grid(sgrid_bd.primID),x,y,current.prims.time_range);
+            allBounds.extend(newBounds);
+            bounds0[pos] = newBounds.bounds0;
+            bounds1[pos] = newBounds.bounds1;
+            pos++;
+          }
+          assert(pos <= N);
+          new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],current.prims.time_range.lower,1.0f/current.prims.time_range.size(),pos);
+        }
+        return NodeRecordMB4D(node,allBounds,current.prims.time_range);       
+      }
+
+      Scene *scene;
+      BVH* bvh;
+      const SubGridBuildData * const sgrids;
+    };
+
+#if 0
+    template<int N>
+    struct CreateLeafGridMB
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB NodeRecordMB;
+
+      __forceinline CreateLeafGridMB (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) 
+		  : scene(scene), bvh(bvh), sgrids(sgrids) {}
+
+      __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const size_t items = set.size(); 
+        const size_t start = set.begin();
+
+        /* collect all subsets with unique geomIDs */
+        assert(items <= N);
+        unsigned int geomIDs[N];
+        unsigned int num_geomIDs = 1;
+        geomIDs[0] = prims[start].geomID();
+
+        for (size_t i=1;i<items;i++)
+        {
+          bool found = false;
+          const unsigned int new_geomID = prims[start+i].geomID();
+          for (size_t j=0;j<num_geomIDs;j++)
+            if (new_geomID == geomIDs[j])
+            { found = true; break; }
+          if (!found) 
+            geomIDs[num_geomIDs++] = new_geomID;
+        }
+
+        /* allocate all leaf memory in one single block */
+        SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment);
+        typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs);
+
+        LBBox3fa allBounds = empty;
+
+        for (size_t g=0;g<num_geomIDs;g++)
+        {
+          const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]);
+
+          unsigned int x[N];
+          unsigned int y[N];
+          unsigned int primID[N];
+          BBox3fa bounds0[N];
+          BBox3fa bounds1[N];
+          unsigned int pos = 0;
+          for (size_t i=0;i<items;i++)
+          {
+            if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue;
+
+            const SubGridBuildData  &sgrid_bd = sgrids[prims[start+i].primID()];                      
+            x[pos] = sgrid_bd.sx;
+            y[pos] = sgrid_bd.sy;
+            primID[pos] = sgrid_bd.primID;
+            const size_t x = sgrid_bd.x();
+            const size_t y = sgrid_bd.y();
+            bool MAYBE_UNUSED valid0 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,0,bounds0[pos]);
+            bool MAYBE_UNUSED valid1 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,1,bounds1[pos]);
+            assert(valid0);
+            assert(valid1);
+            allBounds.extend(LBBox3fa(bounds0[pos],bounds1[pos]));
+            pos++;
+          }
+          new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],0.0f,1.0f,pos);
+        }
+        return NodeRecordMB(node,allBounds);
+      }
+
+      Scene *scene;
+      BVH* bvh;
+      const SubGridBuildData * const sgrids;
+    };
+#endif
+
+
+    /* Motion blur BVH with 4D nodes and internal time splits */
+    template<int N>
+    struct BVHNBuilderMBlurSAHGrid : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      typedef typename BVHN<N>::NodeRecordMB NodeRecordMB;
+      typedef typename BVHN<N>::AABBNodeMB AABBNodeMB;
+
+      BVH* bvh;
+      Scene* scene;
+      const size_t sahBlockSize;
+      const float intCost;
+      const size_t minLeafSize;
+      const size_t maxLeafSize;
+      mvector<SubGridBuildData> sgrids;
+
+
+      BVHNBuilderMBlurSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize)
+        : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,BVH::maxLeafBlocks)), sgrids(scene->device,0) {}
+
+
+      PrimInfo createPrimRefArrayMBlurGrid(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime)
+      {
+        /* first run to get #primitives */
+        ParallelForForPrefixSumState<PrimInfo> pstate;
+        Scene::Iterator<GridMesh,true> iter(scene);
+
+        pstate.init(iter,size_t(1024));
+
+        /* iterate over all meshes in the scene */
+        PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+            
+            PrimInfo pinfo(empty);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              if (!mesh->valid(j,range<size_t>(0,1))) continue;
+              BBox3fa bounds = empty;
+              const PrimRef prim(bounds,unsigned(geomID),unsigned(j));
+              pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+            }
+            return pinfo;
+          }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+        
+        size_t numPrimitives = pinfo.size();
+        if (numPrimitives == 0) return pinfo;
+
+        /* resize arrays */
+        sgrids.resize(numPrimitives); 
+        prims.resize(numPrimitives); 
+
+        /* second run to fill primrefs and SubGridBuildData arrays */
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+            
+            k = base.size();
+            size_t p_index = k;
+            PrimInfo pinfo(empty);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              const GridMesh::Grid &g = mesh->grid(j);
+              if (!mesh->valid(j,range<size_t>(0,1))) continue;
+              
+              for (unsigned int y=0; y<g.resY-1u; y+=2)
+                for (unsigned int x=0; x<g.resX-1u; x+=2)
+                {
+                  BBox3fa bounds = empty;
+                  if (!mesh->buildBounds(g,x,y,itime,bounds)) continue; // get bounds of subgrid
+                  const PrimRef prim(bounds,unsigned(geomID),unsigned(p_index));
+                  pinfo.add_center2(prim);
+                  sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                                                      prims[p_index++] = prim;                
+                }
+            }
+            return pinfo;
+          }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+        
+        assert(pinfo.size() == numPrimitives);
+        return pinfo;
+      }
+
+      PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f))
+      {
+        /* first run to get #primitives */
+        ParallelForForPrefixSumState<PrimInfoMB> pstate;
+        Scene::Iterator<GridMesh,true> iter(scene);
+
+        pstate.init(iter,size_t(1024));
+        /* iterate over all meshes in the scene */
+        PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t /*geomID*/) -> PrimInfoMB {
+            
+            PrimInfoMB pinfoMB(empty);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue;
+              LBBox3fa bounds(empty);
+              PrimInfoMB gridMB(0,mesh->getNumSubGrids(j));
+              pinfoMB.merge(gridMB);
+            }
+            return pinfoMB;
+          }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+        
+        size_t numPrimitives = pinfoMB.size();
+        if (numPrimitives == 0) return pinfoMB;
+
+        /* resize arrays */
+        sgrids.resize(numPrimitives); 
+        prims.resize(numPrimitives); 
+        /* second run to fill primrefs and SubGridBuildData arrays */
+        pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+            
+            k = base.size();
+            size_t p_index = k;
+            PrimInfoMB pinfoMB(empty);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue;
+              const GridMesh::Grid &g = mesh->grid(j);
+              
+              for (unsigned int y=0; y<g.resY-1u; y+=2)
+                for (unsigned int x=0; x<g.resX-1u; x+=2)
+                {
+                  const PrimRefMB prim(mesh->linearBounds(g,x,y,t0t1),mesh->numTimeSegments(),mesh->time_range,mesh->numTimeSegments(),unsigned(geomID),unsigned(p_index));
+                  pinfoMB.add_primref(prim);
+                  sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                  prims[p_index++] = prim;                
+                }
+            }
+            return pinfoMB;
+          }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+        
+        assert(pinfoMB.size() == numPrimitives);
+        pinfoMB.time_range = t0t1;
+        return pinfoMB;
+      }
+
+      void build()
+      {
+	/* skip build for empty scene */
+        const size_t numPrimitives = scene->getNumPrimitives(GridMesh::geom_type,true);
+        if (numPrimitives == 0) { bvh->clear(); return; }
+
+        double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAHGrid");
+
+        //const size_t numTimeSteps = scene->getNumTimeSteps<GridMesh,true>();
+        //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1);
+        //if (numTimeSegments == 1)
+        //  buildSingleSegment(numPrimitives);
+        //else
+        buildMultiSegment(numPrimitives);
+
+	/* clear temporary data for static geometry */
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+#if 0
+      void buildSingleSegment(size_t numPrimitives)
+      {
+        /* create primref array */
+        mvector<PrimRef> prims(scene->device,numPrimitives);
+        const PrimInfo pinfo = createPrimRefArrayMBlurGrid(scene,prims,bvh->scene->progressInterface,0);
+        /* early out if no valid primitives */
+        if (pinfo.size() == 0) { bvh->clear(); return; }
+
+        /* estimate acceleration structure size */
+        const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N);
+        //TODO: check leaf_bytes
+        const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+        /* settings for BVH build */
+        GeneralBVHBuilder::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxBuildDepthLeaf;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+        /* build hierarchy */
+        auto root = BVHBuilderBinnedSAH::build<NodeRecordMB>
+          (typename BVH::CreateAlloc(bvh),
+           typename BVH::AABBNodeMB::Create(),
+           typename BVH::AABBNodeMB::Set(),
+           CreateLeafGridMB<N>(scene,bvh,sgrids.data()),
+           bvh->scene->progressInterface,
+           prims.data(),pinfo,settings);
+
+        bvh->set(root.ref,root.lbounds,pinfo.size());
+      }
+#endif
+      
+      void buildMultiSegment(size_t numPrimitives)
+      {
+        /* create primref array */
+        mvector<PrimRefMB> prims(scene->device,numPrimitives);
+        PrimInfoMB pinfo = createPrimRefArrayMSMBlurGrid(scene,prims,bvh->scene->progressInterface);
+
+        /* early out if no valid primitives */
+        if (pinfo.size() == 0) { bvh->clear(); return; }
+
+
+
+        GridRecalculatePrimRef recalculatePrimRef(scene,sgrids.data());
+
+        /* estimate acceleration structure size */
+        const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N);
+        //FIXME: check leaf_bytes
+        //const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(SubGridQBVHN<N>));
+        const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+        /* settings for BVH build */
+        BVHBuilderMSMBlur::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxDepth;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleLeafTimeSegment = false; 
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+        
+        /* build hierarchy */
+        auto root =
+          BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device,
+                                            recalculatePrimRef,
+                                            typename BVH::CreateAlloc(bvh),
+                                            typename BVH::AABBNodeMB4D::Create(),
+                                            typename BVH::AABBNodeMB4D::Set(),
+                                            CreateMSMBlurLeafGrid<N>(scene,bvh,sgrids.data()),
+                                            bvh->scene->progressInterface,
+                                            settings);
+        bvh->set(root.ref,root.lbounds,pinfo.num_time_segments);
+      }
+
+      void clear() {
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH4Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+    Builder* BVH4Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4vMB>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+#if defined(__AVX__)
+    Builder* BVH8Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+    Builder* BVH8Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4vMB>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH4Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,QuadMesh,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); }
+#if defined(__AVX__)
+    Builder* BVH8Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,QuadMesh,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH4VirtualMBSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      int minLeafSize = scene->device->object_accel_mb_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_mb_max_leaf_size;
+      return new BVHNBuilderMBlurSAH<4,UserGeometry,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY);
+    }
+#if defined(__AVX__)
+    Builder* BVH8VirtualMBSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      int minLeafSize = scene->device->object_accel_mb_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_mb_max_leaf_size;
+      return new BVHNBuilderMBlurSAH<8,UserGeometry,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH4InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); }
+#if defined(__AVX__)
+    Builder* BVH8InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_GRID)
+    Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4); }
+#if defined(__AVX__)
+    Builder* BVH8GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8); }
+#endif
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah_spatial.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah_spatial.cpp
new file mode 100644
index 0000000000..a4e55d7484
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah_spatial.cpp
@@ -0,0 +1,201 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/primrefgen_presplit.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Primitive>
+    struct CreateLeafSpatial
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeafSpatial (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t n = set.size();
+        size_t items = Primitive::blocks(n);
+        size_t start = set.begin();
+        Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+        typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items);
+        for (size_t i=0; i<items; i++) {
+          accel[i].fill(prims,start,set.end(),bvh->scene);
+        }
+        return node;
+      }
+
+      BVH* bvh;
+    };
+
+    template<int N, typename Mesh, typename Primitive, typename Splitter>
+    struct BVHNBuilderFastSpatialSAH : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      BVH* bvh;
+      Scene* scene;
+      Mesh* mesh;
+      mvector<PrimRef> prims0;
+      GeneralBVHBuilder::Settings settings;
+      const float splitFactor;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+
+      BVHNBuilderFastSpatialSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims0(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD),
+          splitFactor(scene->device->max_spatial_split_replications) {}
+
+      BVHNBuilderFastSpatialSAH (BVH* bvh, Mesh* mesh, const unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims0(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD),
+          splitFactor(scene->device->max_spatial_split_replications), geomID_(geomID) {}
+
+      // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+      void build()
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+	/* skip build for empty scene */
+        const size_t numOriginalPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(Mesh::geom_type,false);
+        numPreviousPrimitives = numOriginalPrimitives;
+        if (numOriginalPrimitives == 0) {
+          prims0.clear();
+          bvh->clear();
+          return;
+        }
+
+        const unsigned int maxGeomID = mesh ? geomID_ : scene->getMaxGeomID<Mesh,false>();
+        const bool usePreSplits = scene->device->useSpatialPreSplits || (maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)));
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + (usePreSplits ? "BuilderFastSpatialPresplitSAH" : "BuilderFastSpatialSAH"));
+
+        /* create primref array */
+        const size_t numSplitPrimitives = max(numOriginalPrimitives,size_t(splitFactor*numOriginalPrimitives));
+        prims0.resize(numSplitPrimitives);
+
+        /* enable os_malloc for two level build */
+        if (mesh)
+          bvh->alloc.setOSallocation(true);
+	
+	NodeRef root(0);
+	PrimInfo pinfo;
+	
+
+        if (likely(usePreSplits))
+	  {		     
+            /* spatial presplit SAH BVH builder */
+	    pinfo = mesh ?
+	      createPrimRefArray_presplit<Mesh,Splitter>(mesh,maxGeomID,numOriginalPrimitives,prims0,bvh->scene->progressInterface) :
+	      createPrimRefArray_presplit<Mesh,Splitter>(scene,Mesh::geom_type,false,numOriginalPrimitives,prims0,bvh->scene->progressInterface);
+
+	    const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N);
+	    const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+	    bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+	    settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+	    settings.branchingFactor = N;
+	    settings.maxDepth = BVH::maxBuildDepthLeaf;
+
+	    /* call BVH builder */
+	    root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafSpatial<N,Primitive>(bvh),bvh->scene->progressInterface,prims0.data(),pinfo,settings);
+	  }
+	else
+	  {
+            /* standard spatial split SAH BVH builder */
+	    pinfo = mesh ?
+	      createPrimRefArray(mesh,geomID_,numSplitPrimitives,prims0,bvh->scene->progressInterface) :
+	      createPrimRefArray(scene,Mesh::geom_type,false,numSplitPrimitives,prims0,bvh->scene->progressInterface);
+	
+	    Splitter splitter(scene);
+
+	    const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N);
+	    const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+	    bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+	    settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+	    settings.branchingFactor = N;
+	    settings.maxDepth = BVH::maxBuildDepthLeaf;
+
+	    /* call BVH builder */
+	    root = BVHBuilderBinnedFastSpatialSAH::build<NodeRef>(
+								  typename BVH::CreateAlloc(bvh),
+								  typename BVH::AABBNode::Create2(),
+								  typename BVH::AABBNode::Set2(),
+								  CreateLeafSpatial<N,Primitive>(bvh),
+								  splitter,
+								  bvh->scene->progressInterface,
+								  prims0.data(),
+								  numSplitPrimitives,
+								  pinfo,settings);
+
+	    /* ==================== */
+	  }
+
+        bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+        bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+	/* clear temporary data for static geometry */
+	if (scene && scene->isStaticAccel()) {
+          prims0.clear();
+        }
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() {
+        prims0.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+
+    Builder* BVH4Triangle4SceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+    Builder* BVH4Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+    Builder* BVH4Triangle4iSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4i,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+
+#if defined(__AVX__)
+    Builder* BVH8Triangle4SceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); }
+    Builder* BVH8Triangle4vSceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH4Quad4vSceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,QuadMesh,Quad4v,QuadSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+
+#if defined(__AVX__)
+    Builder* BVH8Quad4vSceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,QuadMesh,Quad4v,QuadSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); }
+#endif
+
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp
new file mode 100644
index 0000000000..5d45ed3748
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp
@@ -0,0 +1,369 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_builder_twolevel.h"
+#include "bvh_statistics.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../common/scene_line_segments.h"
+#include "../common/scene_triangle_mesh.h"
+#include "../common/scene_quad_mesh.h"
+
+#define PROFILE 0
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Mesh, typename Primitive>
+    BVHNBuilderTwoLevel<N,Mesh,Primitive>::BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder, const size_t singleThreadThreshold)
+      : bvh(bvh), scene(scene), refs(scene->device,0), prims(scene->device,0), singleThreadThreshold(singleThreadThreshold), gtype(gtype), useMortonBuilder_(useMortonBuilder) {}
+    
+    template<int N, typename Mesh, typename Primitive>
+    BVHNBuilderTwoLevel<N,Mesh,Primitive>::~BVHNBuilderTwoLevel () {
+    }
+
+    // ===========================================================================
+    // ===========================================================================
+    // ===========================================================================
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::build()
+    {
+      /* delete some objects */
+      size_t num = scene->size();
+      if (num < bvh->objects.size()) {
+        parallel_for(num, bvh->objects.size(), [&] (const range<size_t>& r) {
+            for (size_t i=r.begin(); i<r.end(); i++) {
+              builders[i].reset();
+              delete bvh->objects[i]; bvh->objects[i] = nullptr;
+            }
+          });
+      }
+      
+#if PROFILE
+      while(1) 
+#endif
+      {
+      /* reset memory allocator */
+      bvh->alloc.reset();
+      
+      /* skip build for empty scene */
+      const size_t numPrimitives = scene->getNumPrimitives(gtype,false);
+
+      if (numPrimitives == 0) {
+        prims.resize(0);
+        bvh->set(BVH::emptyNode,empty,0);
+        return;
+      }
+
+      /* calculate the size of the entire BVH */
+      const size_t numLeafBlocks = Primitive::blocks(numPrimitives);
+      const size_t node_bytes = 2*numLeafBlocks*sizeof(typename BVH::AABBNode)/N;
+      const size_t leaf_bytes = size_t(1.2*numLeafBlocks*sizeof(Primitive));
+      bvh->alloc.init_estimate(node_bytes+leaf_bytes); 
+
+      double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderTwoLevel");
+
+      /* resize object array if scene got larger */
+      if (bvh->objects.size()  < num) bvh->objects.resize(num);
+      if (builders.size() < num) builders.resize(num);
+      resizeRefsList ();
+      nextRef.store(0);
+      
+      /* create acceleration structures */
+      parallel_for(size_t(0), num, [&] (const range<size_t>& r)
+      {
+        for (size_t objectID=r.begin(); objectID<r.end(); objectID++)
+        {
+          Mesh* mesh = scene->getSafe<Mesh>(objectID);
+      
+          /* ignore meshes we do not support */
+          if (mesh == nullptr || mesh->numTimeSteps != 1)
+            continue;
+          
+          if (isSmallGeometry(mesh)) {
+             setupSmallBuildRefBuilder (objectID, mesh);
+          } else {
+            setupLargeBuildRefBuilder (objectID, mesh);
+          }
+        }
+      });
+
+      /* parallel build of acceleration structures */
+      parallel_for(size_t(0), num, [&] (const range<size_t>& r)
+      {
+        for (size_t objectID=r.begin(); objectID<r.end(); objectID++)
+        {
+          /* ignore if no triangle mesh or not enabled */
+          Mesh* mesh = scene->getSafe<Mesh>(objectID);
+          if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1) 
+            continue;
+
+          builders[objectID]->attachBuildRefs (this);
+        }
+      });
+
+
+#if PROFILE
+      double d0 = getSeconds();
+#endif
+      /* fast path for single geometry scenes */
+      if (nextRef == 1) { 
+        bvh->set(refs[0].node,LBBox3fa(refs[0].bounds()),numPrimitives);
+      }
+
+      else
+      {     
+        /* open all large nodes */
+        refs.resize(nextRef);
+
+        /* this probably needs some more tuning */
+        const size_t extSize = max(max((size_t)SPLIT_MIN_EXT_SPACE,refs.size()*SPLIT_MEMORY_RESERVE_SCALE),size_t((float)numPrimitives / SPLIT_MEMORY_RESERVE_FACTOR));
+ 
+#if !ENABLE_DIRECT_SAH_MERGE_BUILDER
+
+#if ENABLE_OPEN_SEQUENTIAL
+        open_sequential(extSize); 
+#endif
+        /* compute PrimRefs */
+        prims.resize(refs.size());
+#endif
+        
+        {
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+
+          const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(),  PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo {
+
+              PrimInfo pinfo(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                pinfo.add_center2(refs[i]);
+              }
+              return pinfo;
+            }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+          
+#else
+          const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(),  PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo {
+
+              PrimInfo pinfo(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                pinfo.add_center2(refs[i]);
+                prims[i] = PrimRef(refs[i].bounds(),(size_t)refs[i].node);
+              }
+              return pinfo;
+            }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+#endif   
+       
+          /* skip if all objects where empty */
+          if (pinfo.size() == 0)
+            bvh->set(BVH::emptyNode,empty,0);
+        
+          /* otherwise build toplevel hierarchy */
+          else
+          {
+            /* settings for BVH build */
+            GeneralBVHBuilder::Settings settings;
+            settings.branchingFactor = N;
+            settings.maxDepth = BVH::maxBuildDepthLeaf;
+            settings.logBlockSize = bsr(N);
+            settings.minLeafSize = 1;
+            settings.maxLeafSize = 1;
+            settings.travCost = 1.0f;
+            settings.intCost = 1.0f;
+            settings.singleThreadThreshold = singleThreadThreshold;
+      
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+            
+            refs.resize(extSize); 
+         
+            NodeRef root = BVHBuilderBinnedOpenMergeSAH::build<NodeRef,BuildRef>(
+              typename BVH::CreateAlloc(bvh),
+              typename BVH::AABBNode::Create2(),
+              typename BVH::AABBNode::Set2(),
+              
+              [&] (const BuildRef* refs, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef  {
+                assert(range.size() == 1);
+                return (NodeRef) refs[range.begin()].node;
+              },
+              [&] (BuildRef &bref, BuildRef *refs) -> size_t { 
+                return openBuildRef(bref,refs);
+              },              
+              [&] (size_t dn) { bvh->scene->progressMonitor(0); },
+              refs.data(),extSize,pinfo,settings);
+#else
+            NodeRef root = BVHBuilderBinnedSAH::build<NodeRef>(
+              typename BVH::CreateAlloc(bvh),
+              typename BVH::AABBNode::Create2(),
+              typename BVH::AABBNode::Set2(),
+              
+              [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef {
+                assert(range.size() == 1);
+                return (NodeRef) prims[range.begin()].ID();
+              },
+              [&] (size_t dn) { bvh->scene->progressMonitor(0); },
+              prims.data(),pinfo,settings);
+#endif
+
+            
+            bvh->set(root,LBBox3fa(pinfo.geomBounds),numPrimitives);
+          }
+        }
+      }  
+        
+      bvh->alloc.cleanup();
+      bvh->postBuild(t0);
+#if PROFILE
+      double d1 = getSeconds();
+      std::cout << "TOP_LEVEL OPENING/REBUILD TIME " << 1000.0*(d1-d0) << " ms" << std::endl;
+#endif
+      }
+
+    }
+    
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::deleteGeometry(size_t geomID)
+    {
+      if (geomID >= bvh->objects.size()) return;
+      if (builders[geomID]) builders[geomID].reset();
+      delete bvh->objects [geomID]; bvh->objects [geomID] = nullptr;
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::clear()
+    {
+      for (size_t i=0; i<bvh->objects.size(); i++) 
+        if (bvh->objects[i]) bvh->objects[i]->clear();
+
+      for (size_t i=0; i<builders.size(); i++) 
+        if (builders[i]) builders[i].reset();
+
+      refs.clear();
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::open_sequential(const size_t extSize)
+    {
+      if (refs.size() == 0)
+	return;
+
+      refs.reserve(extSize);
+
+#if 1
+      for (size_t i=0;i<refs.size();i++)
+      {
+        NodeRef ref = refs[i].node;
+        if (ref.isAABBNode())
+          BVH::prefetch(ref);
+      }
+#endif
+
+      std::make_heap(refs.begin(),refs.end());
+      while (refs.size()+N-1 <= extSize)
+      {
+        std::pop_heap (refs.begin(),refs.end()); 
+        NodeRef ref = refs.back().node;
+        if (ref.isLeaf()) break;
+        refs.pop_back();    
+        
+        AABBNode* node = ref.getAABBNode();
+        for (size_t i=0; i<N; i++) {
+          if (node->child(i) == BVH::emptyNode) continue;
+          refs.push_back(BuildRef(node->bounds(i),node->child(i)));
+         
+#if 1
+          NodeRef ref_pre = node->child(i);
+          if (ref_pre.isAABBNode())
+            ref_pre.prefetch();
+#endif
+          std::push_heap (refs.begin(),refs.end()); 
+        }
+      }
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupSmallBuildRefBuilder (size_t objectID, Mesh const * const /*mesh*/)
+    {
+      if (builders[objectID] == nullptr ||                                         // new mesh
+          dynamic_cast<RefBuilderSmall*>(builders[objectID].get()) == nullptr)     // size change resulted in large->small change
+      {
+        builders[objectID].reset (new RefBuilderSmall(objectID));
+      }
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh)
+    {
+      if (bvh->objects[objectID] == nullptr ||                                  // new mesh
+          builders[objectID]->meshQualityChanged (mesh->quality) ||             // changed build quality
+          dynamic_cast<RefBuilderLarge*>(builders[objectID].get()) == nullptr)  // size change resulted in small->large change
+      {
+        Builder* builder = nullptr;
+        delete bvh->objects[objectID]; 
+        createMeshAccel(objectID, builder);
+        builders[objectID].reset (new RefBuilderLarge(objectID, builder, mesh->quality));
+      }
+    }
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH4BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH4BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4v>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH4BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH4BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+    return new BVHNBuilderTwoLevel<4,QuadMesh,Quad4v>((BVH4*)bvh,scene,QuadMesh::geom_type,useMortonBuilder);
+    }
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH4BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+    return new BVHNBuilderTwoLevel<4,UserGeometry,Object>((BVH4*)bvh,scene,UserGeometry::geom_type,useMortonBuilder);
+    }
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH4BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder);
+    }
+#endif
+
+#if defined(__AVX__)
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH8BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4v>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH8BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH8BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,QuadMesh,Quad4v>((BVH8*)bvh,scene,QuadMesh::geom_type,useMortonBuilder);
+    }
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH8BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,UserGeometry,Object>((BVH8*)bvh,scene,UserGeometry::geom_type,useMortonBuilder);
+    }
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH8BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder);
+    }
+#endif
+
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h
new file mode 100644
index 0000000000..dc7ec7d278
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h
@@ -0,0 +1,263 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <type_traits>
+
+#include "bvh_builder_twolevel_internal.h"
+#include "bvh.h"
+#include "../common/primref.h"
+#include "../builders/priminfo.h"
+#include "../builders/primrefgen.h"
+
+/* new open/merge builder */
+#define ENABLE_DIRECT_SAH_MERGE_BUILDER 1
+#define ENABLE_OPEN_SEQUENTIAL 0
+#define SPLIT_MEMORY_RESERVE_FACTOR 1000
+#define SPLIT_MEMORY_RESERVE_SCALE 2
+#define SPLIT_MIN_EXT_SPACE 1000
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Mesh, typename Primitive>
+    class BVHNBuilderTwoLevel : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline static bool isSmallGeometry(Mesh* mesh) {
+        return mesh->size() <= 4;
+      }
+
+    public:
+
+      typedef void (*createMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+
+      struct BuildRef : public PrimRef
+      {
+      public:
+        __forceinline BuildRef () {}
+
+        __forceinline BuildRef (const BBox3fa& bounds, NodeRef node)
+          : PrimRef(bounds,(size_t)node), node(node)
+        {
+          if (node.isLeaf())
+            bounds_area = 0.0f;
+          else
+            bounds_area = area(this->bounds());
+        }
+
+        /* used by the open/merge bvh builder */
+        __forceinline BuildRef (const BBox3fa& bounds, NodeRef node, const unsigned int geomID, const unsigned int numPrimitives)
+          : PrimRef(bounds,geomID,numPrimitives), node(node)
+        {
+          /* important for relative buildref ordering */
+          if (node.isLeaf())
+            bounds_area = 0.0f;
+          else
+            bounds_area = area(this->bounds());
+        }
+
+        __forceinline size_t size() const {
+          return primID();
+        }
+
+        friend bool operator< (const BuildRef& a, const BuildRef& b) {
+          return a.bounds_area < b.bounds_area;
+        }
+
+        friend __forceinline embree_ostream operator<<(embree_ostream cout, const BuildRef& ref) {
+          return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", center2 = " << ref.center2() << ", geomID = " << ref.geomID() << ", numPrimitives = " << ref.numPrimitives() << ", bounds_area = " << ref.bounds_area << " }";
+        }
+
+        __forceinline unsigned int numPrimitives() const { return primID(); }
+
+      public:
+        NodeRef node;
+        float bounds_area;
+      };
+
+
+      __forceinline size_t openBuildRef(BuildRef &bref, BuildRef *const refs) {
+        if (bref.node.isLeaf())
+        {
+          refs[0] = bref;
+          return 1;
+        }
+        NodeRef ref = bref.node;
+        unsigned int geomID   = bref.geomID();
+        unsigned int numPrims = max((unsigned int)bref.numPrimitives() / N,(unsigned int)1);
+        AABBNode* node = ref.getAABBNode();
+        size_t n = 0;
+        for (size_t i=0; i<N; i++) {
+          if (node->child(i) == BVH::emptyNode) continue;
+          refs[i] = BuildRef(node->bounds(i),node->child(i),geomID,numPrims);
+          n++;
+        }
+        assert(n > 1);
+        return n;        
+      }
+      
+      /*! Constructor. */
+      BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype = Mesh::geom_type, bool useMortonBuilder = false, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD);
+      
+      /*! Destructor */
+      ~BVHNBuilderTwoLevel ();
+      
+      /*! builder entry point */
+      void build();
+      void deleteGeometry(size_t geomID);
+      void clear();
+
+      void open_sequential(const size_t extSize);
+      
+    private:
+
+      class RefBuilderBase {
+      public:
+        virtual ~RefBuilderBase () {}
+        virtual void attachBuildRefs (BVHNBuilderTwoLevel* builder) = 0;
+        virtual bool meshQualityChanged (RTCBuildQuality currQuality) = 0;
+      };
+
+      class RefBuilderSmall : public RefBuilderBase {
+      public:
+
+        RefBuilderSmall (size_t objectID)
+          : objectID_ (objectID) {}
+
+        void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) {
+
+          Mesh* mesh = topBuilder->scene->template getSafe<Mesh>(objectID_);
+          size_t meshSize = mesh->size();
+          assert(isSmallGeometry(mesh));
+          
+          mvector<PrimRef> prefs(topBuilder->scene->device, meshSize);
+          auto pinfo = createPrimRefArray(mesh,objectID_,meshSize,prefs,topBuilder->bvh->scene->progressInterface);
+
+          size_t begin=0;
+          while (begin < pinfo.size())
+          {
+            Primitive* accel = (Primitive*) topBuilder->bvh->alloc.getCachedAllocator().malloc1(sizeof(Primitive),BVH::byteAlignment);
+            typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,1);
+            accel->fill(prefs.data(),begin,pinfo.size(),topBuilder->bvh->scene);
+            
+            /* create build primitive */
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node,(unsigned int)objectID_,1);
+#else
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node);
+#endif
+          }
+          assert(begin == pinfo.size());
+        }
+
+        bool meshQualityChanged (RTCBuildQuality /*currQuality*/) {
+          return false;
+        }
+        
+        size_t  objectID_;
+      };
+
+      class RefBuilderLarge : public RefBuilderBase {
+      public:
+        
+        RefBuilderLarge (size_t objectID, const Ref<Builder>& builder, RTCBuildQuality quality)
+        : objectID_ (objectID), builder_ (builder), quality_ (quality) {}
+
+        void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder)
+        {
+          BVH* object  = topBuilder->getBVH(objectID_); assert(object);
+          
+          /* build object if it got modified */
+          if (topBuilder->isGeometryModified(objectID_))
+            builder_->build();
+
+          /* create build primitive */
+          if (!object->getBounds().empty())
+          {
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+            Mesh* mesh = topBuilder->getMesh(objectID_);
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root,(unsigned int)objectID_,(unsigned int)mesh->size());
+#else
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root);
+#endif
+          }
+        }
+
+        bool meshQualityChanged (RTCBuildQuality currQuality) {
+          return currQuality != quality_;
+        }
+
+      private:
+        size_t          objectID_;
+        Ref<Builder>    builder_;
+        RTCBuildQuality quality_;
+      };
+
+      void setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh);
+      void setupSmallBuildRefBuilder (size_t objectID, Mesh const * const mesh);
+
+      BVH*  getBVH (size_t objectID) {
+        return this->bvh->objects[objectID];
+      }
+      Mesh* getMesh (size_t objectID) {
+        return this->scene->template getSafe<Mesh>(objectID);
+      }
+      bool  isGeometryModified (size_t objectID) {
+        return this->scene->isGeometryModified(objectID);
+      }
+
+      void resizeRefsList ()
+      {
+        size_t num = parallel_reduce (size_t(0), scene->size(), size_t(0), 
+          [this](const range<size_t>& r)->size_t {
+            size_t c = 0;
+            for (auto i=r.begin(); i<r.end(); ++i) {
+              Mesh* mesh = scene->getSafe<Mesh>(i);
+              if (mesh == nullptr || mesh->numTimeSteps != 1)
+                continue;
+              size_t meshSize = mesh->size();
+              c += isSmallGeometry(mesh) ? Primitive::blocks(meshSize) : 1;
+            }
+            return c;
+          },
+          std::plus<size_t>()
+        );
+
+        if (refs.size() < num) {
+          refs.resize(num);
+        }
+      }
+
+      void createMeshAccel (size_t geomID, Builder*& builder)
+      {
+        bvh->objects[geomID] = new BVH(Primitive::type,scene);
+        BVH* accel = bvh->objects[geomID];
+        auto mesh = scene->getSafe<Mesh>(geomID);
+        if (nullptr == mesh) {
+          throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"geomID does not return correct type");
+          return;
+        }
+
+        __internal_two_level_builder__::MeshBuilder<N,Mesh,Primitive>()(accel, mesh, geomID, this->gtype, this->useMortonBuilder_, builder);
+      }      
+
+      using BuilderList = std::vector<std::unique_ptr<RefBuilderBase>>;
+
+      BuilderList         builders;
+      BVH*                bvh;
+      Scene*              scene;      
+      mvector<BuildRef>   refs;
+      mvector<PrimRef>    prims;
+      std::atomic<int>    nextRef;
+      const size_t        singleThreadThreshold;
+      Geometry::GTypeMask gtype;
+      bool                useMortonBuilder_ = false;
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h
new file mode 100644
index 0000000000..023b52b780
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h
@@ -0,0 +1,267 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+namespace embree
+{
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t)
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); 
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) 
+  
+  namespace isa
+  {
+
+    namespace __internal_two_level_builder__ {
+
+      template<int N, typename Mesh, typename Primitive>
+      struct MortonBuilder {};
+      template<>
+      struct MortonBuilder<4,TriangleMesh,Triangle4> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,TriangleMesh,Triangle4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,TriangleMesh,Triangle4i> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,QuadMesh,Quad4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,UserGeometry,Object> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,Instance,InstancePrimitive> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,TriangleMesh,Triangle4> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,TriangleMesh,Triangle4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,TriangleMesh,Triangle4i> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,QuadMesh,Quad4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,UserGeometry,Object> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,Instance,InstancePrimitive> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
+      };
+
+      template<int N, typename Mesh, typename Primitive>
+      struct SAHBuilder {};
+      template<>
+      struct SAHBuilder<4,TriangleMesh,Triangle4> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,TriangleMesh,Triangle4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,TriangleMesh,Triangle4i> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,QuadMesh,Quad4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,UserGeometry,Object> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,Instance,InstancePrimitive> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,TriangleMesh,Triangle4> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,TriangleMesh,Triangle4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,TriangleMesh,Triangle4i> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,QuadMesh,Quad4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,UserGeometry,Object> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,Instance,InstancePrimitive> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
+      };
+
+      template<int N, typename Mesh, typename Primitive>
+      struct RefitBuilder {};
+      template<>
+      struct RefitBuilder<4,TriangleMesh,Triangle4> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,TriangleMesh,Triangle4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,TriangleMesh,Triangle4i> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,QuadMesh,Quad4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,UserGeometry,Object> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,Instance,InstancePrimitive> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,TriangleMesh,Triangle4> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,TriangleMesh,Triangle4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,TriangleMesh,Triangle4i> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,QuadMesh,Quad4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,UserGeometry,Object> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,Instance,InstancePrimitive> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
+      };
+      
+      template<int N, typename Mesh, typename Primitive>
+      struct MeshBuilder {
+        MeshBuilder () {}
+        void operator () (void* bvh, Mesh* mesh, size_t geomID, Geometry::GTypeMask gtype, bool useMortonBuilder, Builder*& builder) {
+          if(useMortonBuilder) {
+            builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype);
+            return;
+          }
+          switch (mesh->quality) {
+            case RTC_BUILD_QUALITY_LOW:    builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break;
+            case RTC_BUILD_QUALITY_MEDIUM:
+            case RTC_BUILD_QUALITY_HIGH:   builder = SAHBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break;
+            case RTC_BUILD_QUALITY_REFIT:  builder = RefitBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break;
+            default: throw_RTCError(RTC_ERROR_UNKNOWN,"invalid build quality");
+          }
+        }
+      };
+    }
+  }
+}
+\ No newline at end of file
diff --git a/thirdparty/embree/kernels/bvh/bvh_collider.cpp b/thirdparty/embree/kernels/bvh/bvh_collider.cpp
new file mode 100644
index 0000000000..9428c0b88e
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_collider.cpp
@@ -0,0 +1,375 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_collider.h"
+#include "../geometry/triangle_triangle_intersector.h"
+
+namespace embree
+{ 
+  namespace isa
+  {
+#define CSTAT(x)
+
+    size_t parallel_depth_threshold = 3;
+    CSTAT(std::atomic<size_t> bvh_collide_traversal_steps(0));
+    CSTAT(std::atomic<size_t> bvh_collide_leaf_pairs(0));
+    CSTAT(std::atomic<size_t> bvh_collide_leaf_iterations(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections1(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections2(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections3(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections4(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections5(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections(0));
+
+    struct Collision
+    {
+      __forceinline Collision() {}
+
+      __forceinline Collision (unsigned geomID0, unsigned primID0, unsigned geomID1, unsigned primID1)
+        : geomID0(geomID0), primID0(primID0), geomID1(geomID1), primID1(primID1) {}
+
+      unsigned geomID0;
+      unsigned primID0;
+      unsigned geomID1;
+      unsigned primID1;
+    };
+    
+    template<int N>
+    __forceinline size_t overlap(const BBox3fa& box0, const typename BVHN<N>::AABBNode& node1)
+    {
+      const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),node1.lower_x);
+      const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),node1.lower_y);
+      const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),node1.lower_z);
+      const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),node1.upper_x);
+      const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),node1.upper_y);
+      const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),node1.upper_z);
+      return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z));
+    }
+
+    template<int N>
+    __forceinline size_t overlap(const BBox3fa& box0, const BBox<Vec3<vfloat<N>>>& box1)
+    {
+      const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),box1.lower.x);
+      const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),box1.lower.y);
+      const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),box1.lower.z);
+      const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),box1.upper.x);
+      const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),box1.upper.y);
+      const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),box1.upper.z);
+      return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z));
+    }
+
+    template<int N>
+    __forceinline size_t overlap(const BBox<Vec3<vfloat<N>>>& box0, size_t i, const BBox<Vec3<vfloat<N>>>& box1)
+    {
+      const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x[i]),box1.lower.x);
+      const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y[i]),box1.lower.y);
+      const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z[i]),box1.lower.z);
+      const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x[i]),box1.upper.x);
+      const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y[i]),box1.upper.y);
+      const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z[i]),box1.upper.z);
+      return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z));
+    }
+
+    bool intersect_triangle_triangle (Scene* scene0, unsigned geomID0, unsigned primID0, Scene* scene1, unsigned geomID1, unsigned primID1)
+    {
+      CSTAT(bvh_collide_prim_intersections1++);
+      const TriangleMesh* mesh0 = scene0->get<TriangleMesh>(geomID0);
+      const TriangleMesh* mesh1 = scene1->get<TriangleMesh>(geomID1);
+      const TriangleMesh::Triangle& tri0 = mesh0->triangle(primID0);
+      const TriangleMesh::Triangle& tri1 = mesh1->triangle(primID1);
+      
+      /* special culling for scene intersection with itself */
+      if (scene0 == scene1 && geomID0 == geomID1)
+      {
+        /* ignore self intersections */
+        if (primID0 == primID1)
+          return false;
+      }
+      CSTAT(bvh_collide_prim_intersections2++);
+      
+      if (scene0 == scene1 && geomID0 == geomID1)
+      {
+        /* ignore intersection with topological neighbors */
+        const vint4 t0(tri0.v[0],tri0.v[1],tri0.v[2],tri0.v[2]);
+        if (any(vint4(tri1.v[0]) == t0)) return false;
+        if (any(vint4(tri1.v[1]) == t0)) return false;
+        if (any(vint4(tri1.v[2]) == t0)) return false;
+      }
+      CSTAT(bvh_collide_prim_intersections3++);
+      
+      const Vec3fa a0 = mesh0->vertex(tri0.v[0]);
+      const Vec3fa a1 = mesh0->vertex(tri0.v[1]);
+      const Vec3fa a2 = mesh0->vertex(tri0.v[2]);
+      const Vec3fa b0 = mesh1->vertex(tri1.v[0]);
+      const Vec3fa b1 = mesh1->vertex(tri1.v[1]);
+      const Vec3fa b2 = mesh1->vertex(tri1.v[2]);
+      
+      return TriangleTriangleIntersector::intersect_triangle_triangle(a0,a1,a2,b0,b1,b2);
+    }
+    
+    template<int N>
+    __forceinline void BVHNColliderUserGeom<N>::processLeaf(NodeRef node0, NodeRef node1)
+    {
+      Collision collisions[16];
+      size_t num_collisions = 0;
+
+      size_t N0; Object* leaf0 = (Object*) node0.leaf(N0);
+      size_t N1; Object* leaf1 = (Object*) node1.leaf(N1);
+      for (size_t i=0; i<N0; i++) {
+        for (size_t j=0; j<N1; j++) {
+          const unsigned geomID0 = leaf0[i].geomID();
+          const unsigned primID0 = leaf0[i].primID();
+          const unsigned geomID1 = leaf1[j].geomID();
+          const unsigned primID1 = leaf1[j].primID();
+          if (this->scene0 == this->scene1 && geomID0 == geomID1 && primID0 == primID1) continue;
+          collisions[num_collisions++] = Collision(geomID0,primID0,geomID1,primID1);
+          if (num_collisions == 16) {
+            this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions);
+            num_collisions = 0;
+          }
+        }
+      }
+      if (num_collisions)
+        this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions);
+    }
+
+    template<int N>
+    void BVHNCollider<N>::collide_recurse(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1, size_t depth0, size_t depth1)
+    {
+      CSTAT(bvh_collide_traversal_steps++);
+      if (unlikely(ref0.isLeaf())) {
+        if (unlikely(ref1.isLeaf())) {
+          CSTAT(bvh_collide_leaf_pairs++);
+          processLeaf(ref0,ref1);
+          return;
+        } else goto recurse_node1;
+        
+      } else {
+        if (unlikely(ref1.isLeaf())) {
+          goto recurse_node0;
+        } else {
+          if (area(bounds0) > area(bounds1)) {
+            goto recurse_node0;
+          }
+          else {
+            goto recurse_node1;
+          }
+        }
+      }
+
+      {
+      recurse_node0:
+        AABBNode* node0 = ref0.getAABBNode();
+        size_t mask = overlap<N>(bounds1,*node0);
+        //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+        //for (size_t i=0; i<N; i++) {
+#if 0
+        if (depth0 < parallel_depth_threshold) 
+        {
+          parallel_for(size_t(N), [&] ( size_t i ) {
+              if (mask & ( 1 << i)) {
+                BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE);
+                collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1);
+              }
+            });
+        } 
+        else
+#endif
+        {
+          for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+            BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE);
+            collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1);
+          }
+        }
+        return;
+      }
+      
+      {
+      recurse_node1:
+        AABBNode* node1 = ref1.getAABBNode();
+        size_t mask = overlap<N>(bounds0,*node1);
+        //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+        //for (size_t i=0; i<N; i++) {
+#if 0
+        if (depth1 < parallel_depth_threshold) 
+        {
+          parallel_for(size_t(N), [&] ( size_t i ) {
+              if (mask & ( 1 << i)) {
+                BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE);
+                collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1);
+              }
+            });
+        }
+        else
+#endif
+        {
+          for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+            BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE);
+            collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1);
+          }
+        }
+        return;
+      }
+    }
+
+    template<int N>
+    void BVHNCollider<N>::split(const CollideJob& job, jobvector& jobs)
+    {
+      if (unlikely(job.ref0.isLeaf())) {
+        if (unlikely(job.ref1.isLeaf())) {
+          jobs.push_back(job);
+          return;
+        } else goto recurse_node1;
+      } else {
+        if (unlikely(job.ref1.isLeaf())) {
+          goto recurse_node0;
+        } else {
+          if (area(job.bounds0) > area(job.bounds1)) {
+            goto recurse_node0;
+          }
+          else {
+            goto recurse_node1;
+          }
+        }
+      }
+      
+      {
+      recurse_node0:
+        const AABBNode* node0 = job.ref0.getAABBNode();
+        size_t mask = overlap<N>(job.bounds1,*node0);
+        for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+          jobs.push_back(CollideJob(node0->child(i),node0->bounds(i),job.depth0+1,job.ref1,job.bounds1,job.depth1));
+        }
+        return;
+      }
+      
+      {
+      recurse_node1:
+        const AABBNode* node1 = job.ref1.getAABBNode();
+        size_t mask = overlap<N>(job.bounds0,*node1);
+        for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+          jobs.push_back(CollideJob(job.ref0,job.bounds0,job.depth0,node1->child(i),node1->bounds(i),job.depth1+1));
+        }
+        return;
+      }
+    }
+    
+    template<int N>
+    void BVHNCollider<N>::collide_recurse_entry(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1)
+    {
+      CSTAT(bvh_collide_traversal_steps = 0);
+      CSTAT(bvh_collide_leaf_pairs = 0);
+      CSTAT(bvh_collide_leaf_iterations = 0);
+      CSTAT(bvh_collide_prim_intersections1 = 0);
+      CSTAT(bvh_collide_prim_intersections2 = 0);
+      CSTAT(bvh_collide_prim_intersections3 = 0);
+      CSTAT(bvh_collide_prim_intersections4 = 0);
+      CSTAT(bvh_collide_prim_intersections5 = 0);
+      CSTAT(bvh_collide_prim_intersections = 0);
+#if 0
+      collide_recurse(ref0,bounds0,ref1,bounds1,0,0);
+#else
+      const int M = 2048;
+      jobvector jobs[2];
+      jobs[0].reserve(M);
+      jobs[1].reserve(M);
+      jobs[0].push_back(CollideJob(ref0,bounds0,0,ref1,bounds1,0));
+      int source = 0;
+      int target = 1;
+
+      /* try to split job until job list is full */
+      while (jobs[source].size()+8 <= M)
+      {
+        for (size_t i=0; i<jobs[source].size(); i++)
+        {
+          const CollideJob& job = jobs[source][i];
+          size_t remaining = jobs[source].size()-i;
+          if (jobs[target].size()+remaining+8 > M) {
+            jobs[target].push_back(job);
+          } else {
+            split(job,jobs[target]);
+          }
+        }
+
+        /* stop splitting jobs if we reached only leaves and cannot make progress anymore */
+        if (jobs[target].size() == jobs[source].size())
+          break;
+
+        jobs[source].resize(0);
+        std::swap(source,target);
+      }
+
+      /* parallel processing of all jobs */
+      parallel_for(size_t(jobs[source].size()), [&] ( size_t i ) {
+          CollideJob& j = jobs[source][i];
+          collide_recurse(j.ref0,j.bounds0,j.ref1,j.bounds1,j.depth0,j.depth1);
+        });
+      
+      
+#endif
+      CSTAT(PRINT(bvh_collide_traversal_steps));
+      CSTAT(PRINT(bvh_collide_leaf_pairs));
+      CSTAT(PRINT(bvh_collide_leaf_iterations));
+      CSTAT(PRINT(bvh_collide_prim_intersections1));
+      CSTAT(PRINT(bvh_collide_prim_intersections2));
+      CSTAT(PRINT(bvh_collide_prim_intersections3));
+      CSTAT(PRINT(bvh_collide_prim_intersections4));
+      CSTAT(PRINT(bvh_collide_prim_intersections5));
+      CSTAT(PRINT(bvh_collide_prim_intersections));
+    }
+   
+    template<int N>
+    void BVHNColliderUserGeom<N>::collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr)
+    { 
+      BVHNColliderUserGeom<N>(bvh0->scene,bvh1->scene,callback,userPtr).
+        collide_recurse_entry(bvh0->root,bvh0->bounds.bounds(),bvh1->root,bvh1->bounds.bounds());
+    }
+
+#if defined (EMBREE_LOWEST_ISA)
+    struct collision_regression_test : public RegressionTest
+    {
+      collision_regression_test(const char* name) : RegressionTest(name) {
+        registerRegressionTest(this);
+      }
+    
+      bool run ()
+      {
+        bool passed = true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(-0.008815f, 0.041848f, -2.49875e-06f), Vec3fa(-0.008276f, 0.053318f, -2.49875e-06f), Vec3fa(0.003023f, 0.048969f, -2.49875e-06f),
+                                                                            Vec3fa(0.00245f, 0.037612f, -2.49875e-06f), Vec3fa(0.01434f, 0.042634f, -2.49875e-06f), Vec3fa(0.013499f, 0.031309f, -2.49875e-06f)) == false;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,1),Vec3fa(0,1,1)) == false;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,-0.1f),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,-0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(-0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), 
+                                               Vec3fa(-1,1,0) + Vec3fa(0,0,0),Vec3fa(-1,1,0) + Vec3fa(0.1f,0,0),Vec3fa(-1,1,0) + Vec3fa(0,0.1f,0)) == false;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), 
+                                               Vec3fa( 2,0.5f,0) + Vec3fa(0,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0.1f,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0,0.1f,0)) == false;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), 
+                                               Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0.1f,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0.1f,0)) == false;
+        return passed;
+      }
+    };
+
+    collision_regression_test collision_regression("collision_regression_test");
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Collider Definitions
+    ////////////////////////////////////////////////////////////////////////////////
+
+    DEFINE_COLLIDER(BVH4ColliderUserGeom,BVHNColliderUserGeom<4>);
+
+#if defined(__AVX__)
+    DEFINE_COLLIDER(BVH8ColliderUserGeom,BVHNColliderUserGeom<8>);
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_collider.h b/thirdparty/embree/kernels/bvh/bvh_collider.h
new file mode 100644
index 0000000000..3c42f211c1
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_collider.h
@@ -0,0 +1,72 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/object.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N>
+      class BVHNCollider
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+
+      struct CollideJob
+      {
+        CollideJob () {}
+        
+        CollideJob (NodeRef ref0, const BBox3fa& bounds0, size_t depth0,
+                    NodeRef ref1, const BBox3fa& bounds1, size_t depth1)
+        : ref0(ref0), bounds0(bounds0), depth0(depth0), ref1(ref1), bounds1(bounds1), depth1(depth1) {}
+        
+        NodeRef ref0;
+        BBox3fa bounds0;
+        size_t depth0;
+        NodeRef ref1;
+        BBox3fa bounds1;
+        size_t depth1;
+      };
+
+      typedef vector_t<CollideJob, aligned_allocator<CollideJob,16>> jobvector;
+
+      void split(const CollideJob& job, jobvector& jobs);
+      
+    public:
+      __forceinline BVHNCollider (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr)
+        : scene0(scene0), scene1(scene1), callback(callback), userPtr(userPtr) {}
+
+    public:
+      virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) = 0;
+      void collide_recurse(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1, size_t depth0, size_t depth1);
+      void collide_recurse_entry(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1);
+    
+    protected:
+      Scene* scene0;
+      Scene* scene1;
+      RTCCollideFunc callback;
+      void* userPtr;
+    };
+
+    template<int N>
+      class BVHNColliderUserGeom : public BVHNCollider<N>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+
+      __forceinline BVHNColliderUserGeom (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr)
+        : BVHNCollider<N>(scene0,scene1,callback,userPtr) {}
+
+      virtual void processLeaf(NodeRef leaf0, NodeRef leaf1);
+    public:
+      static void collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr);
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_factory.h b/thirdparty/embree/kernels/bvh/bvh_factory.h
new file mode 100644
index 0000000000..453d455bd9
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_factory.h
@@ -0,0 +1,21 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../common/isa.h"
+#include "../common/accel.h"
+#include "../common/scene.h"
+#include "../geometry/curve_intersector_virtual.h"
+
+namespace embree
+{
+  /*! BVH instantiations */
+  class BVHFactory
+  {
+  public:
+    enum class BuildVariant     { STATIC, DYNAMIC, HIGH_QUALITY };
+    enum class IntersectVariant { FAST, ROBUST };
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp
new file mode 100644
index 0000000000..9594f402c3
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp
@@ -0,0 +1,321 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector1.h"
+#include "node_intersector1.h"
+#include "bvh_traverser1.h"
+
+#include "../geometry/intersector_iterators.h"
+#include "../geometry/triangle_intersector.h"
+#include "../geometry/trianglev_intersector.h"
+#include "../geometry/trianglev_mb_intersector.h"
+#include "../geometry/trianglei_intersector.h"
+#include "../geometry/quadv_intersector.h"
+#include "../geometry/quadi_intersector.h"
+#include "../geometry/curveNv_intersector.h"
+#include "../geometry/curveNi_intersector.h"
+#include "../geometry/curveNi_mb_intersector.h"
+#include "../geometry/linei_intersector.h"
+#include "../geometry/subdivpatch1_intersector.h"
+#include "../geometry/object_intersector.h"
+#include "../geometry/instance_intersector.h"
+#include "../geometry/subgrid_intersector.h"
+#include "../geometry/subgrid_mb_intersector.h"
+#include "../geometry/curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::intersect(const Accel::Intersectors* __restrict__ This,
+                                                                              RayHit& __restrict__ ray,
+                                                                              IntersectContext* __restrict__ context)
+    {
+      const BVH* __restrict__ bvh = (const BVH*)This->ptr;
+      
+      /* we may traverse an empty BVH in case all geometry was invalid */
+      if (bvh->root == BVH::emptyNode)
+        return;
+      
+      /* perform per ray precalculations required by the primitive intersector */
+      Precalculations pre(ray, bvh);
+
+      /* stack state */
+      StackItemT<NodeRef> stack[stackSize];    // stack of nodes
+      StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer
+      StackItemT<NodeRef>* stackEnd = stack+stackSize;
+      stack[0].ptr  = bvh->root;
+      stack[0].dist = neg_inf;
+      
+      if (bvh->root == BVH::emptyNode)
+        return;
+      
+      /* filter out invalid rays */
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+      if (!ray.valid()) return;
+#endif
+      /* verify correct input */
+      assert(ray.valid());
+      assert(ray.tnear() >= 0.0f);
+      assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f));
+
+      /* load the ray into SIMD registers */
+      TravRay<N,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f));
+
+      /* initialize the node traverser */
+      BVHNNodeTraverser1Hit<N, types> nodeTraverser;
+
+      /* pop loop */
+      while (true) pop:
+      {
+        /* pop next node */
+        if (unlikely(stackPtr == stack)) break;
+        stackPtr--;
+        NodeRef cur = NodeRef(stackPtr->ptr);
+
+        /* if popped node is too far, pop next one */
+        if (unlikely(*(float*)&stackPtr->dist > ray.tfar))
+          continue;
+
+        /* downtraversal loop */
+        while (true)
+        {
+          /* intersect node */
+          size_t mask; vfloat<N> tNear;
+          STAT3(normal.trav_nodes,1,1,1);
+          bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray, ray.time(), tNear, mask);
+          if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; }
+
+          /* if no child is hit, pop next node */
+          if (unlikely(mask == 0))
+            goto pop;
+
+          /* select next child and push other children */
+          nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd);
+        }
+
+        /* this is a leaf node */
+        assert(cur != BVH::emptyNode);
+        STAT3(normal.trav_leaves,1,1,1);
+        size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+        size_t lazy_node = 0;
+        PrimitiveIntersector1::intersect(This, pre, ray, context, prim, num, tray, lazy_node);
+        tray.tfar = ray.tfar;
+
+        /* push lazy node onto stack */
+        if (unlikely(lazy_node)) {
+          stackPtr->ptr = lazy_node;
+          stackPtr->dist = neg_inf;
+          stackPtr++;
+        }
+      }
+    }
+
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::occluded(const Accel::Intersectors* __restrict__ This,
+                                                                             Ray& __restrict__ ray,
+                                                                             IntersectContext* __restrict__ context)
+    {
+      const BVH* __restrict__ bvh = (const BVH*)This->ptr;
+      
+      /* we may traverse an empty BVH in case all geometry was invalid */
+      if (bvh->root == BVH::emptyNode)
+        return;
+       
+      /* early out for already occluded rays */
+      if (unlikely(ray.tfar < 0.0f))
+        return;
+
+      /* perform per ray precalculations required by the primitive intersector */
+      Precalculations pre(ray, bvh);
+
+      /* stack state */
+      NodeRef stack[stackSize];    // stack of nodes that still need to get traversed
+      NodeRef* stackPtr = stack+1; // current stack pointer
+      NodeRef* stackEnd = stack+stackSize;
+      stack[0] = bvh->root;
+
+      /* filter out invalid rays */
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+      if (!ray.valid()) return;
+#endif
+
+      /* verify correct input */
+      assert(ray.valid());
+      assert(ray.tnear() >= 0.0f);
+      assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f));
+
+      /* load the ray into SIMD registers */
+      TravRay<N,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f));
+
+      /* initialize the node traverser */
+      BVHNNodeTraverser1Hit<N, types> nodeTraverser;
+
+      /* pop loop */
+      while (true) pop:
+      {
+        /* pop next node */
+        if (unlikely(stackPtr == stack)) break;
+        stackPtr--;
+        NodeRef cur = (NodeRef)*stackPtr;
+
+        /* downtraversal loop */
+        while (true)
+        {
+          /* intersect node */
+          size_t mask; vfloat<N> tNear;
+          STAT3(shadow.trav_nodes,1,1,1);
+          bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray, ray.time(), tNear, mask);
+          if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; }
+
+          /* if no child is hit, pop next node */
+          if (unlikely(mask == 0))
+            goto pop;
+
+          /* select next child and push other children */
+          nodeTraverser.traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd);
+        }
+
+        /* this is a leaf node */
+        assert(cur != BVH::emptyNode);
+        STAT3(shadow.trav_leaves,1,1,1);
+        size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+        size_t lazy_node = 0;
+        if (PrimitiveIntersector1::occluded(This, pre, ray, context, prim, num, tray, lazy_node)) {
+          ray.tfar = neg_inf;
+          break;
+        }
+
+        /* push lazy node onto stack */
+        if (unlikely(lazy_node)) {
+          *stackPtr = (NodeRef)lazy_node;
+          stackPtr++;
+        }
+      }
+    }
+
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    struct PointQueryDispatch
+    {
+      typedef typename PrimitiveIntersector1::Precalculations Precalculations;
+      typedef typename PrimitiveIntersector1::Primitive Primitive;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+
+      static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context)
+      {
+        const BVH* __restrict__ bvh = (const BVH*)This->ptr;
+        
+        /* we may traverse an empty BVH in case all geometry was invalid */
+        if (bvh->root == BVH::emptyNode)
+          return false;
+        
+        /* stack state */
+        StackItemT<NodeRef> stack[stackSize];    // stack of nodes
+        StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer
+        StackItemT<NodeRef>* stackEnd = stack+stackSize;
+        stack[0].ptr  = bvh->root;
+        stack[0].dist = neg_inf;
+        
+        /* verify correct input */
+        assert(!(types & BVH_MB) || (query->time >= 0.0f && query->time <= 1.0f));
+
+        /* load the point query into SIMD registers */
+        TravPointQuery<N> tquery(query->p, context->query_radius);
+
+        /* initialize the node traverser */
+        BVHNNodeTraverser1Hit<N,types> nodeTraverser;
+
+        bool changed = false;
+        float cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE
+                          ? query->radius * query->radius
+                          : dot(context->query_radius, context->query_radius);
+
+        /* pop loop */
+        while (true) pop:
+        {
+          /* pop next node */
+          if (unlikely(stackPtr == stack)) break;
+          stackPtr--;
+          NodeRef cur = NodeRef(stackPtr->ptr);
+
+          /* if popped node is too far, pop next one */
+          if (unlikely(*(float*)&stackPtr->dist > cull_radius))
+            continue;
+
+          /* downtraversal loop */
+          while (true)
+          {
+            /* intersect node */
+            size_t mask; vfloat<N> tNear;
+            STAT3(point_query.trav_nodes,1,1,1);
+            bool nodeIntersected;
+            if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+              nodeIntersected = BVHNNodePointQuerySphere1<N, types>::pointQuery(cur, tquery, query->time, tNear, mask);
+            } else {
+              nodeIntersected = BVHNNodePointQueryAABB1  <N, types>::pointQuery(cur, tquery, query->time, tNear, mask);
+            }
+            if (unlikely(!nodeIntersected)) { STAT3(point_query.trav_nodes,-1,-1,-1); break; }
+
+            /* if no child is hit, pop next node */
+            if (unlikely(mask == 0))
+              goto pop;
+
+            /* select next child and push other children */
+            nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd);
+          }
+
+          /* this is a leaf node */
+          assert(cur != BVH::emptyNode);
+          STAT3(point_query.trav_leaves,1,1,1);
+          size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+          size_t lazy_node = 0;
+          if (PrimitiveIntersector1::pointQuery(This, query, context, prim, num, tquery, lazy_node))
+          {
+            changed = true;
+            tquery.rad = context->query_radius;
+            cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE
+                        ? query->radius * query->radius
+                        : dot(context->query_radius, context->query_radius);
+          }
+
+          /* push lazy node onto stack */
+          if (unlikely(lazy_node)) {
+            stackPtr->ptr = lazy_node;
+            stackPtr->dist = neg_inf;
+            stackPtr++;
+          }
+        }
+        return changed;
+      }
+    };
+
+    /* disable point queries for not yet supported geometry types */
+    template<int N, int types, bool robust>
+    struct PointQueryDispatch<N, types, robust, VirtualCurveIntersector1> {
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+    };
+    
+    template<int N, int types, bool robust>
+    struct PointQueryDispatch<N, types, robust, SubdivPatch1Intersector1> {
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+    };
+    
+    template<int N, int types, bool robust>
+    struct PointQueryDispatch<N, types, robust, SubdivPatch1MBIntersector1> {
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+    };
+
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    bool BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::pointQuery(
+      const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context)
+    {
+      return PointQueryDispatch<N, types, robust, PrimitiveIntersector1>::pointQuery(This, query, context);
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1.h b/thirdparty/embree/kernels/bvh/bvh_intersector1.h
new file mode 100644
index 0000000000..2df3d6eddb
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector1.h
@@ -0,0 +1,34 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! BVH single ray intersector. */
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    class BVHNIntersector1
+    {
+      /* shortcuts for frequently used types */
+      typedef typename PrimitiveIntersector1::Precalculations Precalculations;
+      typedef typename PrimitiveIntersector1::Primitive Primitive;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+
+      static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+
+    public:
+      static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context);
+      static void occluded  (const Accel::Intersectors* This, Ray& ray, IntersectContext* context);
+      static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp
new file mode 100644
index 0000000000..831d613367
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp
@@ -0,0 +1,61 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector1.cpp"
+
+namespace embree
+{
+  namespace isa
+  {
+    int getISA() {
+      return VerifyMultiTargetLinking::getISA();
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// BVH4Intersector1 Definitions
+    ////////////////////////////////////////////////////////////////////////////////
+
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersector1 >));
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersector1 >));
+
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersector1 >));
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersector1 >));
+
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4Intersector1Moeller,  BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMIntersector1Moeller  <4 COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<TriangleMvIntersector1Pluecker<4 COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMvMBIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMiMBIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersector1<TriangleMvMBIntersector1Pluecker<4 COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersector1<TriangleMiMBIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMvIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<QuadMvIntersector1Pluecker<4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<QuadMiMBIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersector1<QuadMiMBIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1Intersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector1>));
+    IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1MBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubdivPatch1MBIntersector1>));
+    
+    IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<ObjectIntersector1<false>> >));
+    IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<ObjectIntersector1<true>> >));
+
+    IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<InstanceIntersector1> >));
+    IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<InstanceIntersector1MB> >));
+
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersector1Moeller<4 COMMA true> >));
+    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >));
+
+    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersector1Pluecker<4 COMMA true> >));
+    //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >));
+
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h
new file mode 100644
index 0000000000..50ebf375c4
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h
@@ -0,0 +1,58 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/stack_item.h"
+#include "node_intersector_frustum.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    template<int K, bool robust>
+    struct TravRayK;
+
+    /*! BVH hybrid packet intersector. Switches between packet and single ray traversal (optional). */
+    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single = true>
+    class BVHNIntersectorKHybrid
+    {
+      /* shortcuts for frequently used types */
+      typedef typename PrimitiveIntersectorK::Precalculations Precalculations;
+      typedef typename PrimitiveIntersectorK::Primitive Primitive;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::BaseNode BaseNode;
+      typedef typename BVH::AABBNode AABBNode;
+      
+      static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+      static const size_t stackSizeChunk = 1+(N-1)*BVH::maxDepth;
+
+      static const size_t switchThresholdIncoherent = \
+      (K==4)  ? 3 :
+      (K==8)  ? ((N==4) ? 5 : 7) :
+      (K==16) ? 14 : // 14 seems to work best for KNL due to better ordered chunk traversal
+      0;
+
+    private:
+      static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
+                             RayHitK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context);
+      static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
+                            RayK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context);
+
+    public:
+      static void intersect(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context);
+      static void occluded (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context);
+
+      static void intersectCoherent(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context);
+      static void occludedCoherent (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context);
+
+    };
+
+    /*! BVH packet intersector. */
+    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK>
+    class BVHNIntersectorKChunk : public BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, false> {};
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
new file mode 100644
index 0000000000..717f559677
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
@@ -0,0 +1,270 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector_packet_stream.h"
+#include "node_intersector_frustum.h"
+#include "bvh_traverser_stream.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    /*! BVH ray stream intersector. */
+    template<int N, int types, bool robust, typename PrimitiveIntersector>
+    class BVHNIntersectorStream
+    {
+      /* shortcuts for frequently used types */
+      template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>;
+      template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::BaseNode BaseNode;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::AABBNodeMB AABBNodeMB;
+
+      template<int K>
+      __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays,
+                                                        TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant)
+      {
+        const size_t numPackets = (numOctantRays+K-1)/K;
+
+        Vec3vf<K> tmp_min_rdir(pos_inf);
+        Vec3vf<K> tmp_max_rdir(neg_inf);
+        Vec3vf<K> tmp_min_org(pos_inf);
+        Vec3vf<K> tmp_max_org(neg_inf);
+        vfloat<K> tmp_min_dist(pos_inf);
+        vfloat<K> tmp_max_dist(neg_inf);
+
+        size_t m_active = 0;
+        for (size_t i = 0; i < numPackets; i++)
+        {
+          const vfloat<K> tnear = inputPackets[i]->tnear();
+          const vfloat<K> tfar  = inputPackets[i]->tfar;
+          vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f);
+
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+          m_valid &= inputPackets[i]->valid();
+#endif
+
+          m_active |= (size_t)movemask(m_valid) << (i*K);
+
+          vfloat<K> packet_min_dist = max(tnear, 0.0f);
+          vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf);
+          tmp_min_dist = min(tmp_min_dist, packet_min_dist);
+          tmp_max_dist = max(tmp_max_dist, packet_max_dist);
+
+          const Vec3vf<K>& org = inputPackets[i]->org;
+          const Vec3vf<K>& dir = inputPackets[i]->dir;
+
+          new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist);
+
+          tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf)));
+          tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf)));
+          tmp_min_org  = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf)));
+          tmp_max_org  = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf)));
+        }
+
+        m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1);
+
+        
+        const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x),
+                                      reduce_min(tmp_min_rdir.y),
+                                      reduce_min(tmp_min_rdir.z));
+
+        const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x),
+                                      reduce_max(tmp_max_rdir.y),
+                                      reduce_max(tmp_max_rdir.z));
+
+        const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x),
+                                        reduce_min(tmp_min_org.y),
+                                        reduce_min(tmp_min_org.z));
+
+        const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x),
+                                        reduce_max(tmp_max_org.y),
+                                        reduce_max(tmp_max_org.z));
+
+        commonOctant =
+          (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) &&
+          (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) &&
+          (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f);
+        
+        const float frustum_min_dist = reduce_min(tmp_min_dist);
+        const float frustum_max_dist = reduce_max(tmp_max_dist);
+
+        frustum.init(reduced_min_origin, reduced_max_origin,
+                     reduced_min_rdir, reduced_max_rdir,
+                     frustum_min_dist, frustum_max_dist,
+                     N);
+        
+        return m_active;
+      }
+
+      template<int K>
+      __forceinline static size_t intersectAABBNodePacket(size_t m_active,
+                                                             const TravRayKStream<K,robust>* packets,
+                                                             const AABBNode* __restrict__ node,
+                                                             size_t boxID,
+                                                             const NearFarPrecalculations& nf)
+      {
+        assert(m_active);
+        const size_t startPacketID = bsf(m_active) / K;
+        const size_t endPacketID   = bsr(m_active) / K;
+        size_t m_trav_active = 0;
+        for (size_t i = startPacketID; i <= endPacketID; i++)
+        {
+          const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf);
+          m_trav_active |= m_hit << (i*K);
+        } 
+        return m_trav_active;
+      }
+      
+      template<int K>
+      __forceinline static size_t traverseCoherentStream(size_t m_active,
+                                                         TravRayKStream<K, robust>* packets,
+                                                         const AABBNode* __restrict__ node,
+                                                         const Frustum<robust>& frustum,
+                                                         size_t* maskK,
+                                                         vfloat<N>& dist)
+      {
+        size_t m_node_hit = intersectNodeFrustum<N>(node, frustum, dist);
+        const size_t first_index    = bsf(m_active);
+        const size_t first_packetID = first_index / K;
+        const size_t first_rayID    = first_index % K;
+        size_t m_first_hit = intersectNode1<N>(node, packets[first_packetID], first_rayID, frustum.nf);
+
+        /* this make traversal independent of the ordering of rays */
+        size_t m_node = m_node_hit ^ m_first_hit;
+        while (unlikely(m_node))
+        {
+          const size_t boxID = bscf(m_node);
+          const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf);
+          m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID);
+          maskK[boxID] = m_current;
+        }
+        return m_node_hit;
+      }
+      
+      // TODO: explicit 16-wide path for KNL
+      template<int K>
+      __forceinline static vint<N> traverseIncoherentStream(size_t m_active,
+                                                             TravRayKStreamFast<K>* __restrict__ packets,
+                                                             const AABBNode* __restrict__ node,
+                                                             const NearFarPrecalculations& nf,
+                                                             const int shiftTable[32])
+      {
+        const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+        const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+        const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+        const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+        const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+        const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+        assert(m_active);
+        vint<N> vmask(zero);
+        do
+        {   
+          STAT3(shadow.trav_nodes,1,1,1);
+          const size_t rayID = bscf(m_active);
+          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
+          TravRayKStream<K,robust> &p = packets[rayID / K];
+          const size_t i = rayID % K;
+          const vint<N> bitmask(shiftTable[rayID]);
+          const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
+          const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
+          const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
+          const vfloat<N> tFarX  = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
+          const vfloat<N> tFarY  = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
+          const vfloat<N> tFarZ  = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); 
+          const vfloat<N> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
+          const vfloat<N> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<N>(p.tfar[i]));      
+
+          const vbool<N> hit_mask = tNear <= tFar;
+#if defined(__AVX2__)
+          vmask = vmask | (bitmask & vint<N>(hit_mask));
+#else
+          vmask = select(hit_mask, vmask | bitmask, vmask);
+#endif
+        } while(m_active);
+        return vmask;        
+      }
+
+      template<int K>
+      __forceinline static vint<N> traverseIncoherentStream(size_t m_active,
+                                                             TravRayKStreamRobust<K>* __restrict__ packets,
+                                                             const AABBNode* __restrict__ node,
+                                                             const NearFarPrecalculations& nf,
+                                                             const int shiftTable[32])
+      {
+        const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+        const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+        const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+        const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+        const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+        const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+        assert(m_active);
+        vint<N> vmask(zero);
+        do
+        {   
+          STAT3(shadow.trav_nodes,1,1,1);
+          const size_t rayID = bscf(m_active);
+          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
+          TravRayKStream<K,robust> &p = packets[rayID / K];
+          const size_t i = rayID % K;
+          const vint<N> bitmask(shiftTable[rayID]);
+          const vfloat<N> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i];
+          const vfloat<N> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i];
+          const vfloat<N> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i];
+          const vfloat<N> tFarX  = (bmaxX - p.org.x[i]) * p.rdir.x[i];
+          const vfloat<N> tFarY  = (bmaxY - p.org.y[i]) * p.rdir.y[i];
+          const vfloat<N> tFarZ  = (bmaxZ - p.org.z[i]) * p.rdir.z[i];
+          const vfloat<N> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
+          const vfloat<N> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<N>(p.tfar[i]));
+          const float round_down  = 1.0f-2.0f*float(ulp);
+          const float round_up    = 1.0f+2.0f*float(ulp);
+          const vbool<N> hit_mask = round_down*tNear <= round_up*tFar;
+#if defined(__AVX2__)
+          vmask = vmask | (bitmask & vint<N>(hit_mask));
+#else
+          vmask = select(hit_mask, vmask | bitmask, vmask);
+#endif
+        } while(m_active);
+        return vmask;
+      }
+                                                         
+
+      static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth;
+
+    public:
+      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
+      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
+
+    private:
+      template<int K>
+      static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+      template<int K>
+      static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+      template<int K>
+      static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+    };
+
+
+    /*! BVH ray stream intersector with direct fallback to packets. */
+    template<int N>
+    class BVHNIntersectorStreamPacketFallback
+    {
+    public:
+      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
+      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
+
+    private:
+      template<int K>
+      static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+      template<int K>
+      static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h
new file mode 100644
index 0000000000..e7df7c2ae2
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/ray.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    class RayStreamFilter
+    {
+    public:
+      static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context);
+      static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context);
+      static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+      static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context);
+
+      static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context);
+      static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context);
+      static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+      static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t N, IntersectContext* context);
+
+    private:
+      template<int K, bool intersect>
+      static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context);
+
+      template<int K, bool intersect>
+      static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context);
+
+      template<int K, bool intersect>
+      static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+
+      template<int K, bool intersect>
+      static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context);
+    };
+  }
+};
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h
new file mode 100644
index 0000000000..57530692bc
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h
@@ -0,0 +1,213 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! BVHN AABBNode */
+  template<typename NodeRef, int N>
+    struct AABBNode_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    
+    struct Create
+    {
+      __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc, size_t numChildren = 0) const
+      {
+        AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set
+    {
+      __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const BBox3fa& bounds) const {
+        node.getAABBNode()->setRef(i,child);
+        node.getAABBNode()->setBounds(i,bounds);
+      }
+    };
+    
+    struct Create2
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const
+      {
+        AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t), NodeRef::byteNodeAlignment); node->clear();
+        for (size_t i=0; i<num; i++) node->setBounds(i,children[i].bounds());
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set2
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+      {
+        AABBNode_t* node = ref.getAABBNode();
+        for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+        return ref;
+      }
+    };
+    
+    struct Set3
+    {
+      Set3 (FastAllocator* allocator, PrimRef* prims)
+      : allocator(allocator), prims(prims) {}
+      
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+      {
+        AABBNode_t* node = ref.getAABBNode();
+        for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+        
+        if (unlikely(precord.alloc_barrier))
+        {
+          PrimRef* begin = &prims[precord.prims.begin()];
+          PrimRef* end   = &prims[precord.prims.end()]; // FIXME: extended end for spatial split builder!!!!!
+          size_t bytes = (size_t)end - (size_t)begin;
+          allocator->addBlock(begin,bytes);
+        }
+        
+        return ref;
+      }
+      
+      FastAllocator* const allocator;
+      PrimRef* const prims;
+    };
+    
+    /*! Clears the node. */
+    __forceinline void clear() {
+      lower_x = lower_y = lower_z = pos_inf;
+      upper_x = upper_y = upper_z = neg_inf;
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets bounding box and ID of child. */
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    /*! Sets bounding box of child. */
+    __forceinline void setBounds(size_t i, const BBox3fa& bounds)
+    {
+      assert(i < N);
+      lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z;
+      upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z;
+    }
+    
+    /*! Sets bounding box and ID of child. */
+    __forceinline void set(size_t i, const NodeRef& ref, const BBox3fa& bounds) {
+      setBounds(i,bounds);
+      children[i] = ref;
+    }
+    
+    /*! Returns bounds of node. */
+    __forceinline BBox3fa bounds() const {
+      const Vec3fa lower(reduce_min(lower_x),reduce_min(lower_y),reduce_min(lower_z));
+      const Vec3fa upper(reduce_max(upper_x),reduce_max(upper_y),reduce_max(upper_z));
+      return BBox3fa(lower,upper);
+    }
+    
+    /*! Returns bounds of specified child. */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      assert(i < N);
+      const Vec3fa lower(lower_x[i],lower_y[i],lower_z[i]);
+      const Vec3fa upper(upper_x[i],upper_y[i],upper_z[i]);
+      return BBox3fa(lower,upper);
+    }
+    
+    /*! Returns extent of bounds of specified child. */
+    __forceinline Vec3fa extend(size_t i) const {
+      return bounds(i).size();
+    }
+    
+    /*! Returns bounds of all children (implemented later as specializations) */
+    __forceinline void bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const;
+    
+    /*! swap two children of the node */
+    __forceinline void swap(size_t i, size_t j)
+    {
+      assert(i<N && j<N);
+      std::swap(children[i],children[j]);
+      std::swap(lower_x[i],lower_x[j]);
+      std::swap(lower_y[i],lower_y[j]);
+      std::swap(lower_z[i],lower_z[j]);
+      std::swap(upper_x[i],upper_x[j]);
+      std::swap(upper_y[i],upper_y[j]);
+      std::swap(upper_z[i],upper_z[j]);
+    }
+
+    /*! swap the children of two nodes */
+    __forceinline static void swap(AABBNode_t* a, size_t i, AABBNode_t* b, size_t j)
+    {
+      assert(i<N && j<N);
+      std::swap(a->children[i],b->children[j]);
+      std::swap(a->lower_x[i],b->lower_x[j]);
+      std::swap(a->lower_y[i],b->lower_y[j]);
+      std::swap(a->lower_z[i],b->lower_z[j]);
+      std::swap(a->upper_x[i],b->upper_x[j]);
+      std::swap(a->upper_y[i],b->upper_y[j]);
+      std::swap(a->upper_z[i],b->upper_z[j]);
+    }
+
+    /*! compacts a node (moves empty children to the end) */
+    __forceinline static void compact(AABBNode_t* a)
+    {
+      /* find right most filled node */
+      ssize_t j=N;
+      for (j=j-1; j>=0; j--)
+        if (a->child(j) != NodeRef::emptyNode)
+          break;
+
+      /* replace empty nodes with filled nodes */
+      for (ssize_t i=0; i<j; i++) {
+        if (a->child(i) == NodeRef::emptyNode) {
+          a->swap(i,j);
+          for (j=j-1; j>i; j--)
+            if (a->child(j) != NodeRef::emptyNode)
+              break;
+        }
+      }
+    }
+    
+    /*! Returns reference to specified child */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! output operator */
+    friend embree_ostream operator<<(embree_ostream o, const AABBNode_t& n)
+    {
+      o << "AABBNode { " << embree_endl;
+      o << "  lower_x " << n.lower_x << embree_endl;
+      o << "  upper_x " << n.upper_x << embree_endl;
+      o << "  lower_y " << n.lower_y << embree_endl;
+      o << "  upper_y " << n.upper_y << embree_endl;
+      o << "  lower_z " << n.lower_z << embree_endl;
+      o << "  upper_z " << n.upper_z << embree_endl;
+      o << "  children = ";
+      for (size_t i=0; i<N; i++) o << n.children[i] << " ";
+      o << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    
+  public:
+    vfloat<N> lower_x;           //!< X dimension of lower bounds of all N children.
+    vfloat<N> upper_x;           //!< X dimension of upper bounds of all N children.
+    vfloat<N> lower_y;           //!< Y dimension of lower bounds of all N children.
+    vfloat<N> upper_y;           //!< Y dimension of upper bounds of all N children.
+    vfloat<N> lower_z;           //!< Z dimension of lower bounds of all N children.
+    vfloat<N> upper_z;           //!< Z dimension of upper bounds of all N children.
+  };
+
+  template<>
+    __forceinline void AABBNode_t<NodeRefPtr<4>,4>::bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const {
+    transpose(lower_x,lower_y,lower_z,vfloat4(zero),bounds0.lower,bounds1.lower,bounds2.lower,bounds3.lower);
+    transpose(upper_x,upper_y,upper_z,vfloat4(zero),bounds0.upper,bounds1.upper,bounds2.upper,bounds3.upper);
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h
new file mode 100644
index 0000000000..c4cea7d8ba
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h
@@ -0,0 +1,247 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! Motion Blur AABBNode */
+  template<typename NodeRef, int N>
+    struct AABBNodeMB_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    typedef BVHNodeRecord<NodeRef>     NodeRecord;
+    typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
+    typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+    
+    struct Create
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const
+      {
+        AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set
+    { 
+      template<typename BuildRecord>
+      __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
+      {
+        AABBNodeMB_t* node = ref.getAABBNodeMB();
+        
+        LBBox3fa bounds = empty;
+        for (size_t i=0; i<num; i++) {
+          node->setRef(i,children[i].ref);
+          node->setBounds(i,children[i].lbounds);
+          bounds.extend(children[i].lbounds);
+        }
+        return NodeRecordMB(ref,bounds);
+      }
+    };
+    
+    struct SetTimeRange
+    {
+      __forceinline SetTimeRange(BBox1f tbounds) : tbounds(tbounds) {}
+      
+      template<typename BuildRecord>
+      __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
+      {
+        AABBNodeMB_t* node = ref.getAABBNodeMB();
+        
+        LBBox3fa bounds = empty;
+        for (size_t i=0; i<num; i++) {
+          node->setRef(i, children[i].ref);
+          node->setBounds(i, children[i].lbounds, tbounds);
+          bounds.extend(children[i].lbounds);
+        }
+        return NodeRecordMB(ref,bounds);
+      }
+      
+      BBox1f tbounds;
+    };
+    
+    /*! Clears the node. */
+    __forceinline void clear()  {
+      lower_x = lower_y = lower_z = vfloat<N>(pos_inf);
+      upper_x = upper_y = upper_z = vfloat<N>(neg_inf);
+      lower_dx = lower_dy = lower_dz = vfloat<N>(0.0f);
+      upper_dx = upper_dy = upper_dz = vfloat<N>(0.0f);
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets ID of child. */
+    __forceinline void setRef(size_t i, NodeRef ref) {
+      children[i] = ref;
+    }
+    
+    /*! Sets bounding box of child. */
+    __forceinline void setBounds(size_t i, const BBox3fa& bounds0_i, const BBox3fa& bounds1_i)
+    {
+      /*! for empty bounds we have to avoid inf-inf=nan */
+      BBox3fa bounds0(min(bounds0_i.lower,Vec3fa(+FLT_MAX)),max(bounds0_i.upper,Vec3fa(-FLT_MAX)));
+      BBox3fa bounds1(min(bounds1_i.lower,Vec3fa(+FLT_MAX)),max(bounds1_i.upper,Vec3fa(-FLT_MAX)));
+      bounds0 = bounds0.enlarge_by(4.0f*float(ulp));
+      bounds1 = bounds1.enlarge_by(4.0f*float(ulp));
+      Vec3fa dlower = bounds1.lower-bounds0.lower;
+      Vec3fa dupper = bounds1.upper-bounds0.upper;
+      
+      lower_x[i] = bounds0.lower.x; lower_y[i] = bounds0.lower.y; lower_z[i] = bounds0.lower.z;
+      upper_x[i] = bounds0.upper.x; upper_y[i] = bounds0.upper.y; upper_z[i] = bounds0.upper.z;
+      
+      lower_dx[i] = dlower.x; lower_dy[i] = dlower.y; lower_dz[i] = dlower.z;
+      upper_dx[i] = dupper.x; upper_dy[i] = dupper.y; upper_dz[i] = dupper.z;
+    }
+    
+    /*! Sets bounding box of child. */
+    __forceinline void setBounds(size_t i, const LBBox3fa& bounds) {
+      setBounds(i, bounds.bounds0, bounds.bounds1);
+    }
+    
+    /*! Sets bounding box of child. */
+    __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) {
+      setBounds(i, bounds.global(tbounds));
+    }
+    
+    /*! Sets bounding box and ID of child. */
+    __forceinline void set(size_t i, NodeRef ref, const BBox3fa& bounds) {
+      lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z;
+      upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z;
+      children[i] = ref;
+    }
+    
+    /*! Sets bounding box and ID of child. */
+    __forceinline void set(size_t i, const NodeRecordMB4D& child)
+    {
+      setRef(i, child.ref);
+      setBounds(i, child.lbounds, child.dt);
+    }
+    
+    /*! Return bounding box for time 0 */
+    __forceinline BBox3fa bounds0(size_t i) const {
+      return BBox3fa(Vec3fa(lower_x[i],lower_y[i],lower_z[i]),
+                     Vec3fa(upper_x[i],upper_y[i],upper_z[i]));
+    }
+    
+    /*! Return bounding box for time 1 */
+    __forceinline BBox3fa bounds1(size_t i) const {
+      return BBox3fa(Vec3fa(lower_x[i]+lower_dx[i],lower_y[i]+lower_dy[i],lower_z[i]+lower_dz[i]),
+                     Vec3fa(upper_x[i]+upper_dx[i],upper_y[i]+upper_dy[i],upper_z[i]+upper_dz[i]));
+    }
+    
+    /*! Returns bounds of node. */
+    __forceinline BBox3fa bounds() const {
+      return BBox3fa(Vec3fa(reduce_min(min(lower_x,lower_x+lower_dx)),
+                            reduce_min(min(lower_y,lower_y+lower_dy)),
+                            reduce_min(min(lower_z,lower_z+lower_dz))),
+                     Vec3fa(reduce_max(max(upper_x,upper_x+upper_dx)),
+                            reduce_max(max(upper_y,upper_y+upper_dy)),
+                            reduce_max(max(upper_z,upper_z+upper_dz))));
+    }
+    
+    /*! Return bounding box of child i */
+    __forceinline BBox3fa bounds(size_t i) const {
+      return merge(bounds0(i),bounds1(i));
+    }
+    
+    /*! Return linear bounding box of child i */
+    __forceinline LBBox3fa lbounds(size_t i) const {
+      return LBBox3fa(bounds0(i),bounds1(i));
+    }
+    
+    /*! Return bounding box of child i at specified time */
+    __forceinline BBox3fa bounds(size_t i, float time) const {
+      return lerp(bounds0(i),bounds1(i),time);
+    }
+    
+    /*! Returns the expected surface area when randomly sampling the time. */
+    __forceinline float expectedHalfArea(size_t i) const {
+      return lbounds(i).expectedHalfArea();
+    }
+    
+    /*! Returns the expected surface area when randomly sampling the time. */
+    __forceinline float expectedHalfArea(size_t i, const BBox1f& t0t1) const {
+      return lbounds(i).expectedHalfArea(t0t1); 
+    }
+    
+    /*! swap two children of the node */
+    __forceinline void swap(size_t i, size_t j)
+    {
+      assert(i<N && j<N);
+      std::swap(children[i],children[j]);
+      
+      std::swap(lower_x[i],lower_x[j]);
+      std::swap(upper_x[i],upper_x[j]);
+      std::swap(lower_y[i],lower_y[j]);
+      std::swap(upper_y[i],upper_y[j]);
+      std::swap(lower_z[i],lower_z[j]);
+      std::swap(upper_z[i],upper_z[j]);
+      
+      std::swap(lower_dx[i],lower_dx[j]);
+      std::swap(upper_dx[i],upper_dx[j]);
+      std::swap(lower_dy[i],lower_dy[j]);
+      std::swap(upper_dy[i],upper_dy[j]);
+      std::swap(lower_dz[i],lower_dz[j]);
+      std::swap(upper_dz[i],upper_dz[j]);
+    }
+
+    /*! compacts a node (moves empty children to the end) */
+    __forceinline static void compact(AABBNodeMB_t* a)
+    {
+      /* find right most filled node */
+      ssize_t j=N;
+      for (j=j-1; j>=0; j--)
+        if (a->child(j) != NodeRef::emptyNode)
+          break;
+
+      /* replace empty nodes with filled nodes */
+      for (ssize_t i=0; i<j; i++) {
+        if (a->child(i) == NodeRef::emptyNode) {
+          a->swap(i,j);
+          for (j=j-1; j>i; j--)
+            if (a->child(j) != NodeRef::emptyNode)
+              break;
+        }
+      }
+    }
+    
+    /*! Returns reference to specified child */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! stream output operator */
+    friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB_t& n) 
+    {
+      cout << "AABBNodeMB {" << embree_endl;
+      for (size_t i=0; i<N; i++) 
+      {
+        const BBox3fa b0 = n.bounds0(i);
+        const BBox3fa b1 = n.bounds1(i);
+        cout << "  child" << i << " { " << embree_endl;
+        cout << "    bounds0 = " << b0 << ", " << embree_endl;
+        cout << "    bounds1 = " << b1 << ", " << embree_endl;
+        cout << "  }";
+      }
+      cout << "}";
+      return cout;
+    }
+    
+  public:
+    vfloat<N> lower_x;        //!< X dimension of lower bounds of all N children.
+    vfloat<N> upper_x;        //!< X dimension of upper bounds of all N children.
+    vfloat<N> lower_y;        //!< Y dimension of lower bounds of all N children.
+    vfloat<N> upper_y;        //!< Y dimension of upper bounds of all N children.
+    vfloat<N> lower_z;        //!< Z dimension of lower bounds of all N children.
+    vfloat<N> upper_z;        //!< Z dimension of upper bounds of all N children.
+    
+    vfloat<N> lower_dx;        //!< X dimension of lower bounds of all N children.
+    vfloat<N> upper_dx;        //!< X dimension of upper bounds of all N children.
+    vfloat<N> lower_dy;        //!< Y dimension of lower bounds of all N children.
+    vfloat<N> upper_dy;        //!< Y dimension of upper bounds of all N children.
+    vfloat<N> lower_dz;        //!< Z dimension of lower bounds of all N children.
+    vfloat<N> upper_dz;        //!< Z dimension of upper bounds of all N children.
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h
new file mode 100644
index 0000000000..46a81d7581
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h
@@ -0,0 +1,107 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_aabb_mb.h"
+
+namespace embree
+{
+  /*! Aligned 4D Motion Blur Node */
+  template<typename NodeRef, int N>
+    struct AABBNodeMB4D_t : public AABBNodeMB_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    using AABBNodeMB_t<NodeRef,N>::set;
+
+    typedef BVHNodeRecord<NodeRef>     NodeRecord;
+    typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
+    typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+    
+    struct Create
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord*, const size_t, const FastAllocator::CachedAllocator& alloc, bool hasTimeSplits = true) const
+      {
+        if (hasTimeSplits)
+        {
+          AABBNodeMB4D_t* node = (AABBNodeMB4D_t*) alloc.malloc0(sizeof(AABBNodeMB4D_t),NodeRef::byteNodeAlignment); node->clear();
+          return NodeRef::encodeNode(node);
+        }
+        else
+        {
+          AABBNodeMB_t<NodeRef,N>* node = (AABBNodeMB_t<NodeRef,N>*) alloc.malloc0(sizeof(AABBNodeMB_t<NodeRef,N>),NodeRef::byteNodeAlignment); node->clear();
+          return NodeRef::encodeNode(node);
+        }
+      }
+    };
+
+    struct Set
+    {
+      template<typename BuildRecord>
+      __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const
+      {
+        if (likely(ref.isAABBNodeMB())) {
+          for (size_t i=0; i<num; i++)
+            ref.getAABBNodeMB()->set(i, children[i]);
+        } else {
+          for (size_t i=0; i<num; i++)
+            ref.getAABBNodeMB4D()->set(i, children[i]);
+        }
+      }
+    };
+
+    /*! Clears the node. */
+    __forceinline void clear()  {
+      lower_t = vfloat<N>(pos_inf);
+      upper_t = vfloat<N>(neg_inf);
+      AABBNodeMB_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets bounding box of child. */
+    __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds)
+    {
+      AABBNodeMB_t<NodeRef,N>::setBounds(i, bounds.global(tbounds));
+      lower_t[i] = tbounds.lower;
+      upper_t[i] = tbounds.upper == 1.0f ? 1.0f+float(ulp) : tbounds.upper;
+    }
+    
+    /*! Sets bounding box and ID of child. */
+    __forceinline void set(size_t i, const NodeRecordMB4D& child) {
+      AABBNodeMB_t<NodeRef,N>::setRef(i,child.ref);
+      setBounds(i, child.lbounds, child.dt);
+    }
+    
+    /*! Returns the expected surface area when randomly sampling the time. */
+    __forceinline float expectedHalfArea(size_t i) const {
+      return AABBNodeMB_t<NodeRef,N>::lbounds(i).expectedHalfArea(timeRange(i));
+    }
+    
+    /*! returns time range for specified child */
+    __forceinline BBox1f timeRange(size_t i) const {
+      return BBox1f(lower_t[i],upper_t[i]);
+    }
+    
+    /*! stream output operator */
+    friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB4D_t& n) 
+    {
+      cout << "AABBNodeMB4D {" << embree_endl;
+      for (size_t i=0; i<N; i++) 
+      {
+        const BBox3fa b0 = n.bounds0(i);
+        const BBox3fa b1 = n.bounds1(i);
+        cout << "  child" << i << " { " << embree_endl;
+        cout << "    bounds0 = " << lerp(b0,b1,n.lower_t[i]) << ", " << embree_endl;
+        cout << "    bounds1 = " << lerp(b0,b1,n.upper_t[i]) << ", " << embree_endl;
+        cout << "    time_bounds = " << n.lower_t[i] << ", " << n.upper_t[i] << embree_endl;
+        cout << "  }";
+      }
+      cout << "}";
+      return cout;
+    }
+    
+  public:
+    vfloat<N> lower_t;        //!< time dimension of lower bounds of all N children
+    vfloat<N> upper_t;        //!< time dimension of upper bounds of all N children
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_base.h b/thirdparty/embree/kernels/bvh/bvh_node_base.h
new file mode 100644
index 0000000000..a5570a7b9e
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_node_base.h
@@ -0,0 +1,43 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_ref.h"
+
+namespace embree
+{
+  
+  /*! BVHN Base Node */
+  template<typename NodeRef, int N>
+    struct BaseNode_t
+  {
+    /*! Clears the node. */
+    __forceinline void clear()
+    {
+      for (size_t i=0; i<N; i++)
+        children[i] = NodeRef::emptyNode;
+    }
+    
+    /*! Returns reference to specified child */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! verifies the node */
+    __forceinline bool verify() const
+    {
+      for (size_t i=0; i<N; i++) {
+        if (child(i) == NodeRef::emptyNode) {
+          for (; i<N; i++) {
+            if (child(i) != NodeRef::emptyNode)
+              return false;
+          }
+          break;
+        }
+      }
+      return true;
+    }
+    
+    NodeRef children[N];    //!< Pointer to the N children (can be a node or leaf)
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_obb.h b/thirdparty/embree/kernels/bvh/bvh_node_obb.h
new file mode 100644
index 0000000000..e6b500691e
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_node_obb.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! Node with unaligned bounds */
+  template<typename NodeRef, int N>
+    struct OBBNode_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    
+    struct Create
+    {
+      __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const
+      {
+        OBBNode_t* node = (OBBNode_t*) alloc.malloc0(sizeof(OBBNode_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set
+    {
+      __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const OBBox3fa& bounds) const {
+        node.ungetAABBNode()->setRef(i,child);
+        node.ungetAABBNode()->setBounds(i,bounds);
+      }
+    };
+    
+    /*! Clears the node. */
+    __forceinline void clear()
+    {
+      naabb.l.vx = Vec3fa(nan);
+      naabb.l.vy = Vec3fa(nan);
+      naabb.l.vz = Vec3fa(nan);
+      naabb.p    = Vec3fa(nan);
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets bounding box. */
+    __forceinline void setBounds(size_t i, const OBBox3fa& b)
+    {
+      assert(i < N);
+      
+      AffineSpace3fa space = b.space;
+      space.p -= b.bounds.lower;
+      space = AffineSpace3fa::scale(1.0f/max(Vec3fa(1E-19f),b.bounds.upper-b.bounds.lower))*space;
+      
+      naabb.l.vx.x[i] = space.l.vx.x;
+      naabb.l.vx.y[i] = space.l.vx.y;
+      naabb.l.vx.z[i] = space.l.vx.z;
+      
+      naabb.l.vy.x[i] = space.l.vy.x;
+      naabb.l.vy.y[i] = space.l.vy.y;
+      naabb.l.vy.z[i] = space.l.vy.z;
+      
+      naabb.l.vz.x[i] = space.l.vz.x;
+      naabb.l.vz.y[i] = space.l.vz.y;
+      naabb.l.vz.z[i] = space.l.vz.z;
+      
+      naabb.p.x[i] = space.p.x;
+      naabb.p.y[i] = space.p.y;
+      naabb.p.z[i] = space.p.z;
+    }
+    
+    /*! Sets ID of child. */
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    /*! Returns the extent of the bounds of the ith child */
+    __forceinline Vec3fa extent(size_t i) const {
+      assert(i<N);
+      const Vec3fa vx(naabb.l.vx.x[i],naabb.l.vx.y[i],naabb.l.vx.z[i]);
+      const Vec3fa vy(naabb.l.vy.x[i],naabb.l.vy.y[i],naabb.l.vy.z[i]);
+      const Vec3fa vz(naabb.l.vz.x[i],naabb.l.vz.y[i],naabb.l.vz.z[i]);
+      return rsqrt(vx*vx + vy*vy + vz*vz);
+    }
+    
+    /*! Returns reference to specified child */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! output operator */
+    friend embree_ostream operator<<(embree_ostream o, const OBBNode_t& n)
+    {
+      o << "UnAABBNode { " << n.naabb << " } " << embree_endl;
+      return o;
+    }
+    
+  public:
+    AffineSpace3vf<N> naabb;   //!< non-axis aligned bounding boxes (bounds are [0,1] in specified space)
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_obb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_obb_mb.h
new file mode 100644
index 0000000000..c06b1aea5e
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_node_obb_mb.h
@@ -0,0 +1,90 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  template<typename NodeRef, int N>
+    struct OBBNodeMB_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    
+    struct Create
+    {
+      __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const
+      {
+        OBBNodeMB_t* node = (OBBNodeMB_t*) alloc.malloc0(sizeof(OBBNodeMB_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set
+    {
+      __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const LinearSpace3fa& space, const LBBox3fa& lbounds, const BBox1f dt) const {
+        node.ungetAABBNodeMB()->setRef(i,child);
+        node.ungetAABBNodeMB()->setBounds(i,space,lbounds.global(dt));
+      }
+    };
+    
+    /*! Clears the node. */
+    __forceinline void clear()
+    {
+      space0 = one;
+      //b0.lower = b0.upper = Vec3fa(nan);
+      b1.lower = b1.upper = Vec3fa(nan);
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets space and bounding boxes. */
+    __forceinline void setBounds(size_t i, const AffineSpace3fa& space, const LBBox3fa& lbounds) {
+      setBounds(i,space,lbounds.bounds0,lbounds.bounds1);
+    }
+    
+    /*! Sets space and bounding boxes. */
+    __forceinline void setBounds(size_t i, const AffineSpace3fa& s0, const BBox3fa& a, const BBox3fa& c)
+    {
+      assert(i < N);
+      
+      AffineSpace3fa space = s0;
+      space.p -= a.lower;
+      Vec3fa scale = 1.0f/max(Vec3fa(1E-19f),a.upper-a.lower);
+      space = AffineSpace3fa::scale(scale)*space;
+      BBox3fa a1((a.lower-a.lower)*scale,(a.upper-a.lower)*scale);
+      BBox3fa c1((c.lower-a.lower)*scale,(c.upper-a.lower)*scale);
+      
+      space0.l.vx.x[i] = space.l.vx.x; space0.l.vx.y[i] = space.l.vx.y; space0.l.vx.z[i] = space.l.vx.z;
+      space0.l.vy.x[i] = space.l.vy.x; space0.l.vy.y[i] = space.l.vy.y; space0.l.vy.z[i] = space.l.vy.z;
+      space0.l.vz.x[i] = space.l.vz.x; space0.l.vz.y[i] = space.l.vz.y; space0.l.vz.z[i] = space.l.vz.z;
+      space0.p   .x[i] = space.p   .x; space0.p   .y[i] = space.p   .y; space0.p   .z[i] = space.p   .z;
+      
+      /*b0.lower.x[i] = a1.lower.x; b0.lower.y[i] = a1.lower.y; b0.lower.z[i] = a1.lower.z;
+        b0.upper.x[i] = a1.upper.x; b0.upper.y[i] = a1.upper.y; b0.upper.z[i] = a1.upper.z;*/
+      
+      b1.lower.x[i] = c1.lower.x; b1.lower.y[i] = c1.lower.y; b1.lower.z[i] = c1.lower.z;
+      b1.upper.x[i] = c1.upper.x; b1.upper.y[i] = c1.upper.y; b1.upper.z[i] = c1.upper.z;
+    }
+    
+    /*! Sets ID of child. */
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    /*! Returns the extent of the bounds of the ith child */
+    __forceinline Vec3fa extent0(size_t i) const {
+      assert(i < N);
+      const Vec3fa vx(space0.l.vx.x[i],space0.l.vx.y[i],space0.l.vx.z[i]);
+      const Vec3fa vy(space0.l.vy.x[i],space0.l.vy.y[i],space0.l.vy.z[i]);
+      const Vec3fa vz(space0.l.vz.x[i],space0.l.vz.y[i],space0.l.vz.z[i]);
+      return rsqrt(vx*vx + vy*vy + vz*vz);
+    }
+    
+  public:
+    AffineSpace3vf<N> space0;
+    //BBox3vf<N> b0; // these are the unit bounds
+    BBox3vf<N> b1;
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h
new file mode 100644
index 0000000000..2afc8c98e7
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h
@@ -0,0 +1,265 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! BVHN Quantized Node */
+  template<int N>
+    struct __aligned(8) QuantizedBaseNode_t
+  {
+    typedef unsigned char T;
+    static const T MIN_QUAN = 0;
+    static const T MAX_QUAN = 255;
+    
+    /*! Clears the node. */
+    __forceinline void clear() {
+      for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN;
+      for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN;
+    }
+    
+    /*! Returns bounds of specified child. */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      assert(i < N);
+      const Vec3fa lower(madd(scale.x,(float)lower_x[i],start.x),
+                         madd(scale.y,(float)lower_y[i],start.y),
+                         madd(scale.z,(float)lower_z[i],start.z));
+      const Vec3fa upper(madd(scale.x,(float)upper_x[i],start.x),
+                         madd(scale.y,(float)upper_y[i],start.y),
+                         madd(scale.z,(float)upper_z[i],start.z));
+      return BBox3fa(lower,upper);
+    }
+    
+    /*! Returns extent of bounds of specified child. */
+    __forceinline Vec3fa extent(size_t i) const {
+      return bounds(i).size();
+    }
+    
+    static __forceinline void init_dim(const vfloat<N> &lower,
+                                       const vfloat<N> &upper,
+                                       T lower_quant[N],
+                                       T upper_quant[N],
+                                       float &start,
+                                       float &scale)
+    {
+      /* quantize bounds */
+      const vbool<N> m_valid = lower != vfloat<N>(pos_inf);
+      const float minF = reduce_min(lower);
+      const float maxF = reduce_max(upper);
+      float diff = (1.0f+2.0f*float(ulp))*(maxF - minF);
+      float decode_scale = diff / float(MAX_QUAN);
+      if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero
+      assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF);
+      const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f;
+      vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN);
+      vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN);
+      
+      /* lower/upper correction */
+      vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower;
+      vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper;
+      ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN);
+      iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN);
+      
+      /* disable invalid lanes */
+      ilower = select(m_valid,ilower,MAX_QUAN);
+      iupper = select(m_valid,iupper,MIN_QUAN);
+      
+      /* store as uchar to memory */
+      vint<N>::store(lower_quant,ilower);
+      vint<N>::store(upper_quant,iupper);
+      start = minF;
+      scale = decode_scale;
+      
+#if defined(DEBUG)
+      vfloat<N> extract_lower( vint<N>::loadu(lower_quant) );
+      vfloat<N> extract_upper( vint<N>::loadu(upper_quant) );
+      vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF);
+      vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF);
+      assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid));
+      assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid));
+#endif
+    }
+    
+    __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node)
+    {
+      init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x);
+      init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y);
+      init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z);
+    }
+    
+    __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); }
+    
+#if defined(__AVX512F__) // KNL
+    __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); }
+#endif
+    __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); }
+    
+    __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); }
+    
+    __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); }
+    
+    __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); }
+    
+    __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); }
+    
+    __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); }
+    
+    template <int M>
+      __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); }
+    
+#if defined(__AVX512F__)
+    __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); }
+    __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); }
+    __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); }      
+#endif
+    
+    union {
+      struct {
+        T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children
+        T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children
+        T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children
+        T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children
+        T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children
+        T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children
+      };
+      T all_planes[6*N];
+    };
+    
+    Vec3f start;
+    Vec3f scale;
+    
+    friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n)
+    {
+      o << "QuantizedBaseNode { " << embree_endl;
+      o << "  start   " << n.start << embree_endl;
+      o << "  scale   " << n.scale << embree_endl;
+      o << "  lower_x " << vuint<N>::loadu(n.lower_x) << embree_endl;
+      o << "  upper_x " << vuint<N>::loadu(n.upper_x) << embree_endl;
+      o << "  lower_y " << vuint<N>::loadu(n.lower_y) << embree_endl;
+      o << "  upper_y " << vuint<N>::loadu(n.upper_y) << embree_endl;
+      o << "  lower_z " << vuint<N>::loadu(n.lower_z) << embree_endl;
+      o << "  upper_z " << vuint<N>::loadu(n.upper_z) << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    
+  };
+
+  template<typename NodeRef, int N>
+    struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    using QuantizedBaseNode_t<N>::lower_x;
+    using QuantizedBaseNode_t<N>::upper_x;
+    using QuantizedBaseNode_t<N>::lower_y;
+    using QuantizedBaseNode_t<N>::upper_y;
+    using QuantizedBaseNode_t<N>::lower_z;
+    using QuantizedBaseNode_t<N>::upper_z;
+    using QuantizedBaseNode_t<N>::start;
+    using QuantizedBaseNode_t<N>::scale;
+    using QuantizedBaseNode_t<N>::init_dim;
+    
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    struct Create2
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const
+      {
+        __aligned(64) AABBNode_t<NodeRef,N> node;
+        node.clear();
+        for (size_t i=0; i<n; i++) {
+          node.setBounds(i,children[i].bounds());
+        }
+        QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(sizeof(QuantizedNode_t), NodeRef::byteAlignment);
+        qnode->init(node);
+        
+        return (size_t)qnode | NodeRef::tyQuantizedNode;
+      }
+    };
+    
+    struct Set2
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+      {
+        QuantizedNode_t* node = ref.quantizedNode();
+        for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+        return ref;
+      }
+    };
+    
+    __forceinline void init(AABBNode_t<NodeRef,N>& node)
+    {
+      for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode;
+      init_dim(node);
+    }
+    
+  }; 
+  
+  /*! BVHN Quantized Node */
+  template<int N>
+    struct __aligned(8) QuantizedBaseNodeMB_t
+  {
+    QuantizedBaseNode_t<N> node0;
+    QuantizedBaseNode_t<N> node1;
+    
+    /*! Clears the node. */
+    __forceinline void clear() {
+      node0.clear();
+      node1.clear();
+    }
+    
+    /*! Returns bounds of specified child. */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      assert(i < N);
+      BBox3fa bounds0 = node0.bounds(i);
+      BBox3fa bounds1 = node1.bounds(i);
+      bounds0.extend(bounds1);
+      return bounds0;
+    }
+    
+    /*! Returns extent of bounds of specified child. */
+    __forceinline Vec3fa extent(size_t i) const {
+      return bounds(i).size();
+    }
+    
+    __forceinline vbool<N> validMask() const { return node0.validMask(); }
+    
+    template<typename T>
+      __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); }
+    
+    
+    template<int M>
+      __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); }
+    
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_ref.h b/thirdparty/embree/kernels/bvh/bvh_node_ref.h
new file mode 100644
index 0000000000..6f6da758de
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_node_ref.h
@@ -0,0 +1,242 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/alloc.h"
+#include "../common/accel.h"
+#include "../common/device.h"
+#include "../common/scene.h"
+#include "../geometry/primitive.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  /* BVH node reference with bounds */
+  template<typename NodeRef>
+  struct BVHNodeRecord
+  {
+    __forceinline BVHNodeRecord() {}
+    __forceinline BVHNodeRecord(NodeRef ref, const BBox3fa& bounds) : ref(ref), bounds((BBox3fx)bounds) {}
+    __forceinline BVHNodeRecord(NodeRef ref, const BBox3fx& bounds) : ref(ref), bounds(bounds) {}
+
+    NodeRef ref;
+    BBox3fx bounds;
+  };
+
+  template<typename NodeRef>
+  struct BVHNodeRecordMB
+  {
+    __forceinline BVHNodeRecordMB() {}
+    __forceinline BVHNodeRecordMB(NodeRef ref, const LBBox3fa& lbounds) : ref(ref), lbounds(lbounds) {}
+
+    NodeRef ref;
+    LBBox3fa lbounds;
+  };
+
+  template<typename NodeRef>
+  struct BVHNodeRecordMB4D
+  {
+    __forceinline BVHNodeRecordMB4D() {}
+    __forceinline BVHNodeRecordMB4D(NodeRef ref, const LBBox3fa& lbounds, const BBox1f& dt) : ref(ref), lbounds(lbounds), dt(dt) {}
+
+    NodeRef ref;
+    LBBox3fa lbounds;
+    BBox1f dt;
+  };
+
+  template<typename NodeRef, int N> struct BaseNode_t;
+  template<typename NodeRef, int N> struct AABBNode_t;
+  template<typename NodeRef, int N> struct AABBNodeMB_t;
+  template<typename NodeRef, int N> struct AABBNodeMB4D_t;
+  template<typename NodeRef, int N> struct OBBNode_t;
+  template<typename NodeRef, int N> struct OBBNodeMB_t;
+  template<typename NodeRef, int N> struct QuantizedNode_t;
+  template<typename NodeRef, int N> struct QuantizedNodeMB_t;
+  
+  /*! Pointer that points to a node or a list of primitives */
+  template<int N>
+    struct NodeRefPtr
+  {
+    //template<int NN> friend class BVHN;
+
+    /*! Number of bytes the nodes and primitives are minimally aligned to.*/
+    static const size_t byteAlignment = 16;
+    static const size_t byteNodeAlignment = 4*N;
+
+    /*! highest address bit is used as barrier for some algorithms */
+    static const size_t barrier_mask = (1LL << (8*sizeof(size_t)-1));
+
+    /*! Masks the bits that store the number of items per leaf. */
+    static const size_t align_mask = byteAlignment-1;
+    static const size_t items_mask = byteAlignment-1;
+
+    /*! different supported node types */
+    static const size_t tyAABBNode = 0;
+    static const size_t tyAABBNodeMB = 1;
+    static const size_t tyAABBNodeMB4D = 6;
+    static const size_t tyOBBNode = 2;
+    static const size_t tyOBBNodeMB = 3;
+    static const size_t tyQuantizedNode = 5;
+    static const size_t tyLeaf = 8;
+
+    /*! Empty node */
+    static const size_t emptyNode = tyLeaf;
+
+    /*! Invalid node, used as marker in traversal */
+    static const size_t invalidNode = (((size_t)-1) & (~items_mask)) | (tyLeaf+0);
+    static const size_t popRay      = (((size_t)-1) & (~items_mask)) | (tyLeaf+1);
+
+    /*! Maximum number of primitive blocks in a leaf. */
+    static const size_t maxLeafBlocks = items_mask-tyLeaf;
+        
+    /*! Default constructor */
+    __forceinline NodeRefPtr () {}
+    
+    /*! Construction from integer */
+    __forceinline NodeRefPtr (size_t ptr) : ptr(ptr) {}
+    
+    /*! Cast to size_t */
+    __forceinline operator size_t() const { return ptr; }
+    
+    /*! Sets the barrier bit. */
+    __forceinline void setBarrier() {
+#if defined(__64BIT__)
+      assert(!isBarrier());
+      ptr |= barrier_mask;
+#else
+      assert(false);
+#endif
+    }
+    
+    /*! Clears the barrier bit. */
+    __forceinline void clearBarrier() {
+#if defined(__64BIT__)
+      ptr &= ~barrier_mask;
+#else
+      assert(false);
+#endif
+    }
+    
+    /*! Checks if this is an barrier. A barrier tells the top level tree rotations how deep to enter the tree. */
+    __forceinline bool isBarrier() const { return (ptr & barrier_mask) != 0; }
+    
+    /*! checks if this is a leaf */
+    __forceinline size_t isLeaf() const { return ptr & tyLeaf; }
+    
+    /*! returns node type */
+    __forceinline int type() const { return ptr & (size_t)align_mask; }
+    
+    /*! checks if this is a node */
+    __forceinline int isAABBNode() const { return (ptr & (size_t)align_mask) == tyAABBNode; }
+    
+    /*! checks if this is a motion blur node */
+    __forceinline int isAABBNodeMB() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB; }
+    
+    /*! checks if this is a 4D motion blur node */
+    __forceinline int isAABBNodeMB4D() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB4D; }
+    
+    /*! checks if this is a node with unaligned bounding boxes */
+    __forceinline int isOBBNode() const { return (ptr & (size_t)align_mask) == tyOBBNode; }
+    
+    /*! checks if this is a motion blur node with unaligned bounding boxes */
+    __forceinline int isOBBNodeMB() const { return (ptr & (size_t)align_mask) == tyOBBNodeMB; }
+    
+    /*! checks if this is a quantized node */
+    __forceinline int isQuantizedNode() const { return (ptr & (size_t)align_mask) == tyQuantizedNode; }
+
+    /*! Encodes a node */
+    static __forceinline NodeRefPtr encodeNode(AABBNode_t<NodeRefPtr,N>* node) {
+      assert(!((size_t)node & align_mask));
+      return NodeRefPtr((size_t) node);
+    }
+
+    static __forceinline NodeRefPtr encodeNode(AABBNodeMB_t<NodeRefPtr,N>* node) {
+      assert(!((size_t)node & align_mask));
+      return NodeRefPtr((size_t) node | tyAABBNodeMB);
+    }
+
+    static __forceinline NodeRefPtr encodeNode(AABBNodeMB4D_t<NodeRefPtr,N>* node) {
+      assert(!((size_t)node & align_mask));
+      return NodeRefPtr((size_t) node | tyAABBNodeMB4D);
+    }
+
+    /*! Encodes an unaligned node */
+    static __forceinline NodeRefPtr encodeNode(OBBNode_t<NodeRefPtr,N>* node) {
+      return NodeRefPtr((size_t) node | tyOBBNode);
+    }
+
+    /*! Encodes an unaligned motion blur node */
+    static __forceinline NodeRefPtr encodeNode(OBBNodeMB_t<NodeRefPtr,N>* node) {
+      return NodeRefPtr((size_t) node | tyOBBNodeMB);
+    }
+
+    /*! Encodes a leaf */
+    static __forceinline NodeRefPtr encodeLeaf(void* tri, size_t num) {
+      assert(!((size_t)tri & align_mask));
+      assert(num <= maxLeafBlocks);
+      return NodeRefPtr((size_t)tri | (tyLeaf+min(num,(size_t)maxLeafBlocks)));
+    }
+
+    /*! Encodes a leaf */
+    static __forceinline NodeRefPtr encodeTypedLeaf(void* ptr, size_t ty) {
+      assert(!((size_t)ptr & align_mask));
+      return NodeRefPtr((size_t)ptr | (tyLeaf+ty));
+    }
+    
+    /*! returns base node pointer */
+    __forceinline BaseNode_t<NodeRefPtr,N>* baseNode()
+    {
+      assert(!isLeaf());
+      return (BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask);
+    }
+    __forceinline const BaseNode_t<NodeRefPtr,N>* baseNode() const
+    {
+      assert(!isLeaf());
+      return (const BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask);
+    }
+    
+    /*! returns node pointer */
+    __forceinline       AABBNode_t<NodeRefPtr,N>* getAABBNode()       { assert(isAABBNode()); return (      AABBNode_t<NodeRefPtr,N>*)ptr; }
+    __forceinline const AABBNode_t<NodeRefPtr,N>* getAABBNode() const { assert(isAABBNode()); return (const AABBNode_t<NodeRefPtr,N>*)ptr; }
+    
+    /*! returns motion blur node pointer */
+    __forceinline       AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB()       { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (      AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() const { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (const AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns 4D motion blur node pointer */
+    __forceinline       AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D()       { assert(isAABBNodeMB4D()); return (      AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() const { assert(isAABBNodeMB4D()); return (const AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns unaligned node pointer */
+    __forceinline       OBBNode_t<NodeRefPtr,N>* ungetAABBNode()       { assert(isOBBNode()); return (      OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const OBBNode_t<NodeRefPtr,N>* ungetAABBNode() const { assert(isOBBNode()); return (const OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns unaligned motion blur node pointer */
+    __forceinline       OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB()       { assert(isOBBNodeMB()); return (      OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() const { assert(isOBBNodeMB()); return (const OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns quantized node pointer */
+    __forceinline       QuantizedNode_t<NodeRefPtr,N>* quantizedNode()       { assert(isQuantizedNode()); return (      QuantizedNode_t<NodeRefPtr,N>*)(ptr  & ~(size_t)align_mask ); }
+    __forceinline const QuantizedNode_t<NodeRefPtr,N>* quantizedNode() const { assert(isQuantizedNode()); return (const QuantizedNode_t<NodeRefPtr,N>*)(ptr  & ~(size_t)align_mask ); }
+    
+    /*! returns leaf pointer */
+    __forceinline char* leaf(size_t& num) const {
+      assert(isLeaf());
+      num = (ptr & (size_t)items_mask)-tyLeaf;
+      return (char*)(ptr & ~(size_t)align_mask);
+    }
+    
+    /*! clear all bit flags */
+    __forceinline void clearFlags() {
+      ptr &= ~(size_t)align_mask;
+    }
+    
+     /*! returns the wideness */
+    __forceinline size_t getN() const { return N; }
+    
+  public:
+    size_t ptr;
+  };
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_refit.cpp b/thirdparty/embree/kernels/bvh/bvh_refit.cpp
new file mode 100644
index 0000000000..bf5c8538ba
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_refit.cpp
@@ -0,0 +1,247 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_refit.h"
+#include "bvh_statistics.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    static const size_t SINGLE_THREAD_THRESHOLD = 4*1024;
+    
+    template<int N>
+    __forceinline bool compare(const typename BVHN<N>::NodeRef* a, const typename BVHN<N>::NodeRef* b)
+    {
+      size_t sa = *(size_t*)&a->node()->lower_x;
+      size_t sb = *(size_t*)&b->node()->lower_x;
+      return sa < sb;
+    }
+
+    template<int N>
+    BVHNRefitter<N>::BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds)
+      : bvh(bvh), leafBounds(leafBounds), numSubTrees(0)
+    {
+    }
+
+    template<int N>
+    void BVHNRefitter<N>::refit()
+    {
+      if (bvh->numPrimitives <= SINGLE_THREAD_THRESHOLD) {
+        bvh->bounds = LBBox3fa(recurse_bottom(bvh->root));
+      }
+      else
+      {
+        BBox3fa subTreeBounds[MAX_NUM_SUB_TREES];
+        numSubTrees = 0;
+        gather_subtree_refs(bvh->root,numSubTrees,0);
+        if (numSubTrees)
+          parallel_for(size_t(0), numSubTrees, size_t(1), [&](const range<size_t>& r) {
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                NodeRef& ref = subTrees[i];
+                subTreeBounds[i] = recurse_bottom(ref);
+              }
+            });
+
+        numSubTrees = 0;        
+        bvh->bounds = LBBox3fa(refit_toplevel(bvh->root,numSubTrees,subTreeBounds,0));
+      }    
+  }
+
+    template<int N>
+    void BVHNRefitter<N>::gather_subtree_refs(NodeRef& ref,
+                                              size_t &subtrees,
+                                              const size_t depth)
+    {
+      if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) 
+      {
+        assert(subtrees < MAX_NUM_SUB_TREES);
+        subTrees[subtrees++] = ref;
+        return;
+      }
+
+      if (ref.isAABBNode())
+      {
+        AABBNode* node = ref.getAABBNode();
+        for (size_t i=0; i<N; i++) {
+          NodeRef& child = node->child(i);
+          if (unlikely(child == BVH::emptyNode)) continue;
+          gather_subtree_refs(child,subtrees,depth+1); 
+        }
+      }
+    }
+
+    template<int N>
+    BBox3fa BVHNRefitter<N>::refit_toplevel(NodeRef& ref,
+                                            size_t &subtrees,
+											const BBox3fa *const subTreeBounds,
+                                            const size_t depth)
+    {
+      if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) 
+      {
+        assert(subtrees < MAX_NUM_SUB_TREES);
+        assert(subTrees[subtrees] == ref);
+        return subTreeBounds[subtrees++];
+      }
+
+      if (ref.isAABBNode())
+      {
+        AABBNode* node = ref.getAABBNode();
+        BBox3fa bounds[N];
+
+        for (size_t i=0; i<N; i++)
+        {
+          NodeRef& child = node->child(i);
+
+          if (unlikely(child == BVH::emptyNode)) 
+            bounds[i] = BBox3fa(empty);
+          else
+            bounds[i] = refit_toplevel(child,subtrees,subTreeBounds,depth+1); 
+        }
+        
+        BBox3vf<N> boundsT = transpose<N>(bounds);
+      
+        /* set new bounds */
+        node->lower_x = boundsT.lower.x;
+        node->lower_y = boundsT.lower.y;
+        node->lower_z = boundsT.lower.z;
+        node->upper_x = boundsT.upper.x;
+        node->upper_y = boundsT.upper.y;
+        node->upper_z = boundsT.upper.z;
+        
+        return merge<N>(bounds);
+      }
+      else
+        return leafBounds.leafBounds(ref);
+    }
+
+    // =========================================================
+    // =========================================================
+    // =========================================================
+
+    
+    template<int N>
+    BBox3fa BVHNRefitter<N>::recurse_bottom(NodeRef& ref)
+    {
+      /* this is a leaf node */
+      if (unlikely(ref.isLeaf()))
+        return leafBounds.leafBounds(ref);
+      
+      /* recurse if this is an internal node */
+      AABBNode* node = ref.getAABBNode();
+
+      /* enable exclusive prefetch for >= AVX platforms */      
+#if defined(__AVX__)      
+      BVH::prefetchW(ref);
+#endif      
+      BBox3fa bounds[N];
+
+      for (size_t i=0; i<N; i++)
+        if (unlikely(node->child(i) == BVH::emptyNode))
+        {
+          bounds[i] = BBox3fa(empty);          
+        }
+      else
+        bounds[i] = recurse_bottom(node->child(i));
+      
+      /* AOS to SOA transform */
+      BBox3vf<N> boundsT = transpose<N>(bounds);
+      
+      /* set new bounds */
+      node->lower_x = boundsT.lower.x;
+      node->lower_y = boundsT.lower.y;
+      node->lower_z = boundsT.lower.z;
+      node->upper_x = boundsT.upper.x;
+      node->upper_y = boundsT.upper.y;
+      node->upper_z = boundsT.upper.z;
+
+      return merge<N>(bounds);
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    BVHNRefitT<N,Mesh,Primitive>::BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode)
+      : bvh(bvh), builder(builder), refitter(new BVHNRefitter<N>(bvh,*(typename BVHNRefitter<N>::LeafBoundsInterface*)this)), mesh(mesh), topologyVersion(0) {}
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNRefitT<N,Mesh,Primitive>::clear()
+    {
+      if (builder) 
+        builder->clear();
+    }
+    
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNRefitT<N,Mesh,Primitive>::build()
+    {
+      if (mesh->topologyChanged(topologyVersion)) {
+        topologyVersion = mesh->getTopologyVersion();
+        builder->build();
+      }
+      else
+        refitter->refit();
+    }
+
+    template class BVHNRefitter<4>;
+#if defined(__AVX__)
+    template class BVHNRefitter<8>;
+#endif
+    
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH4Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+
+    Builder* BVH4Triangle4MeshRefitSAH  (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4> ((BVH4*)accel,BVH4Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); }
+    Builder* BVH4Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4v>((BVH4*)accel,BVH4Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+    Builder* BVH4Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4i>((BVH4*)accel,BVH4Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#if  defined(__AVX__)
+    Builder* BVH8Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+
+    Builder* BVH8Triangle4MeshRefitSAH  (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4> ((BVH8*)accel,BVH8Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); }
+    Builder* BVH8Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4v>((BVH8*)accel,BVH8Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+    Builder* BVH8Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4i>((BVH8*)accel,BVH8Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,QuadMesh,Quad4v>((BVH4*)accel,BVH4Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+
+#if  defined(__AVX__)
+    Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,QuadMesh,Quad4v>((BVH8*)accel,BVH8Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#endif
+
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,UserGeometry,Object>((BVH4*)accel,BVH4VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+
+#if  defined(__AVX__)
+    Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,UserGeometry,Object>((BVH8*)accel,BVH8VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+    Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,Instance,InstancePrimitive>((BVH4*)accel,BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
+
+#if  defined(__AVX__)
+    Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+    Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,Instance,InstancePrimitive>((BVH8*)accel,BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
+#endif
+#endif
+
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_refit.h b/thirdparty/embree/kernels/bvh/bvh_refit.h
new file mode 100644
index 0000000000..09bb3d8da5
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_refit.h
@@ -0,0 +1,95 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N>
+    class BVHNRefitter
+    {
+    public:
+
+      /*! Type shortcuts */
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+
+      struct LeafBoundsInterface {
+        virtual const BBox3fa leafBounds(NodeRef& ref) const = 0;
+      };
+
+    public:
+    
+      /*! Constructor. */
+      BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds);
+
+      /*! refits the BVH */
+      void refit();
+
+    private:
+      /* single-threaded subtree extraction based on BVH depth */
+      void gather_subtree_refs(NodeRef& ref, 
+                               size_t &subtrees,
+                               const size_t depth = 0);
+
+      /* single-threaded top-level refit */
+      BBox3fa refit_toplevel(NodeRef& ref,
+                             size_t &subtrees,
+							 const BBox3fa *const subTreeBounds,
+                             const size_t depth = 0);
+
+      /* single-threaded subtree refit */
+      BBox3fa recurse_bottom(NodeRef& ref);
+      
+    public:
+      BVH* bvh;                              //!< BVH to refit
+      const LeafBoundsInterface& leafBounds; //!< calculates bounds of leaves
+
+      static const size_t MAX_SUB_TREE_EXTRACTION_DEPTH = (N==4) ? 4   : (N==8) ? 3    : 3;
+      static const size_t MAX_NUM_SUB_TREES             = (N==4) ? 256 : (N==8) ? 512 : N*N*N; // N ^ MAX_SUB_TREE_EXTRACTION_DEPTH
+      size_t numSubTrees;
+      NodeRef subTrees[MAX_NUM_SUB_TREES];
+    };
+
+    template<int N, typename Mesh, typename Primitive>
+    class BVHNRefitT : public Builder, public BVHNRefitter<N>::LeafBoundsInterface
+    {
+    public:
+      
+      /*! Type shortcuts */
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+      
+    public:
+      BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode);
+
+      virtual void build();
+      
+      virtual void clear();
+
+      virtual const BBox3fa leafBounds (NodeRef& ref) const
+      {
+        size_t num; char* prim = ref.leaf(num);
+        if (unlikely(ref == BVH::emptyNode)) return empty;
+
+        BBox3fa bounds = empty;
+        for (size_t i=0; i<num; i++)
+            bounds.extend(((Primitive*)prim)[i].update(mesh));
+        return bounds;
+      }
+      
+    private:
+      BVH* bvh;
+      std::unique_ptr<Builder> builder;
+      std::unique_ptr<BVHNRefitter<N>> refitter;
+      Mesh* mesh;
+      unsigned int topologyVersion;
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_rotate.cpp b/thirdparty/embree/kernels/bvh/bvh_rotate.cpp
new file mode 100644
index 0000000000..460bd60c62
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_rotate.cpp
@@ -0,0 +1,127 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_rotate.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    /*! Computes half surface area of box. */
+    __forceinline float halfArea3f(const BBox<vfloat4>& box) {
+      const vfloat4 d = box.size();
+      const vfloat4 a = d*shuffle<1,2,0,3>(d);
+      return a[0]+a[1]+a[2];
+    }
+    
+    size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth)
+    {
+      /*! nothing to rotate if we reached a leaf node. */
+      if (parentRef.isBarrier()) return 0;
+      if (parentRef.isLeaf()) return 0;
+      AABBNode* parent = parentRef.getAABBNode();
+      
+      /*! rotate all children first */
+      vint4 cdepth;
+      for (size_t c=0; c<4; c++)
+	cdepth[c] = (int)rotate(parent->child(c),depth+1);
+      
+      /* compute current areas of all children */
+      vfloat4 sizeX = parent->upper_x-parent->lower_x;
+      vfloat4 sizeY = parent->upper_y-parent->lower_y;
+      vfloat4 sizeZ = parent->upper_z-parent->lower_z;
+      vfloat4 childArea = madd(sizeX,(sizeY + sizeZ),sizeY*sizeZ);
+      
+      /*! get node bounds */
+      BBox<vfloat4> child1_0,child1_1,child1_2,child1_3;
+      parent->bounds(child1_0,child1_1,child1_2,child1_3);
+      
+      /*! Find best rotation. We pick a first child (child1) and a sub-child 
+	(child2child) of a different second child (child2), and swap child1 
+	and child2child. We perform the best such swap. */
+      float bestArea = 0;
+      size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1;
+      for (size_t c2=0; c2<4; c2++)
+      {
+	/*! ignore leaf nodes as we cannot descent into them */
+	if (parent->child(c2).isBarrier()) continue;
+	if (parent->child(c2).isLeaf()) continue;
+	AABBNode* child2 = parent->child(c2).getAABBNode();
+	
+	/*! transpose child bounds */
+	BBox<vfloat4> child2c0,child2c1,child2c2,child2c3;
+	child2->bounds(child2c0,child2c1,child2c2,child2c3);
+	
+	/*! put child1_0 at each child2 position */
+	float cost00 = halfArea3f(merge(child1_0,child2c1,child2c2,child2c3));
+	float cost01 = halfArea3f(merge(child2c0,child1_0,child2c2,child2c3));
+	float cost02 = halfArea3f(merge(child2c0,child2c1,child1_0,child2c3));
+	float cost03 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_0));
+	vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03);
+	vfloat4 min0 = vreduce_min(cost0);
+	int pos0 = (int)bsf(movemask(min0 == cost0));
+	
+	/*! put child1_1 at each child2 position */
+	float cost10 = halfArea3f(merge(child1_1,child2c1,child2c2,child2c3));
+	float cost11 = halfArea3f(merge(child2c0,child1_1,child2c2,child2c3));
+	float cost12 = halfArea3f(merge(child2c0,child2c1,child1_1,child2c3));
+	float cost13 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_1));
+	vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13);
+	vfloat4 min1 = vreduce_min(cost1);
+	int pos1 = (int)bsf(movemask(min1 == cost1));
+	
+	/*! put child1_2 at each child2 position */
+	float cost20 = halfArea3f(merge(child1_2,child2c1,child2c2,child2c3));
+	float cost21 = halfArea3f(merge(child2c0,child1_2,child2c2,child2c3));
+	float cost22 = halfArea3f(merge(child2c0,child2c1,child1_2,child2c3));
+	float cost23 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_2));
+	vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23);
+	vfloat4 min2 = vreduce_min(cost2);
+	int pos2 = (int)bsf(movemask(min2 == cost2));
+	
+	/*! put child1_3 at each child2 position */
+	float cost30 = halfArea3f(merge(child1_3,child2c1,child2c2,child2c3));
+	float cost31 = halfArea3f(merge(child2c0,child1_3,child2c2,child2c3));
+	float cost32 = halfArea3f(merge(child2c0,child2c1,child1_3,child2c3));
+	float cost33 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_3));
+	vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33);
+	vfloat4 min3 = vreduce_min(cost3);
+	int pos3 = (int)bsf(movemask(min3 == cost3));
+	
+	/*! find best other child */
+	vfloat4 area0123 = vfloat4(extract<0>(min0),extract<0>(min1),extract<0>(min2),extract<0>(min3)) - vfloat4(childArea[c2]);
+	int pos[4] = { pos0,pos1,pos2,pos3 };
+	const size_t mbd = BVH4::maxBuildDepth;
+	vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints
+	valid &= vint4(int(c2)) != vint4(step);
+	if (none(valid)) continue;
+	size_t c1 = select_min(valid,area0123);
+	float area = area0123[c1]; 
+        if (c1 == c2) continue; // can happen if bounds are NANs
+	
+	/*! accept a swap when it reduces cost and is not swapping a node with itself */
+	if (area < bestArea) {
+	  bestArea = area;
+	  bestChild1 = c1;
+	  bestChild2 = c2;
+	  bestChild2Child = pos[c1];
+	}
+      }
+      
+      /*! if we did not find a swap that improves the SAH then do nothing */
+      if (bestChild1 == size_t(-1)) return 1+reduce_max(cdepth);
+      
+      /*! perform the best found tree rotation */
+      AABBNode* child2 = parent->child(bestChild2).getAABBNode();
+      AABBNode::swap(parent,bestChild1,child2,bestChild2Child);
+      parent->setBounds(bestChild2,child2->bounds());
+      AABBNode::compact(parent);
+      AABBNode::compact(child2);
+      
+      /*! This returned depth is conservative as the child that was
+       *  pulled up in the tree could have been on the critical path. */
+      cdepth[bestChild1]++; // bestChild1 was pushed down one level
+      return 1+reduce_max(cdepth); 
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_rotate.h b/thirdparty/embree/kernels/bvh/bvh_rotate.h
new file mode 100644
index 0000000000..61ef64a679
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_rotate.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+
+namespace embree
+{
+  namespace isa 
+  { 
+    template<int N>
+    class BVHNRotate
+    {
+      typedef typename BVHN<N>::NodeRef NodeRef;
+
+    public:
+      static const bool enabled = false;
+
+      static __forceinline size_t rotate(NodeRef parentRef, size_t depth = 1) { return 0; }
+      static __forceinline void restructure(NodeRef ref, size_t depth = 1) {}
+    };
+
+    /* BVH4 tree rotations */
+    template<>
+    class BVHNRotate<4>
+    {
+      typedef BVH4::AABBNode AABBNode;
+      typedef BVH4::NodeRef NodeRef;
+      
+    public:
+      static const bool enabled = true;
+
+      static size_t rotate(NodeRef parentRef, size_t depth = 1);
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
new file mode 100644
index 0000000000..d857ff7d95
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
@@ -0,0 +1,168 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_statistics.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  template<int N>
+  BVHNStatistics<N>::BVHNStatistics (BVH* bvh) : bvh(bvh)
+  {
+    double A = max(0.0f,bvh->getLinearBounds().expectedHalfArea());
+    stat = statistics(bvh->root,A,BBox1f(0.0f,1.0f));
+  }
+  
+  template<int N>
+  std::string BVHNStatistics<N>::str()
+  {
+    std::ostringstream stream;
+    stream.setf(std::ios::fixed, std::ios::floatfield);
+    stream << "  primitives = " << bvh->numPrimitives << ", vertices = " << bvh->numVertices << ", depth = " << stat.depth << std::endl;
+    size_t totalBytes = stat.bytes(bvh);
+    double totalSAH = stat.sah(bvh);
+    stream << "  total            : sah = "  << std::setw(7) << std::setprecision(3) << totalSAH << " (100.00%), ";
+    stream << "#bytes = " << std::setw(7) << std::setprecision(2) << totalBytes/1E6 << " MB (100.00%), ";
+    stream << "#nodes = " << std::setw(7) << stat.size() << " (" << std::setw(6) << std::setprecision(2) << 100.0*stat.fillRate(bvh) << "% filled), ";
+    stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(totalBytes)/double(bvh->numPrimitives) << std::endl;
+    if (stat.statAABBNodes.numNodes    ) stream << "  getAABBNodes     : "  << stat.statAABBNodes.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statOBBNodes.numNodes  ) stream << "  ungetAABBNodes   : "  << stat.statOBBNodes.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statAABBNodesMB.numNodes  ) stream << "  getAABBNodesMB   : "  << stat.statAABBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statAABBNodesMB4D.numNodes) stream << "  getAABBNodesMB4D : "  << stat.statAABBNodesMB4D.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statOBBNodesMB.numNodes) stream << "  ungetAABBNodesMB : "  << stat.statOBBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statQuantizedNodes.numNodes  ) stream << "  quantizedNodes   : "  << stat.statQuantizedNodes.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (true)                               stream << "  leaves           : "  << stat.statLeaf.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (true)                               stream << "    histogram      : "  << stat.statLeaf.histToString() << std::endl;
+    return stream.str();
+  }
+  
+  template<int N>
+  typename BVHNStatistics<N>::Statistics BVHNStatistics<N>::statistics(NodeRef node, const double A, const BBox1f t0t1)
+  {
+    Statistics s;
+    assert(t0t1.size() > 0.0f);
+    double dt = max(0.0f,t0t1.size());
+    if (node.isAABBNode())
+    {
+      AABBNode* n = node.getAABBNode();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extend(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statAABBNodes.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statAABBNodes.numNodes++;
+      s.statAABBNodes.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isOBBNode())
+    {
+      OBBNode* n = node.ungetAABBNode();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extent(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statOBBNodes.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statOBBNodes.numNodes++;
+      s.statOBBNodes.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isAABBNodeMB())
+    {
+      AABBNodeMB* n = node.getAABBNodeMB();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,n->expectedHalfArea(i,t0t1));
+          Statistics s = statistics(n->child(i),Ai,t0t1);
+          s.statAABBNodesMB.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statAABBNodesMB.numNodes++;
+      s.statAABBNodesMB.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isAABBNodeMB4D())
+    {
+      AABBNodeMB4D* n = node.getAABBNodeMB4D();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const BBox1f t0t1i = intersect(t0t1,n->timeRange(i));
+          assert(!t0t1i.empty());
+          const double Ai = n->AABBNodeMB::expectedHalfArea(i,t0t1i);
+          Statistics s =  statistics(n->child(i),Ai,t0t1i);
+          s.statAABBNodesMB4D.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statAABBNodesMB4D.numNodes++;
+      s.statAABBNodesMB4D.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isOBBNodeMB())
+    {
+      OBBNodeMB* n = node.ungetAABBNodeMB();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extent0(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statOBBNodesMB.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statOBBNodesMB.numNodes++;
+      s.statOBBNodesMB.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isQuantizedNode())
+    {
+      QuantizedNode* n = node.quantizedNode();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extent(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statQuantizedNodes.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statQuantizedNodes.numNodes++;
+      s.statQuantizedNodes.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isLeaf())
+    {
+      size_t num; const char* tri = node.leaf(num);
+      if (num)
+      {
+        for (size_t i=0; i<num; i++)
+        {
+          const size_t bytes = bvh->primTy->getBytes(tri);
+          s.statLeaf.numPrimsActive += bvh->primTy->sizeActive(tri);
+          s.statLeaf.numPrimsTotal += bvh->primTy->sizeTotal(tri);
+          s.statLeaf.numBytes += bytes;
+          tri+=bytes;
+        }
+        s.statLeaf.numLeaves++;
+        s.statLeaf.numPrimBlocks += num;
+        s.statLeaf.leafSAH += dt*A*num;
+        if (num-1 < Statistics::LeafStat::NHIST) {
+          s.statLeaf.numPrimBlocksHistogram[num-1]++;
+        }
+      }
+    }
+    else {
+      // -- GODOT start --
+      // throw std::runtime_error("not supported node type in bvh_statistics");
+      abort();
+      // -- GODOT end --
+    }
+    return s;
+  } 
+
+#if defined(__AVX__)
+  template class BVHNStatistics<8>;
+#endif
+
+#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
+  template class BVHNStatistics<4>;
+#endif
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.h b/thirdparty/embree/kernels/bvh/bvh_statistics.h
new file mode 100644
index 0000000000..a28e115f1c
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_statistics.h
@@ -0,0 +1,285 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include <sstream>
+
+namespace embree
+{
+  template<int N>
+  class BVHNStatistics
+  {
+    typedef BVHN<N> BVH;
+    typedef typename BVH::AABBNode AABBNode;
+    typedef typename BVH::OBBNode OBBNode;
+    typedef typename BVH::AABBNodeMB AABBNodeMB;
+    typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+    typedef typename BVH::OBBNodeMB OBBNodeMB;
+    typedef typename BVH::QuantizedNode QuantizedNode;
+
+    typedef typename BVH::NodeRef NodeRef;
+
+    struct Statistics 
+    {
+      template<typename Node>
+        struct NodeStat
+      {
+        NodeStat ( double nodeSAH = 0,
+                   size_t numNodes = 0, 
+                   size_t numChildren = 0)
+        : nodeSAH(nodeSAH),
+          numNodes(numNodes), 
+          numChildren(numChildren) {}
+        
+        double sah(BVH* bvh) const {
+          return nodeSAH/bvh->getLinearBounds().expectedHalfArea();
+        }
+
+        size_t bytes() const {
+          return numNodes*sizeof(Node);
+        }
+
+        size_t size() const {
+          return numNodes;
+        }
+
+        double fillRateNom () const { return double(numChildren);  }
+        double fillRateDen () const { return double(numNodes*N);  }
+        double fillRate    () const { return fillRateNom()/fillRateDen(); }
+
+        __forceinline friend NodeStat operator+ ( const NodeStat& a, const NodeStat& b)
+        {
+          return NodeStat(a.nodeSAH + b.nodeSAH,
+                          a.numNodes+b.numNodes,
+                          a.numChildren+b.numChildren);
+        }
+
+        std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const
+        {
+          std::ostringstream stream;
+          stream.setf(std::ios::fixed, std::ios::floatfield);
+          stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh);
+          stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), ";          
+          stream << "#bytes = " << std::setw(7) << std::setprecision(2) << bytes()/1E6  << " MB ";
+          stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes())/double(bytesTotal) << "%), ";
+          stream << "#nodes = " << std::setw(7) << numNodes << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate() << "% filled), ";
+          stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes())/double(bvh->numPrimitives);
+          return stream.str();
+        }
+
+      public:
+        double nodeSAH;
+        size_t numNodes;
+        size_t numChildren;
+      };
+
+      struct LeafStat
+      {
+        static const int NHIST = 8;
+
+        LeafStat ( double leafSAH = 0.0f, 
+                   size_t numLeaves = 0,
+                   size_t numPrimsActive = 0,
+                   size_t numPrimsTotal = 0,
+                   size_t numPrimBlocks = 0,
+                   size_t numBytes = 0)
+        : leafSAH(leafSAH),
+          numLeaves(numLeaves),
+          numPrimsActive(numPrimsActive),
+          numPrimsTotal(numPrimsTotal),
+          numPrimBlocks(numPrimBlocks),
+          numBytes(numBytes)
+        {
+          for (size_t i=0; i<NHIST; i++)
+            numPrimBlocksHistogram[i] = 0;
+        }
+
+        double sah(BVH* bvh) const {
+          return leafSAH/bvh->getLinearBounds().expectedHalfArea();
+        }
+
+        size_t bytes(BVH* bvh) const {
+          return numBytes;
+        }
+
+        size_t size() const {
+          return numLeaves;
+        }
+
+        double fillRateNom (BVH* bvh) const { return double(numPrimsActive);  }
+        double fillRateDen (BVH* bvh) const { return double(numPrimsTotal);  }
+        double fillRate    (BVH* bvh) const { return fillRateNom(bvh)/fillRateDen(bvh); }
+
+        __forceinline friend LeafStat operator+ ( const LeafStat& a, const LeafStat& b)
+        {
+          LeafStat stat(a.leafSAH + b.leafSAH,
+                        a.numLeaves+b.numLeaves,
+                        a.numPrimsActive+b.numPrimsActive,
+                        a.numPrimsTotal+b.numPrimsTotal,
+                        a.numPrimBlocks+b.numPrimBlocks,
+                        a.numBytes+b.numBytes);
+          for (size_t i=0; i<NHIST; i++) {
+            stat.numPrimBlocksHistogram[i] += a.numPrimBlocksHistogram[i];
+            stat.numPrimBlocksHistogram[i] += b.numPrimBlocksHistogram[i];
+          }
+          return stat;
+        }
+
+        std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const
+        {
+          std::ostringstream stream;
+          stream.setf(std::ios::fixed, std::ios::floatfield);
+          stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh);
+          stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), ";
+          stream << "#bytes = " << std::setw(7) << std::setprecision(2) << double(bytes(bvh))/1E6  << " MB ";
+          stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes(bvh))/double(bytesTotal) << "%), ";
+          stream << "#nodes = " << std::setw(7) << numLeaves << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate(bvh) << "% filled), ";
+          stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes(bvh))/double(bvh->numPrimitives);
+          return stream.str();
+        }
+
+        std::string histToString() const
+        {
+          std::ostringstream stream;
+          stream.setf(std::ios::fixed, std::ios::floatfield);
+          for (size_t i=0; i<NHIST; i++)
+            stream << std::setw(6) << std::setprecision(2) << 100.0f*float(numPrimBlocksHistogram[i])/float(numLeaves) << "% ";
+          return stream.str();
+        }
+     
+      public:
+        double leafSAH;                    //!< SAH of the leaves only
+        size_t numLeaves;                  //!< Number of leaf nodes.
+        size_t numPrimsActive;             //!< Number of active primitives (
+        size_t numPrimsTotal;              //!< Number of active and inactive primitives
+        size_t numPrimBlocks;              //!< Number of primitive blocks.
+        size_t numBytes;                   //!< Number of bytes of leaves.
+        size_t numPrimBlocksHistogram[8];
+      };
+
+    public:
+      Statistics (size_t depth = 0,
+                  LeafStat statLeaf = LeafStat(),
+                  NodeStat<AABBNode> statAABBNodes = NodeStat<AABBNode>(),
+                  NodeStat<OBBNode> statOBBNodes = NodeStat<OBBNode>(),
+                  NodeStat<AABBNodeMB> statAABBNodesMB = NodeStat<AABBNodeMB>(),
+                  NodeStat<AABBNodeMB4D> statAABBNodesMB4D = NodeStat<AABBNodeMB4D>(),
+                  NodeStat<OBBNodeMB> statOBBNodesMB = NodeStat<OBBNodeMB>(),
+                  NodeStat<QuantizedNode> statQuantizedNodes = NodeStat<QuantizedNode>())
+
+      : depth(depth), 
+        statLeaf(statLeaf),
+        statAABBNodes(statAABBNodes),
+        statOBBNodes(statOBBNodes),
+        statAABBNodesMB(statAABBNodesMB),
+        statAABBNodesMB4D(statAABBNodesMB4D),
+        statOBBNodesMB(statOBBNodesMB),
+        statQuantizedNodes(statQuantizedNodes) {}
+
+      double sah(BVH* bvh) const 
+      {
+        return statLeaf.sah(bvh) +
+          statAABBNodes.sah(bvh) + 
+          statOBBNodes.sah(bvh) + 
+          statAABBNodesMB.sah(bvh) + 
+          statAABBNodesMB4D.sah(bvh) + 
+          statOBBNodesMB.sah(bvh) + 
+          statQuantizedNodes.sah(bvh);
+      }
+      
+      size_t bytes(BVH* bvh) const {
+        return statLeaf.bytes(bvh) +
+          statAABBNodes.bytes() + 
+          statOBBNodes.bytes() + 
+          statAABBNodesMB.bytes() + 
+          statAABBNodesMB4D.bytes() + 
+          statOBBNodesMB.bytes() + 
+          statQuantizedNodes.bytes();
+      }
+
+      size_t size() const 
+      {
+        return statLeaf.size() +
+          statAABBNodes.size() + 
+          statOBBNodes.size() + 
+          statAABBNodesMB.size() + 
+          statAABBNodesMB4D.size() + 
+          statOBBNodesMB.size() + 
+          statQuantizedNodes.size();
+      }
+
+      double fillRate (BVH* bvh) const 
+      {
+        double nom = statLeaf.fillRateNom(bvh) +
+          statAABBNodes.fillRateNom() + 
+          statOBBNodes.fillRateNom() + 
+          statAABBNodesMB.fillRateNom() + 
+          statAABBNodesMB4D.fillRateNom() + 
+          statOBBNodesMB.fillRateNom() + 
+          statQuantizedNodes.fillRateNom();
+        double den = statLeaf.fillRateDen(bvh) +
+          statAABBNodes.fillRateDen() + 
+          statOBBNodes.fillRateDen() + 
+          statAABBNodesMB.fillRateDen() + 
+          statAABBNodesMB4D.fillRateDen() + 
+          statOBBNodesMB.fillRateDen() + 
+          statQuantizedNodes.fillRateDen();
+        return nom/den;
+      }
+
+      friend Statistics operator+ ( const Statistics& a, const Statistics& b )
+      {
+        return Statistics(max(a.depth,b.depth),
+                          a.statLeaf + b.statLeaf,
+                          a.statAABBNodes + b.statAABBNodes,
+                          a.statOBBNodes + b.statOBBNodes,
+                          a.statAABBNodesMB + b.statAABBNodesMB,
+                          a.statAABBNodesMB4D + b.statAABBNodesMB4D,
+                          a.statOBBNodesMB + b.statOBBNodesMB,
+                          a.statQuantizedNodes + b.statQuantizedNodes);
+      }
+
+      static Statistics add ( const Statistics& a, const Statistics& b ) {
+        return a+b;
+      }
+
+    public:
+      size_t depth;
+      LeafStat statLeaf;
+      NodeStat<AABBNode> statAABBNodes;
+      NodeStat<OBBNode> statOBBNodes;
+      NodeStat<AABBNodeMB> statAABBNodesMB;
+      NodeStat<AABBNodeMB4D> statAABBNodesMB4D;
+      NodeStat<OBBNodeMB> statOBBNodesMB;
+      NodeStat<QuantizedNode> statQuantizedNodes;
+    };
+
+  public:
+
+    /* Constructor gathers statistics. */
+    BVHNStatistics (BVH* bvh);
+
+    /*! Convert statistics into a string */
+    std::string str();
+
+    double sah() const { 
+      return stat.sah(bvh); 
+    }
+
+    size_t bytesUsed() const {
+      return stat.bytes(bvh);
+    }
+
+  private:
+    Statistics statistics(NodeRef node, const double A, const BBox1f dt);
+
+  private:
+    BVH* bvh;
+    Statistics stat;
+  };
+
+  typedef BVHNStatistics<4> BVH4Statistics;
+  typedef BVHNStatistics<8> BVH8Statistics;
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_traverser1.h b/thirdparty/embree/kernels/bvh/bvh_traverser1.h
new file mode 100644
index 0000000000..8ce01b57f5
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_traverser1.h
@@ -0,0 +1,466 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "node_intersector1.h"
+#include "../common/stack_item.h"
+
+#define NEW_SORTING_CODE 1
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! BVH regular node traversal for single rays. */
+    template<int N, int types>
+    class BVHNNodeTraverser1Hit;
+
+#if defined(__AVX512VL__) // SKX
+
+    template<int N>
+    __forceinline void isort_update(vint<N> &dist, const vint<N> &d)
+    {
+      const vint<N> dist_shift = align_shift_right<N-1>(dist,dist);
+      const vboolf<N> m_geq = d >= dist;
+      const vboolf<N> m_geq_shift = m_geq << 1;
+      dist = select(m_geq,d,dist);
+      dist = select(m_geq_shift,dist_shift,dist);
+    }
+
+    template<int N>
+    __forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) {
+      dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero)));
+    }
+
+    __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) {
+      return toScalar(permutex2var((__m256i)index,n0,n1));
+    }
+
+    __forceinline float permuteExtract(const vint8& index, const vfloat8& n) {
+      return toScalar(permute(n,index));
+    }
+
+#endif
+
+    /* Specialization for BVH4. */
+    template<int types>
+    class BVHNNodeTraverser1Hit<4, types>
+    {
+      typedef BVH4 BVH;
+      typedef BVH4::NodeRef NodeRef;
+      typedef BVH4::BaseNode BaseNode;
+
+
+    public:
+      /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */
+      static __forceinline void traverseClosestHit(NodeRef& cur,
+                                                   size_t mask,
+                                                   const vfloat4& tNear,
+                                                   StackItemT<NodeRef>*& stackPtr,
+                                                   StackItemT<NodeRef>* stackEnd)
+      {
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r);
+        BVH::prefetch(cur,types);
+        if (likely(mask == 0)) {
+          assert(cur != BVH::emptyNode);
+          return;
+        }
+
+        /*! two children are hit, push far child, and continue with closer child */
+        NodeRef c0 = cur;
+        const unsigned int d0 = ((unsigned int*)&tNear)[r];
+        r = bscf(mask);
+        NodeRef c1 = node->child(r);
+        BVH::prefetch(c1,types);
+        const unsigned int d1 = ((unsigned int*)&tNear)[r];
+        assert(c0 != BVH::emptyNode);
+        assert(c1 != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          assert(stackPtr < stackEnd);
+          if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
+          else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
+        }
+
+#if NEW_SORTING_CODE == 1
+        vint4 s0((size_t)c0,(size_t)d0);
+        vint4 s1((size_t)c1,(size_t)d1);
+        r = bscf(mask);
+        NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; 
+        vint4 s2((size_t)c2,(size_t)d2);
+        /* 3 hits */
+        if (likely(mask == 0)) {
+          StackItemT<NodeRef>::sort3(s0,s1,s2);
+          *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;
+          cur = toSizeT(s2);
+          stackPtr+=2;
+          return;
+        }
+        r = bscf(mask);
+        NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; 
+        vint4 s3((size_t)c3,(size_t)d3);
+        /* 4 hits */
+        StackItemT<NodeRef>::sort4(s0,s1,s2,s3);
+        *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
+        cur = toSizeT(s3);
+        stackPtr+=3;
+#else
+        /*! Here starts the slow path for 3 or 4 hit children. We push
+         *  all nodes onto the stack to sort them there. */
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
+
+        /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
+          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+          return;
+        }
+
+        /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
+        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#endif
+      }
+
+      /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */
+      static __forceinline void traverseAnyHit(NodeRef& cur,
+                                               size_t mask,
+                                               const vfloat4& tNear,
+                                               NodeRef*& stackPtr,
+                                               NodeRef* stackEnd)
+      {
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r); 
+        BVH::prefetch(cur,types);
+
+        /* simpler in sequence traversal order */
+        assert(cur != BVH::emptyNode);
+        if (likely(mask == 0)) return;
+        assert(stackPtr < stackEnd);
+        *stackPtr = cur; stackPtr++;
+
+        for (; ;)
+        {
+          r = bscf(mask);
+          cur = node->child(r); BVH::prefetch(cur,types);
+          assert(cur != BVH::emptyNode);
+          if (likely(mask == 0)) return;
+          assert(stackPtr < stackEnd);
+          *stackPtr = cur; stackPtr++;
+        }
+      }
+    };
+
+    /* Specialization for BVH8. */
+    template<int types>
+    class BVHNNodeTraverser1Hit<8, types>
+    {
+      typedef BVH8 BVH;
+      typedef BVH8::NodeRef NodeRef;
+      typedef BVH8::BaseNode BaseNode;
+      
+#if defined(__AVX512VL__)
+      template<class NodeRef, class BaseNode>
+        static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur,
+                                                              size_t mask,
+                                                              const vfloat8& tNear,
+                                                              StackItemT<NodeRef>*& stackPtr,
+                                                              StackItemT<NodeRef>* stackEnd)
+      {
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+        const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]);
+        const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]);
+        vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step);
+        distance_i = vint8::compact((int)mask,distance_i,distance_i);
+        cur = permuteExtract(distance_i,n0,n1);
+        BVH::prefetch(cur,types);
+
+        mask &= mask-1;
+        if (likely(mask == 0)) return;
+
+        /* 2 hits: order A0 B0 */
+        const vint8 d0(distance_i);
+        const vint8 d1(shuffle<1>(distance_i));
+        cur = permuteExtract(d1,n0,n1);
+        BVH::prefetch(cur,types);
+
+        const vint8 dist_A0 = min(d0, d1);
+        const vint8 dist_B0 = max(d0, d1);
+        assert(dist_A0[0] < dist_B0[0]);
+
+        mask &= mask-1;
+        if (likely(mask == 0)) {
+          cur                        = permuteExtract(dist_A0,n0,n1);
+          stackPtr[0].ptr            = permuteExtract(dist_B0,n0,n1);
+          *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear);
+          stackPtr++;
+          return;
+        }
+
+        /* 3 hits: order A1 B1 C1 */
+
+        const vint8 d2(shuffle<2>(distance_i));
+        cur = permuteExtract(d2,n0,n1);
+        BVH::prefetch(cur,types);
+
+        const vint8 dist_A1     = min(dist_A0,d2);
+        const vint8 dist_tmp_B1 = max(dist_A0,d2);
+        const vint8 dist_B1     = min(dist_B0,dist_tmp_B1);
+        const vint8 dist_C1     = max(dist_B0,dist_tmp_B1);
+        assert(dist_A1[0] < dist_B1[0]);
+        assert(dist_B1[0] < dist_C1[0]);
+
+        mask &= mask-1;
+        if (likely(mask == 0)) {
+          cur                        = permuteExtract(dist_A1,n0,n1);
+          stackPtr[0].ptr            = permuteExtract(dist_C1,n0,n1);
+          *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear);
+          stackPtr[1].ptr            = permuteExtract(dist_B1,n0,n1);
+          *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear);
+          stackPtr+=2;
+          return;
+        }
+
+        /* 4 hits: order A2 B2 C2 D2 */
+
+        const vint8 d3(shuffle<3>(distance_i));
+        cur = permuteExtract(d3,n0,n1);
+        BVH::prefetch(cur,types);
+
+        const vint8 dist_A2     = min(dist_A1,d3);
+        const vint8 dist_tmp_B2 = max(dist_A1,d3);
+        const vint8 dist_B2     = min(dist_B1,dist_tmp_B2);
+        const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2);
+        const vint8 dist_C2     = min(dist_C1,dist_tmp_C2);
+        const vint8 dist_D2     = max(dist_C1,dist_tmp_C2);
+        assert(dist_A2[0] < dist_B2[0]);
+        assert(dist_B2[0] < dist_C2[0]);
+        assert(dist_C2[0] < dist_D2[0]);
+        
+        mask &= mask-1;
+        if (likely(mask == 0)) {
+          cur                        = permuteExtract(dist_A2,n0,n1);
+          stackPtr[0].ptr            = permuteExtract(dist_D2,n0,n1);
+          *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear);
+          stackPtr[1].ptr            = permuteExtract(dist_C2,n0,n1);
+          *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear);
+          stackPtr[2].ptr            = permuteExtract(dist_B2,n0,n1);
+          *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear);
+          stackPtr+=3;
+          return;
+        }
+
+        /* >=5 hits: reverse to descending order for writing to stack */
+
+        distance_i = align_shift_right<3>(distance_i,distance_i);
+        const size_t hits = 4 + popcnt(mask);
+        vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert
+	
+        isort_quick_update<8>(dist,dist_A2);
+        isort_quick_update<8>(dist,dist_B2);
+        isort_quick_update<8>(dist,dist_C2);
+        isort_quick_update<8>(dist,dist_D2);
+
+        do {
+
+          distance_i = align_shift_right<1>(distance_i,distance_i);
+          cur = permuteExtract(distance_i,n0,n1);
+          BVH::prefetch(cur,types);
+          const vint8 new_dist(permute(distance_i,vint8(zero)));
+          mask &= mask-1;
+          isort_update<8>(dist,new_dist);
+
+        } while(mask);
+
+        for (size_t i=0; i<7; i++)
+          assert(dist[i+0]>=dist[i+1]);
+
+        for (size_t i=0;i<hits-1;i++)
+        {
+          stackPtr->ptr            = permuteExtract(dist,n0,n1);
+          *(float*)&stackPtr->dist = permuteExtract(dist,tNear);
+          dist = align_shift_right<1>(dist,dist);
+          stackPtr++;
+        }
+        cur = permuteExtract(dist,n0,n1);
+      }
+#endif
+
+    public:
+      static __forceinline void traverseClosestHit(NodeRef& cur,
+                                                   size_t mask,
+                                                   const vfloat8& tNear,
+                                                   StackItemT<NodeRef>*& stackPtr,
+                                                   StackItemT<NodeRef>* stackEnd)
+      {
+        assert(mask != 0);
+#if defined(__AVX512VL__)
+        traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#else
+
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r);
+        BVH::prefetch(cur,types);
+        if (likely(mask == 0)) {
+          assert(cur != BVH::emptyNode);
+          return;
+        }
+
+        /*! two children are hit, push far child, and continue with closer child */
+        NodeRef c0 = cur;
+        const unsigned int d0 = ((unsigned int*)&tNear)[r];
+        r = bscf(mask);
+        NodeRef c1 = node->child(r);
+        BVH::prefetch(c1,types);
+        const unsigned int d1 = ((unsigned int*)&tNear)[r];
+
+        assert(c0 != BVH::emptyNode);
+        assert(c1 != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          assert(stackPtr < stackEnd);
+          if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
+          else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
+        }
+#if NEW_SORTING_CODE == 1
+        vint4 s0((size_t)c0,(size_t)d0);
+        vint4 s1((size_t)c1,(size_t)d1);
+
+        r = bscf(mask);
+        NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; 
+        vint4 s2((size_t)c2,(size_t)d2);
+        /* 3 hits */
+        if (likely(mask == 0)) {
+          StackItemT<NodeRef>::sort3(s0,s1,s2);
+          *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;
+          cur = toSizeT(s2);
+          stackPtr+=2;
+          return;
+        }
+        r = bscf(mask);
+        NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; 
+        vint4 s3((size_t)c3,(size_t)d3);
+        /* 4 hits */
+        if (likely(mask == 0)) {
+          StackItemT<NodeRef>::sort4(s0,s1,s2,s3);
+          *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
+          cur = toSizeT(s3);
+          stackPtr+=3;
+          return;
+        }
+        *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3;
+        /*! fallback case if more than 4 children are hit */
+        StackItemT<NodeRef>* stackFirst = stackPtr;
+        stackPtr+=4;      
+        while (1)
+        {
+          assert(stackPtr < stackEnd);
+          r = bscf(mask);
+          NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r]; 
+          const vint4 s((size_t)c,(size_t)d);
+          *(vint4*)stackPtr++ = s;
+          assert(c != BVH::emptyNode);
+          if (unlikely(mask == 0)) break;
+        }
+        sort(stackFirst,stackPtr);
+        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#else
+        /*! Here starts the slow path for 3 or 4 hit children. We push
+         *  all nodes onto the stack to sort them there. */
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
+
+        /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
+          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+          return;
+        }
+
+        /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
+          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+          return;
+        }
+        /*! fallback case if more than 4 children are hit */
+        StackItemT<NodeRef>* stackFirst = stackPtr-4;
+        while (1)
+        {
+          assert(stackPtr < stackEnd);
+          r = bscf(mask);
+          c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+          assert(c != BVH::emptyNode);
+          if (unlikely(mask == 0)) break;
+        }
+        sort(stackFirst,stackPtr);
+        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#endif
+#endif
+      }
+
+      static __forceinline void traverseAnyHit(NodeRef& cur,
+                                               size_t mask,
+                                               const vfloat8& tNear,
+                                               NodeRef*& stackPtr,
+                                               NodeRef* stackEnd)
+      {
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r);
+        BVH::prefetch(cur,types);
+
+        /* simpler in sequence traversal order */
+        assert(cur != BVH::emptyNode);
+        if (likely(mask == 0)) return;
+        assert(stackPtr < stackEnd);
+        *stackPtr = cur; stackPtr++;
+
+        for (; ;)
+        {
+          r = bscf(mask);
+          cur = node->child(r); BVH::prefetch(cur,types);
+          assert(cur != BVH::emptyNode);
+          if (likely(mask == 0)) return;
+          assert(stackPtr < stackEnd);
+          *stackPtr = cur; stackPtr++;
+        }
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h
new file mode 100644
index 0000000000..852981e69d
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h
@@ -0,0 +1,149 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/stack_item.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, int types>
+    class BVHNNodeTraverserStreamHitCoherent
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::BaseNode BaseNode;
+
+    public:
+      template<class T>
+      static __forceinline void traverseClosestHit(NodeRef& cur,
+                                                   size_t& m_trav_active,
+                                                   const vbool<N>& vmask,
+                                                   const vfloat<N>& tNear,
+                                                   const T* const tMask,
+                                                   StackItemMaskCoherent*& stackPtr)
+      {
+        const NodeRef parent = cur;
+        size_t mask = movemask(vmask);
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        const size_t r0 = bscf(mask);
+        assert(r0 < 8);
+        cur = node->child(r0);
+        BVHN<N>::prefetch(cur,types);
+        m_trav_active = tMask[r0];
+        assert(cur != BVH::emptyNode);
+        if (unlikely(mask == 0)) return;
+
+        const unsigned int* const tNear_i = (unsigned int*)&tNear;
+
+        /*! two children are hit, push far child, and continue with closer child */
+        NodeRef c0 = cur;
+        unsigned int d0 = tNear_i[r0];
+        const size_t r1 = bscf(mask);
+        assert(r1 < 8);
+        NodeRef c1 = node->child(r1);
+        BVHN<N>::prefetch(c1,types);
+        unsigned int d1 = tNear_i[r1];
+
+        assert(c0 != BVH::emptyNode);
+        assert(c1 != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          if (d0 < d1) {
+            assert(tNear[r1] >= 0.0f);
+            stackPtr->mask    = tMask[r1];
+            stackPtr->parent  = parent;
+            stackPtr->child   = c1;
+            stackPtr++;
+            cur = c0;
+            m_trav_active = tMask[r0];
+            return;
+          }
+          else {
+            assert(tNear[r0] >= 0.0f);
+            stackPtr->mask    = tMask[r0];
+            stackPtr->parent  = parent;
+            stackPtr->child   = c0;
+            stackPtr++;
+            cur = c1;
+            m_trav_active = tMask[r1];
+            return;
+          }
+        }
+
+        /*! slow path for more than two hits */
+        size_t hits = movemask(vmask);
+        const vint<N> dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint<N>(step), 0);
+        const vint<N> dist_i_sorted = usort_descending(dist_i);
+        const vint<N> sorted_index = dist_i_sorted & 7;
+
+        size_t i = 0;
+        for (;;)
+        {
+          const unsigned int index = sorted_index[i];
+          assert(index < 8);
+          cur = node->child(index);
+          m_trav_active = tMask[index];
+          assert(m_trav_active);
+          BVHN<N>::prefetch(cur,types);
+          bscf(hits);
+          if (unlikely(hits==0)) break;
+          i++;
+          assert(cur != BVH::emptyNode);
+          assert(tNear[index] >= 0.0f);
+          stackPtr->mask    = m_trav_active;
+          stackPtr->parent  = parent;
+          stackPtr->child   = cur;
+          stackPtr++;
+        }
+      }
+
+      template<class T>
+      static __forceinline void traverseAnyHit(NodeRef& cur,
+                                               size_t& m_trav_active,
+                                               const vbool<N>& vmask,
+                                               const T* const tMask,
+                                               StackItemMaskCoherent*& stackPtr)
+      {
+        const NodeRef parent = cur;
+        size_t mask = movemask(vmask);
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r);
+        BVHN<N>::prefetch(cur,types);
+        m_trav_active = tMask[r];
+
+        /* simple in order sequence */
+        assert(cur != BVH::emptyNode);
+        if (likely(mask == 0)) return;
+        stackPtr->mask    = m_trav_active;
+        stackPtr->parent  = parent;
+        stackPtr->child   = cur;
+        stackPtr++;
+
+        for (; ;)
+        {
+          r = bscf(mask);
+          cur = node->child(r);
+          BVHN<N>::prefetch(cur,types);
+          m_trav_active = tMask[r];
+          assert(cur != BVH::emptyNode);
+          if (likely(mask == 0)) return;
+          stackPtr->mask    = m_trav_active;
+          stackPtr->parent  = parent;
+          stackPtr->child   = cur;
+          stackPtr++;
+        }
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/node_intersector.h b/thirdparty/embree/kernels/bvh/node_intersector.h
new file mode 100644
index 0000000000..25edaf295d
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/node_intersector.h
@@ -0,0 +1,31 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct NearFarPrecalculations
+    {
+      size_t nearX, nearY, nearZ;
+      size_t farX, farY, farZ;
+
+      __forceinline NearFarPrecalculations() {}
+
+      __forceinline NearFarPrecalculations(const Vec3fa& dir, size_t N)
+      {
+        const size_t size = sizeof(float)*N;
+        nearX = (dir.x < 0.0f) ? 1*size : 0*size;
+        nearY = (dir.y < 0.0f) ? 3*size : 2*size;
+        nearZ = (dir.z < 0.0f) ? 5*size : 4*size;
+        farX  = nearX ^ size;
+        farY  = nearY ^ size;
+        farZ  = nearZ ^ size;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/node_intersector1.h b/thirdparty/embree/kernels/bvh/node_intersector1.h
new file mode 100644
index 0000000000..1ec4fc63fc
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/node_intersector1.h
@@ -0,0 +1,1403 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Ray structure used in single-ray traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, bool robust>
+      struct TravRayBase;
+      
+    /* Base (without tnear and tfar) */
+    template<int N>
+      struct TravRayBase<N,false>
+    {
+      __forceinline TravRayBase() {}
+
+      __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir)
+        : org_xyz(ray_org), dir_xyz(ray_dir) 
+      {
+        const Vec3fa ray_rdir = rcp_safe(ray_dir);
+        org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
+        dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
+        rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z);
+#if defined(__AVX2__) || defined(__ARM_NEON)
+        const Vec3fa ray_org_rdir = ray_org*ray_rdir;
+        org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
+#endif
+        nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
+        nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
+        nearZ = ray_rdir.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
+        farX  = nearX ^ sizeof(vfloat<N>);
+        farY  = nearY ^ sizeof(vfloat<N>);
+        farZ  = nearZ ^ sizeof(vfloat<N>);
+      }
+
+      template<int K>
+      __forceinline void init(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir,
+                              const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+                              size_t flip = sizeof(vfloat<N>))
+      {
+        org  = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
+        dir  = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
+        rdir = Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+#if defined(__AVX2__) || defined(__ARM_NEON)
+	org_rdir = org*rdir;
+#endif
+	nearX = nearXYZ.x[k];
+	nearY = nearXYZ.y[k];
+	nearZ = nearXYZ.z[k];
+        farX  = nearX ^ flip;
+        farY  = nearY ^ flip;
+        farZ  = nearZ ^ flip;
+      }
+
+      Vec3fa org_xyz, dir_xyz;
+      Vec3vf<N> org, dir, rdir;
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      Vec3vf<N> org_rdir;
+#endif
+      size_t nearX, nearY, nearZ;
+      size_t farX, farY, farZ;
+    };
+
+    /* Base (without tnear and tfar) */
+    template<int N>
+      struct TravRayBase<N,true>
+    {
+      __forceinline TravRayBase() {}
+
+      __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir)
+        : org_xyz(ray_org), dir_xyz(ray_dir) 
+      {
+        const float round_down = 1.0f-3.0f*float(ulp);
+        const float round_up   = 1.0f+3.0f*float(ulp);
+        const Vec3fa ray_rdir = 1.0f/zero_fix(ray_dir);
+        const Vec3fa ray_rdir_near = round_down*ray_rdir;
+        const Vec3fa ray_rdir_far  = round_up  *ray_rdir;
+        org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
+        dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
+        rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z);
+        rdir_far  = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z);
+
+        nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
+        nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
+        nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
+        farX  = nearX ^ sizeof(vfloat<N>);
+        farY  = nearY ^ sizeof(vfloat<N>);
+        farZ  = nearZ ^ sizeof(vfloat<N>);
+      }
+
+      template<int K>
+      __forceinline void init(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir,
+                              const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+                              size_t flip = sizeof(vfloat<N>))
+      {
+        const vfloat<N> round_down = 1.0f-3.0f*float(ulp);
+        const vfloat<N> round_up   = 1.0f+3.0f*float(ulp);
+        org  = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
+        dir  = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
+        rdir_near = round_down*Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+        rdir_far  = round_up  *Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+
+	nearX = nearXYZ.x[k];
+	nearY = nearXYZ.y[k];
+	nearZ = nearXYZ.z[k];
+        farX  = nearX ^ flip;
+        farY  = nearY ^ flip;
+        farZ  = nearZ ^ flip;
+      }
+
+      Vec3fa org_xyz, dir_xyz;
+      Vec3vf<N> org, dir, rdir_near, rdir_far;
+      size_t nearX, nearY, nearZ;
+      size_t farX, farY, farZ;
+    };
+
+    /* Full (with tnear and tfar) */
+    template<int N, bool robust>
+      struct TravRay : TravRayBase<N,robust>
+    {
+      __forceinline TravRay() {}
+
+      __forceinline TravRay(const Vec3fa& ray_org, const Vec3fa& ray_dir, float ray_tnear, float ray_tfar)
+        : TravRayBase<N,robust>(ray_org, ray_dir),
+          tnear(ray_tnear), tfar(ray_tfar) {}
+
+      template<int K>
+      __forceinline void init(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir,
+                              const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+                              float ray_tnear, float ray_tfar,
+                              size_t flip = sizeof(vfloat<N>))
+      {
+        TravRayBase<N,robust>::template init<K>(k, ray_org, ray_dir, ray_rdir, nearXYZ, flip);
+        tnear = ray_tnear; tfar = ray_tfar;
+      }
+
+      vfloat<N> tnear;
+      vfloat<N> tfar;
+    };
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Point Query structure used in single-ray traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+    struct TravPointQuery
+    {
+      __forceinline TravPointQuery() {}
+
+      __forceinline TravPointQuery(const Vec3fa& query_org, const Vec3fa& query_rad)
+      {
+        org = Vec3vf<N>(query_org.x, query_org.y, query_org.z);
+        rad = Vec3vf<N>(query_rad.x, query_rad.y, query_rad.z);
+      }
+
+      __forceinline vfloat<N> const& tfar() const {
+        return rad.x;
+      }
+
+      Vec3vf<N> org, rad;
+    };
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // point query
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+    __forceinline size_t pointQuerySphereDistAndMask(
+      const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, 
+      vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ)
+    {
+      const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x;
+      const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y;
+      const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z;
+      dist = vX * vX + vY * vY + vZ * vZ;
+      const vbool<N> vmask = dist <= query.tfar()*query.tfar();
+      const vbool<N> valid = minX <= maxX;
+      return movemask(vmask) & movemask(valid);
+    }
+
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    {
+      const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x));
+      const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y));
+      const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z));
+      const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x));
+      const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y));
+      const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z));
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x);
+      const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y);
+      const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z);
+      const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x);
+      const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y);
+      const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z);
+      const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0]));
+      const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0]));
+      const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0]));
+      const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0]));
+      const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0]));
+      const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0]));
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+      __forceinline size_t pointQueryNodeSphereMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+      size_t mask = pointQueryNodeSphere(node, query, time, dist);
+
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t);
+        mask &= movemask(vmask);
+      }
+
+      return mask;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    {
+      const vfloat<N> start_x(node->start.x);
+      const vfloat<N> scale_x(node->scale.x);
+      const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+      const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+      const vfloat<N> start_y(node->start.y);
+      const vfloat<N> scale_y(node->scale.y);
+      const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> start_z(node->start.z);
+      const vfloat<N> scale_z(node->scale.z);
+      const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask());
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      const vfloat<N> minX = node->dequantizeLowerX(time);
+      const vfloat<N> maxX = node->dequantizeUpperX(time);
+      const vfloat<N> minY = node->dequantizeLowerY(time);
+      const vfloat<N> maxY = node->dequantizeUpperY(time);
+      const vfloat<N> minZ = node->dequantizeLowerZ(time);
+      const vfloat<N> maxZ = node->dequantizeUpperZ(time);     
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask());
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    {
+      // TODO: point query - implement
+      const vbool<N> vmask = vbool<N>(true);
+      const size_t mask = movemask(vmask) & ((1<<N)-1);
+      dist = vfloat<N>(0.0f);
+      return mask;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      // TODO: point query - implement
+      const vbool<N> vmask = vbool<N>(true);
+      const size_t mask = movemask(vmask) & ((1<<N)-1);
+      dist = vfloat<N>(0.0f);
+      return mask;
+    }
+
+    template<int N>
+    __forceinline size_t pointQueryAABBDistAndMask(
+      const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, 
+      vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ)
+    {
+      const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x;
+      const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y;
+      const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z;
+      dist = vX * vX + vY * vY + vZ * vZ;
+      const vbool<N> valid = minX <= maxX;
+      const vbool<N> vmask = !((maxX < query.org.x - query.rad.x) | (minX > query.org.x + query.rad.x) |
+                               (maxY < query.org.y - query.rad.y) | (minY > query.org.y + query.rad.y) |
+                               (maxZ < query.org.z - query.rad.z) | (minZ > query.org.z + query.rad.z));
+      return movemask(vmask) & movemask(valid);
+    }
+
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    {
+      const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x));
+      const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y));
+      const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z));
+      const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x));
+      const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y));
+      const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z));
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x);
+      const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y);
+      const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z);
+      const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x);
+      const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y);
+      const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z);
+      const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0]));
+      const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0]));
+      const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0]));
+      const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0]));
+      const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0]));
+      const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0]));
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+      __forceinline size_t pointQueryNodeAABBMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+      size_t mask = pointQueryNodeAABB(node, query, time, dist);
+
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t);
+        mask &= movemask(vmask);
+      }
+
+      return mask;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      const vfloat<N> start_x(node->start.x);
+      const vfloat<N> scale_x(node->scale.x);
+      const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+      const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+      const vfloat<N> start_y(node->start.y);
+      const vfloat<N> scale_y(node->scale.y);
+      const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> start_z(node->start.z);
+      const vfloat<N> scale_z(node->scale.z);
+      const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      const vfloat<N> minX = node->dequantizeLowerX(time);
+      const vfloat<N> maxX = node->dequantizeUpperX(time);
+      const vfloat<N> minY = node->dequantizeLowerY(time);
+      const vfloat<N> maxY = node->dequantizeUpperY(time);
+      const vfloat<N> minZ = node->dequantizeLowerZ(time);
+      const vfloat<N> maxZ = node->dequantizeUpperZ(time);     
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    {
+      // TODO: point query - implement
+      const vbool<N> vmask = vbool<N>(true);
+      const size_t mask = movemask(vmask) & ((1<<N)-1);
+      dist = vfloat<N>(0.0f);
+      return mask;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      // TODO: point query - implement
+      const vbool<N> vmask = vbool<N>(true);
+      const size_t mask = movemask(vmask) & ((1<<N)-1);
+      dist = vfloat<N>(0.0f);
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, bool robust>
+      __forceinline size_t intersectNode(const typename BVHN<N>::AABBNode* node, const TravRay<N,robust>& ray, vfloat<N>& dist);
+
+    template<>
+      __forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist)
+    {
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+      const vfloat4 tFarX  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tFarY  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tFarZ  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
+      const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
+      const vfloat4 tNearZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z;
+      const vfloat4 tFarX  = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x;
+      const vfloat4 tFarY  = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y;
+      const vfloat4 tFarZ  = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
+#endif
+
+#if defined(__aarch64__)
+      const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear);
+      const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar);
+      const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) > asInt(tFar);
+      const size_t mask = movemask(vmask) ^ ((1<<4)-1);
+#elif defined(__AVX512F__) // SKX
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else
+      const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;
+      return mask;
+    }
+
+#if defined(__AVX__)
+
+    template<>
+      __forceinline size_t intersectNode<8>(const typename BVH8::AABBNode* node, const TravRay<8,false>& ray, vfloat8& dist)
+    {
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+      const vfloat8 tFarX  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tFarY  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tFarZ  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
+      const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
+      const vfloat8 tNearZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z;
+      const vfloat8 tFarX  = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x;
+      const vfloat8 tFarY  = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y;
+      const vfloat8 tFarZ  = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
+#endif
+      
+#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) > asInt(tFar);
+      const size_t mask = movemask(vmask) ^ ((1<<8)-1);
+#elif defined(__AVX512F__) // SKX
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else
+      const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;
+      return mask;
+    }
+
+#endif
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+      __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNode* node, const TravRay<N,true>& ray, vfloat<N>& dist)
+    {
+      const vfloat<N> tNearX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x;
+      const vfloat<N> tNearY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> tNearZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tFarX  = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x;
+      const vfloat<N> tFarY  = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> tFarZ  = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+      const vfloat<N> tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat<N> tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool<N> vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+      dist = tNear;
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+      __forceinline size_t intersectNode(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,false>& ray, const float time, vfloat<N>& dist)
+    {
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
+      const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z;
+      const vfloat<N> tFarX  = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
+#endif
+#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+      const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool<N> vmask = asInt(tNear) > asInt(tFar);
+      const size_t mask = movemask(vmask) ^ ((1<<N)-1);
+#elif defined(__AVX512F__) // SKX
+      const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool<N> vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N> tFar  = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+      const vbool<N> vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+      __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,true>& ray, const float time, vfloat<N>& dist)
+    {
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x;
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+      const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x;
+      const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z;
+      const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ);
+      const size_t mask = movemask(tNear <= tFar);
+      dist = tNear;
+      return mask;
+    }
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+      __forceinline size_t intersectNodeMB4D(const typename BVHN<N>::NodeRef ref, const TravRay<N,false>& ray, const float time, vfloat<N>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+        
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+#if defined (__AVX2__) || defined(__ARM_NEON)
+      const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
+      const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z;
+      const vfloat<N> tFarX  = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
+#endif
+#if defined(__AVX2__) && !defined(__AVX512F__)
+      const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear));
+      const vfloat<N> tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar ));
+#else
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N> tFar  = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+#endif
+      vbool<N> vmask = tNear <= tFar;
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        vmask &= (node1->lower_t <= time) & (time < node1->upper_t);
+      }
+      const size_t mask = movemask(vmask);
+      dist = tNear;
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+      __forceinline size_t intersectNodeMB4DRobust(const typename BVHN<N>::NodeRef ref, const TravRay<N,true>& ray, const float time, vfloat<N>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x;
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+      const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x;
+      const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z;
+      const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ);
+      vbool<N> vmask = tNear <= tFar;
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        vmask &= (node1->lower_t <= time) & (time < node1->upper_t);
+      }
+      const size_t mask = movemask(vmask);
+      dist = tNear;
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast QuantizedBaseNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, bool robust>
+      __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,robust>& ray, vfloat<N>& dist);
+
+    template<>
+      __forceinline size_t intersectNode<4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,false>& ray, vfloat4& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      const vfloat4 start_x(node->start.x);
+      const vfloat4 scale_x(node->scale.x);
+      const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x);
+      const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX  >> 2),scale_x,start_x);
+      const vfloat4 start_y(node->start.y);
+      const vfloat4 scale_y(node->scale.y);
+      const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat4 start_z(node->start.z);
+      const vfloat4 scale_z(node->scale.z);
+      const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ  >> 2),scale_z,start_z);
+
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat4 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+      const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+      const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+      const vfloat4 tFarX  = (upper_x - ray.org.x) * ray.rdir.x;
+      const vfloat4 tFarY  = (upper_y - ray.org.y) * ray.rdir.y;
+      const vfloat4 tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+      
+#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) > asInt(tFar);
+      const size_t mask = movemask(vmask) ^ ((1<<4)-1);
+#elif defined(__AVX512F__) // SKX
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else
+      const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;
+      return mask & mvalid;
+    }
+
+    template<>
+      __forceinline size_t intersectNode<4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,true>& ray, vfloat4& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      const vfloat4 start_x(node->start.x);
+      const vfloat4 scale_x(node->scale.x);
+      const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x);
+      const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX  >> 2),scale_x,start_x);
+      const vfloat4 start_y(node->start.y);
+      const vfloat4 scale_y(node->scale.y);
+      const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat4 start_z(node->start.z);
+      const vfloat4 scale_z(node->scale.z);
+      const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ  >> 2),scale_z,start_z);
+
+      const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat4 tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat4 tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat4 tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+      
+      const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+      dist = tNear;
+      return mask & mvalid;
+    }
+
+
+#if defined(__AVX__)
+
+    template<>
+      __forceinline size_t intersectNode<8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,false>& ray, vfloat8& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      const vfloat8 start_x(node->start.x);
+      const vfloat8 scale_x(node->scale.x);
+      const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x);
+      const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX  >> 2),scale_x,start_x);
+      const vfloat8 start_y(node->start.y);
+      const vfloat8 scale_y(node->scale.y);
+      const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat8 start_z(node->start.z);
+      const vfloat8 scale_z(node->scale.z);
+      const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ  >> 2),scale_z,start_z);
+
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat8 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+      const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+      const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+      const vfloat8 tFarX  = (upper_x - ray.org.x) * ray.rdir.x;
+      const vfloat8 tFarY  = (upper_y - ray.org.y) * ray.rdir.y;
+      const vfloat8 tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+      
+#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) > asInt(tFar);
+      const size_t mask = movemask(vmask) ^ ((1<<8)-1);
+#elif defined(__AVX512F__) // SKX
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else
+      const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;
+      return mask & mvalid;
+    }
+
+    template<>
+      __forceinline size_t intersectNode<8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,true>& ray, vfloat8& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      const vfloat8 start_x(node->start.x);
+      const vfloat8 scale_x(node->scale.x);
+      const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x);
+      const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX  >> 2),scale_x,start_x);
+      const vfloat8 start_y(node->start.y);
+      const vfloat8 scale_y(node->scale.y);
+      const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat8 start_z(node->start.z);
+      const vfloat8 scale_z(node->scale.z);
+      const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ  >> 2),scale_z,start_z);
+
+      const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat8 tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat8 tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat8 tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+      
+      const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+
+      dist = tNear;
+      return mask & mvalid;
+    }
+
+
+#endif
+
+    template<int N>
+      __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,false>& ray, const float time, vfloat<N>& dist)
+    {
+      const vboolf<N> mvalid    = node->validMask();
+      const vfloat<N> lower_x   = node->dequantizeLowerX(time);
+      const vfloat<N> upper_x   = node->dequantizeUpperX(time);
+      const vfloat<N> lower_y   = node->dequantizeLowerY(time);
+      const vfloat<N> upper_y   = node->dequantizeUpperY(time);
+      const vfloat<N> lower_z   = node->dequantizeLowerZ(time);
+      const vfloat<N> upper_z   = node->dequantizeUpperZ(time);     
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<N> tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<N> tFarX  = (upper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tFarY  = (upper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
+#endif      
+
+      const vfloat<N> tminX = mini(tNearX,tFarX);
+      const vfloat<N> tmaxX = maxi(tNearX,tFarX);
+      const vfloat<N> tminY = mini(tNearY,tFarY);
+      const vfloat<N> tmaxY = maxi(tNearY,tFarY);
+      const vfloat<N> tminZ = mini(tNearZ,tFarZ);
+      const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ);
+      const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tmaxX,tmaxY,tmaxZ,ray.tfar);
+#if defined(__AVX512F__) // SKX
+      const vbool<N> vmask =  le(mvalid,asInt(tNear),asInt(tFar));
+#else
+      const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid;
+#endif
+      const size_t mask = movemask(vmask);
+      dist = tNear;
+      return mask;      
+    }
+
+    template<int N>
+      __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,true>& ray, const float time, vfloat<N>& dist)
+    {
+      const vboolf<N> mvalid    = node->validMask();
+      const vfloat<N> lower_x   = node->dequantizeLowerX(time);
+      const vfloat<N> upper_x   = node->dequantizeUpperX(time);
+      const vfloat<N> lower_y   = node->dequantizeLowerY(time);
+      const vfloat<N> upper_y   = node->dequantizeUpperY(time);
+      const vfloat<N> lower_z   = node->dequantizeLowerZ(time);
+      const vfloat<N> upper_z   = node->dequantizeUpperZ(time);     
+      const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat<N> tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+
+      const vfloat<N> tminX = mini(tNearX,tFarX);
+      const vfloat<N> tmaxX = maxi(tNearX,tFarX);
+      const vfloat<N> tminY = mini(tNearY,tFarY);
+      const vfloat<N> tmaxY = maxi(tNearY,tFarY);
+      const vfloat<N> tminZ = mini(tNearZ,tFarZ);
+      const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ);
+      const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tmaxX,tmaxY,tmaxZ,ray.tfar);
+#if defined(__AVX512F__) // SKX
+      const vbool<N> vmask =  le(mvalid,asInt(tNear),asInt(tFar));
+#else
+      const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid;
+#endif
+      const size_t mask = movemask(vmask);
+      dist = tNear;
+      return mask;      
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast OBBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, bool robust>
+      __forceinline size_t intersectNode(const typename BVHN<N>::OBBNode* node, const TravRay<N,robust>& ray, vfloat<N>& dist)
+    {
+      const Vec3vf<N> dir = xfmVector(node->naabb,ray.dir);
+      //const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))/dir;
+      const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))*rcp_safe(dir);
+      const Vec3vf<N> org = xfmPoint(node->naabb,ray.org);
+      const Vec3vf<N> tLowerXYZ = org * nrdir;       // (Vec3fa(zero) - org) * rdir;
+      const Vec3vf<N> tUpperXYZ = tLowerXYZ - nrdir; // (Vec3fa(one ) - org) * rdir;
+
+      const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z);
+      const vfloat<N> tFarX  = maxi(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tFarY  = maxi(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tFarZ  = maxi(tLowerXYZ.z,tUpperXYZ.z);
+      vfloat<N> tNear  = max(ray.tnear, tNearX,tNearY,tNearZ);
+      vfloat<N> tFar   = min(ray.tfar,  tFarX ,tFarY ,tFarZ );
+      if (robust) {
+        tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp));
+        tFar  = tFar *vfloat<N>(1.0f+3.0f*float(ulp));
+      }
+      const vbool<N> vmask = tNear <= tFar;
+      dist = tNear;
+      return movemask(vmask);
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast OBBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, bool robust>
+      __forceinline size_t intersectNode(const typename BVHN<N>::OBBNodeMB* node, const TravRay<N,robust>& ray, const float time, vfloat<N>& dist)
+    {
+      const AffineSpace3vf<N> xfm = node->space0;
+      const Vec3vf<N> b0_lower = zero;
+      const Vec3vf<N> b0_upper = one;
+      const Vec3vf<N> lower = lerp(b0_lower,node->b1.lower,vfloat<N>(time));
+      const Vec3vf<N> upper = lerp(b0_upper,node->b1.upper,vfloat<N>(time));
+
+      const BBox3vf<N> bounds(lower,upper);
+      const Vec3vf<N> dir = xfmVector(xfm,ray.dir);
+      const Vec3vf<N> rdir = rcp_safe(dir);
+      const Vec3vf<N> org = xfmPoint(xfm,ray.org);
+
+      const Vec3vf<N> tLowerXYZ = (bounds.lower - org) * rdir;
+      const Vec3vf<N> tUpperXYZ = (bounds.upper - org) * rdir;
+
+      const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z);
+      const vfloat<N> tFarX  = maxi(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tFarY  = maxi(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tFarZ  = maxi(tLowerXYZ.z,tUpperXYZ.z);
+      vfloat<N> tNear  = max(ray.tnear, tNearX,tNearY,tNearZ);
+      vfloat<N> tFar   = min(ray.tfar,  tFarX ,tFarY ,tFarZ );
+      if (robust) {
+        tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp));
+        tFar  = tFar *vfloat<N>(1.0f+3.0f*float(ulp));
+      }
+      const vbool<N> vmask = tNear <= tFar;
+      dist = tNear;
+      return movemask(vmask);
+    }
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in point query raversal
+    //////////////////////////////////////////////////////////////////////////////////////
+    
+    /*! Computes traversal information for N nodes with 1 point query */
+    template<int N, int types>
+    struct BVHNNodePointQuerySphere1;
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN1>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = pointQueryNodeSphere(node.getAABBNode(), query, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = pointQueryNodeSphereMB4D<N>(node, query, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN1_UN1>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNode()))          mask = pointQueryNodeSphere(node.getAABBNode(), query, dist);
+        else if (unlikely(node.isOBBNode())) mask = pointQueryNodeSphere(node.ungetAABBNode(), query, dist);
+        else return false;
+        return true;
+      }
+    };
+    
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2_UN2>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNodeMB()))           mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist);
+        else if (unlikely(node.isOBBNodeMB()))  mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist);
+        else return false;
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D_UN2>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist);
+        else                                    mask = pointQueryNodeSphereMB4D(node, query, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_QN1>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = pointQueryNodeSphere((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist);
+        return true;
+      }
+    };
+    
+    template<int N>
+    struct BVHNQuantizedBaseNodePointQuerySphere1
+    {
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+      {
+        return pointQueryNodeSphere(node,query,dist);
+      }
+
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+      {
+        return pointQueryNodeSphere(node,query,time,dist);
+      }
+    };
+
+    /*! Computes traversal information for N nodes with 1 point query */
+    template<int N, int types>
+    struct BVHNNodePointQueryAABB1;
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN1>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = pointQueryNodeAABB(node.getAABBNode(), query, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = pointQueryNodeAABBMB4D<N>(node, query, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN1_UN1>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNode()))          mask = pointQueryNodeAABB(node.getAABBNode(), query, dist);
+        else if (unlikely(node.isOBBNode())) mask = pointQueryNodeAABB(node.ungetAABBNode(), query, dist);
+        else return false;
+        return true;
+      }
+    };
+    
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2_UN2>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNodeMB()))           mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist);
+        else if (unlikely(node.isOBBNodeMB()))  mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist);
+        else return false;
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D_UN2>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist);
+        else                                    mask = pointQueryNodeAABBMB4D(node, query, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_QN1>
+    {
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = pointQueryNodeAABB((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist);
+        return true;
+      }
+    };
+    
+    template<int N>
+    struct BVHNQuantizedBaseNodePointQueryAABB1
+    {
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+      {
+        return pointQueryNodeAABB(node,query,dist);
+      }
+
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+      {
+        return pointQueryNodeAABB(node,query,time,dist);
+      }
+    };
+
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in ray traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*! Intersects N nodes with 1 ray */
+    template<int N, int types, bool robust>
+    struct BVHNNodeIntersector1;
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN1, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = intersectNode(node.getAABBNode(), ray, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = intersectNodeRobust(node.getAABBNode(), ray, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = intersectNode(node.getAABBNodeMB(), ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN2_AN4D, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = intersectNodeMB4D<N>(node, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN2_AN4D, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = intersectNodeMB4DRobust<N>(node, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN1_UN1, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNode()))          mask = intersectNode(node.getAABBNode(), ray, dist);
+        else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist);
+        else return false;
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN1_UN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNode()))          mask = intersectNodeRobust(node.getAABBNode(), ray, dist);
+        else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist);
+        else return false;
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN2_UN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNodeMB()))           mask = intersectNode(node.getAABBNodeMB(), ray, time, dist);
+        else if (unlikely(node.isOBBNodeMB()))  mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist);
+        else return false;
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN2_UN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (likely(node.isAABBNodeMB()))           mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist);
+        else if (unlikely(node.isOBBNodeMB()))  mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist);
+        else return false;
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN2_AN4D_UN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist);
+        else                                    mask = intersectNodeMB4D(node, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_AN2_AN4D_UN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist);
+        else                                    mask = intersectNodeMB4DRobust(node, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_QN1, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = intersectNode((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist);
+        return true;
+      }
+    };
+
+    template<int N>
+    struct BVHNNodeIntersector1<N, BVH_QN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) return false;
+        mask = intersectNodeRobust((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist);
+        return true;
+      }
+    };
+
+    /*! Intersects N nodes with K rays */
+    template<int N, bool robust>
+      struct BVHNQuantizedBaseNodeIntersector1;
+
+    template<int N>
+      struct BVHNQuantizedBaseNodeIntersector1<N, false>
+    {
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,false>& ray, vfloat<N>& dist)
+      {
+        return intersectNode(node,ray,dist);
+      }
+
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,false>& ray, const float time, vfloat<N>& dist)
+      {
+        return intersectNode(node,ray,time,dist);
+      }
+
+    };
+
+    template<int N>
+      struct BVHNQuantizedBaseNodeIntersector1<N, true>
+    {
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,true>& ray, vfloat<N>& dist)
+      {
+        return intersectNode(node,ray,dist); 
+      }
+
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,true>& ray, const float time, vfloat<N>& dist)
+      {
+        return intersectNode(node,ray,time,dist);
+      }
+
+    };
+
+
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h
new file mode 100644
index 0000000000..1f7215e5df
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h
@@ -0,0 +1,241 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Frustum structure used in hybrid and stream traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*
+       Optimized frustum test. We calculate t=(p-org)/dir in ray/box
+       intersection. We assume the rays are split by octant, thus
+       dir intervals are either positive or negative in each
+       dimension.
+
+       Case 1: dir.min >= 0 && dir.max >= 0:
+         t_min = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min
+         t_max = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max
+
+       Case 2: dir.min < 0 && dir.max < 0:
+         t_min = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max
+         t_max = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min
+    */
+
+    template<bool robust>
+    struct Frustum;
+    
+    /* Fast variant */
+    template<>
+    struct Frustum<false>
+    {
+      __forceinline Frustum() {}
+
+      template<int K>
+      __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      {
+        const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)),
+                                     reduce_min(select(valid, org.y, pos_inf)),
+                                     reduce_min(select(valid, org.z, pos_inf)));
+
+        const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)),
+                                     reduce_max(select(valid, org.y, neg_inf)),
+                                     reduce_max(select(valid, org.z, neg_inf)));
+
+        const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)),
+                                      reduce_min(select(valid, rdir.y, pos_inf)),
+                                      reduce_min(select(valid, rdir.z, pos_inf)));
+
+        const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)),
+                                      reduce_max(select(valid, rdir.y, neg_inf)),
+                                      reduce_max(select(valid, rdir.z, neg_inf)));
+
+        const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf)));
+        const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf)));
+
+        init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N);
+      }
+
+      __forceinline void init(const Vec3fa& reduced_min_org,
+                              const Vec3fa& reduced_max_org,
+                              const Vec3fa& reduced_min_rdir,
+                              const Vec3fa& reduced_max_rdir,
+                              float reduced_min_dist,
+                              float reduced_max_dist,
+                              int N)
+      {
+        const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero));
+
+        min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir);
+        max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
+
+        min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org);
+        max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org);
+
+        min_dist = reduced_min_dist;
+        max_dist = reduced_max_dist;
+
+        nf = NearFarPrecalculations(min_rdir, N);
+      }
+
+      template<int K>
+      __forceinline void updateMaxDist(const vfloat<K>& ray_tfar)
+      {
+        max_dist = reduce_max(ray_tfar);
+      }
+
+      NearFarPrecalculations nf;
+
+      Vec3fa min_rdir;
+      Vec3fa max_rdir;
+
+      Vec3fa min_org_rdir;
+      Vec3fa max_org_rdir;
+
+      float min_dist;
+      float max_dist;
+    };
+
+    typedef Frustum<false> FrustumFast;
+
+    /* Robust variant */
+    template<>
+    struct Frustum<true>
+    {
+      __forceinline Frustum() {}
+
+      template<int K>
+      __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      {
+        const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)),
+                                     reduce_min(select(valid, org.y, pos_inf)),
+                                     reduce_min(select(valid, org.z, pos_inf)));
+
+        const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)),
+                                     reduce_max(select(valid, org.y, neg_inf)),
+                                     reduce_max(select(valid, org.z, neg_inf)));
+
+        const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)),
+                                      reduce_min(select(valid, rdir.y, pos_inf)),
+                                      reduce_min(select(valid, rdir.z, pos_inf)));
+
+        const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)),
+                                      reduce_max(select(valid, rdir.y, neg_inf)),
+                                      reduce_max(select(valid, rdir.z, neg_inf)));
+
+        const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf)));
+        const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf)));
+
+        init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N);
+      }
+
+      __forceinline void init(const Vec3fa& reduced_min_org,
+                              const Vec3fa& reduced_max_org,
+                              const Vec3fa& reduced_min_rdir,
+                              const Vec3fa& reduced_max_rdir,
+                              float reduced_min_dist,
+                              float reduced_max_dist,
+                              int N)
+      {
+        const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero));
+        min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir);
+        max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
+
+        min_org = select(pos_rdir, reduced_max_org, reduced_min_org);
+        max_org = select(pos_rdir, reduced_min_org, reduced_max_org);
+
+        min_dist = reduced_min_dist;
+        max_dist = reduced_max_dist;
+
+        nf = NearFarPrecalculations(min_rdir, N);
+      }
+
+      template<int K>
+      __forceinline void updateMaxDist(const vfloat<K>& ray_tfar)
+      {
+        max_dist = reduce_max(ray_tfar);
+      }
+
+      NearFarPrecalculations nf;
+
+      Vec3fa min_rdir;
+      Vec3fa max_rdir;
+
+      Vec3fa min_org;
+      Vec3fa max_org;
+
+      float min_dist;
+      float max_dist;
+    };
+
+    typedef Frustum<true> FrustumRobust;
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+    __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                       const FrustumFast& frustum, vfloat<N>& dist)
+    {
+      const vfloat<N> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX);
+      const vfloat<N> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY);
+      const vfloat<N> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ);
+      const vfloat<N> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX);
+      const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
+      const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
+
+      const vfloat<N> fminX = msub(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.min_org_rdir.x));
+      const vfloat<N> fminY = msub(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.min_org_rdir.y));
+      const vfloat<N> fminZ = msub(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.min_org_rdir.z));
+      const vfloat<N> fmaxX = msub(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.max_org_rdir.x));
+      const vfloat<N> fmaxY = msub(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.max_org_rdir.y));
+      const vfloat<N> fmaxZ = msub(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.max_org_rdir.z));
+
+      const vfloat<N> fmin  = maxi(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist));
+      dist = fmin;
+      const vfloat<N> fmax  = mini(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist));
+      const vbool<N> vmask_node_hit = fmin <= fmax;
+      size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1);
+      return m_node;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+    __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                       const FrustumRobust& frustum, vfloat<N>& dist)
+    {
+      const vfloat<N> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX);
+      const vfloat<N> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY);
+      const vfloat<N> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ);
+      const vfloat<N> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX);
+      const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
+      const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
+
+      const vfloat<N> fminX = (bminX - vfloat<N>(frustum.min_org.x)) * vfloat<N>(frustum.min_rdir.x);
+      const vfloat<N> fminY = (bminY - vfloat<N>(frustum.min_org.y)) * vfloat<N>(frustum.min_rdir.y);
+      const vfloat<N> fminZ = (bminZ - vfloat<N>(frustum.min_org.z)) * vfloat<N>(frustum.min_rdir.z);
+      const vfloat<N> fmaxX = (bmaxX - vfloat<N>(frustum.max_org.x)) * vfloat<N>(frustum.max_rdir.x);
+      const vfloat<N> fmaxY = (bmaxY - vfloat<N>(frustum.max_org.y)) * vfloat<N>(frustum.max_rdir.y);
+      const vfloat<N> fmaxZ = (bmaxZ - vfloat<N>(frustum.max_org.z)) * vfloat<N>(frustum.max_rdir.z);
+
+      const float round_down = 1.0f-2.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
+      const float round_up   = 1.0f+2.0f*float(ulp);
+      const vfloat<N> fmin  = max(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist));
+      dist = fmin;
+      const vfloat<N> fmax  = min(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist));
+      const vbool<N> vmask_node_hit = (round_down*fmin <= round_up*fmax);
+      size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1);
+      return m_node;
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet.h b/thirdparty/embree/kernels/bvh/node_intersector_packet.h
new file mode 100644
index 0000000000..d5498fc5db
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/node_intersector_packet.h
@@ -0,0 +1,805 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Ray packet structure used in hybrid traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int K, bool robust>
+    struct TravRayK;
+
+    /* Fast variant */
+    template<int K>
+    struct TravRayK<K, false>
+    {
+      __forceinline TravRayK() {}
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+      {
+        init(ray_org, ray_dir, N);
+      }
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      {
+        init(ray_org, ray_dir, N);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+      {
+        org = ray_org;
+        dir = ray_dir;
+        rdir = rcp_safe(ray_dir);
+#if defined(__AVX2__) || defined(__ARM_NEON)
+        org_rdir = org * rdir;
+#endif
+
+        if (N)
+        {
+          const int size = sizeof(float)*N;
+          nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size));
+          nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size));
+          nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size));
+        }
+      }
+
+      Vec3vf<K> org;
+      Vec3vf<K> dir;
+      Vec3vf<K> rdir;
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      Vec3vf<K> org_rdir;
+#endif
+      Vec3vi<K> nearXYZ;
+      vfloat<K> tnear;
+      vfloat<K> tfar;
+    };
+
+    template<int K>
+    using TravRayKFast = TravRayK<K, false>;
+
+    /* Robust variant */
+    template<int K>
+    struct TravRayK<K, true>
+    {
+      __forceinline TravRayK() {}
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+      {
+        init(ray_org, ray_dir, N);
+      }
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+      {
+        init(ray_org, ray_dir, N);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+      {
+        org = ray_org;
+        dir = ray_dir;
+        rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir));
+
+        if (N)
+        {
+          const int size = sizeof(float)*N;
+          nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size));
+          nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size));
+          nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size));
+        }
+      }
+
+      Vec3vf<K> org;
+      Vec3vf<K> dir;
+      Vec3vf<K> rdir;
+      Vec3vi<K> nearXYZ;
+      vfloat<K> tnear;
+      vfloat<K> tfar;
+    };
+
+    template<int K>
+    using TravRayKRobust = TravRayK<K, true>;
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNode* node, size_t i,
+                                         const TravRayKFast<K>& ray, vfloat<K>& dist)
+
+    {
+  #if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(node->upper_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(node->upper_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(node->upper_z[i], ray.rdir.z, ray.org_rdir.z);
+  #else
+      const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z;
+  #endif
+
+  #if defined(__AVX512F__) // SKX
+      if (K == 16)
+      {
+        /* use mixed float/int min/max */
+        const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+        dist = lnearP;
+        return lhit;
+      }
+      else
+  #endif
+      {
+        const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+  #if defined(__AVX512F__) // SKX
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+  #else
+        const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+  #endif
+        dist = lnearP;
+        return lhit;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNode* node, size_t i,
+                                               const TravRayKRobust<K>& ray, vfloat<K>& dist)
+    {
+      // FIXME: use per instruction rounding for AVX512
+      const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z;
+      const float round_up   = 1.0f+3.0f*float(ulp);
+      const float round_down = 1.0f-3.0f*float(ulp);
+      const vfloat<K> lnearP = round_down*max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
+      const vfloat<K> lfarP  = round_up  *min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
+      const vbool<K> lhit   = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+      dist = lnearP;
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNodeMB* node, const size_t i,
+                                         const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+#endif
+
+#if defined(__AVX512F__) // SKX
+      if (K == 16)
+      {
+        /* use mixed float/int min/max */
+        const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+        dist = lnearP;
+        return lhit;
+      }
+      else
+#endif
+      {
+        const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+#if defined(__AVX512F__) // SKX
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+#else
+        const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+#endif
+        dist = lnearP;
+        return lhit;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNodeMB* node, const size_t i,
+                                               const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+
+      const float round_up   = 1.0f+3.0f*float(ulp);
+      const float round_down = 1.0f-3.0f*float(ulp);
+
+#if defined(__AVX512F__) // SKX
+      if (K == 16)
+      {
+        const vfloat<K> lnearP = round_down*maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = round_up  *mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K>  lhit   = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+        dist = lnearP;
+        return lhit;
+      }
+      else
+#endif
+      {
+        const vfloat<K> lnearP = round_down*maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = round_up  *mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+        const vbool<K>  lhit   = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+        dist = lnearP;
+        return lhit;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKMB4D(const typename BVHN<N>::NodeRef ref, const size_t i,
+                                             const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+#if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+#endif
+
+      const vfloat<K> lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
+      const vfloat<K> lfarP  = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
+      vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i]));
+      }
+      dist = lnearP;
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKMB4DRobust(const typename BVHN<N>::NodeRef ref, const size_t i,
+                                                    const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+
+      const float round_up   = 1.0f+3.0f*float(ulp);
+      const float round_down = 1.0f-3.0f*float(ulp);
+      const vfloat<K> lnearP = round_down*maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
+      const vfloat<K> lfarP  = round_up  *mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
+      vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i]));
+      }
+      dist = lnearP;
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast OBBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K, bool robust>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNode* node, const size_t i,
+                                          const TravRayK<K,robust>& ray, vfloat<K>& dist)
+    {
+      const AffineSpace3vf<K> naabb(Vec3f(node->naabb.l.vx.x[i], node->naabb.l.vx.y[i], node->naabb.l.vx.z[i]),
+                                    Vec3f(node->naabb.l.vy.x[i], node->naabb.l.vy.y[i], node->naabb.l.vy.z[i]),
+                                    Vec3f(node->naabb.l.vz.x[i], node->naabb.l.vz.y[i], node->naabb.l.vz.z[i]),
+                                    Vec3f(node->naabb.p   .x[i], node->naabb.p   .y[i], node->naabb.p   .z[i]));
+
+      const Vec3vf<K> dir = xfmVector(naabb, ray.dir);
+      const Vec3vf<K> nrdir = Vec3vf<K>(vfloat<K>(-1.0f)) * rcp_safe(dir); // FIXME: negate instead of mul with -1?
+      const Vec3vf<K> org = xfmPoint(naabb, ray.org);
+
+      const vfloat<K> lclipMinX = org.x * nrdir.x; // (Vec3fa(zero) - org) * rdir;
+      const vfloat<K> lclipMinY = org.y * nrdir.y;
+      const vfloat<K> lclipMinZ = org.z * nrdir.z;
+      const vfloat<K> lclipMaxX  = lclipMinX - nrdir.x; // (Vec3fa(one) - org) * rdir;
+      const vfloat<K> lclipMaxY  = lclipMinY - nrdir.y;
+      const vfloat<K> lclipMaxZ  = lclipMinZ - nrdir.z;
+
+      vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+      vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+      if (robust) {
+        lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp));
+        lfarP  = lfarP *vfloat<K>(1.0f+3.0f*float(ulp));
+      }
+      const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+      dist = lnearP;
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast OBBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K, bool robust>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNodeMB* node, const size_t i,
+                                          const TravRayK<K,robust>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const AffineSpace3vf<K> xfm(Vec3f(node->space0.l.vx.x[i], node->space0.l.vx.y[i], node->space0.l.vx.z[i]),
+                                  Vec3f(node->space0.l.vy.x[i], node->space0.l.vy.y[i], node->space0.l.vy.z[i]),
+                                  Vec3f(node->space0.l.vz.x[i], node->space0.l.vz.y[i], node->space0.l.vz.z[i]),
+                                  Vec3f(node->space0.p   .x[i], node->space0.p   .y[i], node->space0.p   .z[i]));
+
+      const Vec3vf<K> b0_lower = zero;
+      const Vec3vf<K> b0_upper = one;
+      const Vec3vf<K> b1_lower(node->b1.lower.x[i], node->b1.lower.y[i], node->b1.lower.z[i]);
+      const Vec3vf<K> b1_upper(node->b1.upper.x[i], node->b1.upper.y[i], node->b1.upper.z[i]);
+      const Vec3vf<K> lower = lerp(b0_lower, b1_lower, time);
+      const Vec3vf<K> upper = lerp(b0_upper, b1_upper, time);
+
+      const Vec3vf<K> dir = xfmVector(xfm, ray.dir);
+      const Vec3vf<K> rdir = rcp_safe(dir);
+      const Vec3vf<K> org = xfmPoint(xfm, ray.org);
+
+      const vfloat<K> lclipMinX = (lower.x - org.x) * rdir.x;
+      const vfloat<K> lclipMinY = (lower.y - org.y) * rdir.y;
+      const vfloat<K> lclipMinZ = (lower.z - org.z) * rdir.z;
+      const vfloat<K> lclipMaxX  = (upper.x - org.x) * rdir.x;
+      const vfloat<K> lclipMaxY  = (upper.y - org.y) * rdir.y;
+      const vfloat<K> lclipMaxZ  = (upper.z - org.z) * rdir.z;
+
+      vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+      vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+      if (robust) {
+        lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp));
+        lfarP  = lfarP *vfloat<K>(1.0f+3.0f*float(ulp));
+      }
+        
+      const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+      dist = lnearP;
+      return lhit;
+    }
+
+
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // QuantizedBaseNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i,
+                                                   const TravRayK<K,false>& ray, vfloat<K>& dist)
+
+    {
+      assert(movemask(node->validMask()) & ((size_t)1 << i));
+      const vfloat<N> lower_x = node->dequantizeLowerX();
+      const vfloat<N> upper_x = node->dequantizeUpperX();
+      const vfloat<N> lower_y = node->dequantizeLowerY();
+      const vfloat<N> upper_y = node->dequantizeUpperY();
+      const vfloat<N> lower_z = node->dequantizeLowerZ();
+      const vfloat<N> upper_z = node->dequantizeUpperZ();
+
+  #if defined(__AVX2__) || defined(__ARM_NEON)
+      const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(upper_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(upper_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(upper_z[i], ray.rdir.z, ray.org_rdir.z);
+  #else
+      const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z;
+  #endif
+
+  #if defined(__AVX512F__) // SKX
+      if (K == 16)
+      {
+        /* use mixed float/int min/max */
+        const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+        dist = lnearP;
+        return lhit;
+      }
+      else
+  #endif
+      {
+        const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+  #if defined(__AVX512F__) // SKX
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+  #else
+        const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+  #endif
+        dist = lnearP;
+        return lhit;
+      }
+    }
+
+    template<int N, int K>
+    __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i,
+          const TravRayK<K,true>& ray, vfloat<K>& dist)
+
+    {
+      assert(movemask(node->validMask()) & ((size_t)1 << i));
+      const vfloat<N> lower_x = node->dequantizeLowerX();
+      const vfloat<N> upper_x = node->dequantizeUpperX();
+      const vfloat<N> lower_y = node->dequantizeLowerY();
+      const vfloat<N> upper_y = node->dequantizeUpperY();
+      const vfloat<N> lower_z = node->dequantizeLowerZ();
+      const vfloat<N> upper_z = node->dequantizeUpperZ();
+
+      const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z;
+
+      const float round_up   = 1.0f+3.0f*float(ulp);
+      const float round_down = 1.0f-3.0f*float(ulp);
+
+      const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+      const vfloat<K> lfarP  = round_up  *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+      const vbool<K> lhit    = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+      dist = lnearP;
+      return lhit;
+      }
+
+    template<int N, int K>
+      __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+          const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist)
+
+    {
+        assert(movemask(node->validMask()) & ((size_t)1 << i));
+
+        const vfloat<K> lower_x = node->template dequantizeLowerX<K>(i,time);
+        const vfloat<K> upper_x = node->template dequantizeUpperX<K>(i,time);
+        const vfloat<K> lower_y = node->template dequantizeLowerY<K>(i,time);
+        const vfloat<K> upper_y = node->template dequantizeUpperY<K>(i,time);
+        const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time);
+        const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time);
+        
+#if defined(__AVX2__) || defined(__ARM_NEON)
+        const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+        const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+        const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+        const vfloat<K> lclipMaxX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+        const vfloat<K> lclipMaxY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+        const vfloat<K> lclipMaxZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+        const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z;
+        const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z;
+  #endif
+        const vfloat<K> lnearP = max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+        dist = lnearP;
+        return lhit;
+      }
+
+
+    template<int N, int K>
+      __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+          const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist)
+
+    {
+        assert(movemask(node->validMask()) & ((size_t)1 << i));
+
+        const vfloat<K> lower_x = node->template dequantizeLowerX<K>(i,time);
+        const vfloat<K> upper_x = node->template dequantizeUpperX<K>(i,time);
+        const vfloat<K> lower_y = node->template dequantizeLowerY<K>(i,time);
+        const vfloat<K> upper_y = node->template dequantizeUpperY<K>(i,time);
+        const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time);
+        const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time);
+
+        const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z;
+        const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z;
+
+        const float round_up   = 1.0f+3.0f*float(ulp);
+        const float round_down = 1.0f-3.0f*float(ulp);
+
+        const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = round_up  *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+        dist = lnearP;
+        return lhit;
+      }
+
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in hybrid traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*! Intersects N nodes with K rays */
+    template<int N, int K, int types, bool robust>
+    struct BVHNNodeIntersectorK;
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1, false>
+    {
+      /* vmask is both an input and an output parameter! Its initial value should be the parent node
+         hit mask, which is used for correctly computing the current hit mask. The parent hit mask
+         is actually required only for motion blur node intersections (because different rays may
+         have different times), so for regular nodes vmask is simply overwritten. */
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (likely(node.isAABBNode()))              vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist);
+        else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (likely(node.isAABBNode()))              vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist);
+        else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (likely(node.isAABBNodeMB()))              vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist);
+        else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (likely(node.isAABBNodeMB()))              vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist);
+        else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist);
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) {
+          vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist);
+        } else /*if (unlikely(node.isOBBNodeMB()))*/ {
+          assert(node.isOBBNodeMB());
+          vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+        }
+        return true;
+      }
+    };
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) {
+          vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist);
+        } else /*if (unlikely(node.isOBBNodeMB()))*/ {
+          assert(node.isOBBNodeMB());
+          vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+        }
+        return true;
+      }
+    };
+
+
+    /*! Intersects N nodes with K rays */
+    template<int N, int K, bool robust>
+    struct BVHNQuantizedBaseNodeIntersectorK;
+
+    template<int N, int K>
+    struct BVHNQuantizedBaseNodeIntersectorK<N, K, false>
+    {
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i,
+                                              const TravRayK<K,false>& ray, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeK<N,K>(node,i,ray,dist);
+      }
+
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+                                               const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist);
+      }
+
+    };
+
+    template<int N, int K>
+    struct BVHNQuantizedBaseNodeIntersectorK<N, K, true>
+    {
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i,
+                                               const TravRayK<K,true>& ray, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeK<N,K>(node,i,ray,dist);
+      }
+
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+          const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist);
+      }
+    };
+
+
+  }
+}
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
new file mode 100644
index 0000000000..55b2c27231
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
@@ -0,0 +1,189 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Ray packet structure used in stream traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int K, bool robust>
+    struct TravRayKStream;
+
+    /* Fast variant */
+    template<int K>
+    struct TravRayKStream<K, false>
+    {
+      __forceinline TravRayKStream() {}
+
+      __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
+      {
+        init(ray_org, ray_dir);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
+      {
+        rdir = rcp_safe(ray_dir);
+        org_rdir = ray_org * rdir;
+      }
+
+      Vec3vf<K> rdir;
+      Vec3vf<K> org_rdir;
+      vfloat<K> tnear;
+      vfloat<K> tfar;
+    };
+
+    template<int K>
+    using TravRayKStreamFast = TravRayKStream<K, false>;
+
+    /* Robust variant */
+    template<int K>
+    struct TravRayKStream<K, true>
+    {
+      __forceinline TravRayKStream() {}
+
+      __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
+      {
+        init(ray_org, ray_dir);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
+      {
+        rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir));
+        org = ray_org;
+      }
+
+      Vec3vf<K> rdir;
+      Vec3vf<K> org;
+      vfloat<K> tnear;
+      vfloat<K> tfar;
+    };
+
+    template<int K>
+    using TravRayKStreamRobust = TravRayKStream<K, true>;
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                        const TravRayKStreamFast<K>& ray, size_t k, const NearFarPrecalculations& nf)
+    {
+      const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+      const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+      const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+      const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+      const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+      const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+
+      const vfloat<N> rminX = msub(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
+      const vfloat<N> rminY = msub(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
+      const vfloat<N> rminZ = msub(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
+      const vfloat<N> rmaxX = msub(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
+      const vfloat<N> rmaxY = msub(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
+      const vfloat<N> rmaxZ = msub(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
+      const vfloat<N> rmin  = maxi(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k]));
+      const vfloat<N> rmax  = mini(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k]));
+
+      const vbool<N> vmask_first_hit = rmin <= rmax;
+
+      return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
+    }
+
+    template<int N, int K>
+    __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
+                                        const TravRayKStreamFast<K>& ray, const NearFarPrecalculations& nf)
+    {
+      char* ptr = (char*)&node->lower_x + i*sizeof(float);
+      const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
+      const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
+      const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
+      const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
+      const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
+      const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
+
+      const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
+
+      const vfloat<K> rmin  = maxi(rminX, rminY, rminZ, ray.tnear);
+      const vfloat<K> rmax  = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);
+
+      const vbool<K> vmask_first_hit = rmin <= rmax;
+
+      return movemask(vmask_first_hit);
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K>
+    __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                        const TravRayKStreamRobust<K>& ray, size_t k, const NearFarPrecalculations& nf)
+    {
+      const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+      const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+      const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+      const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+      const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+      const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+
+      const vfloat<N> rminX = (bminX - vfloat<N>(ray.org.x[k])) * vfloat<N>(ray.rdir.x[k]);
+      const vfloat<N> rminY = (bminY - vfloat<N>(ray.org.y[k])) * vfloat<N>(ray.rdir.y[k]);
+      const vfloat<N> rminZ = (bminZ - vfloat<N>(ray.org.z[k])) * vfloat<N>(ray.rdir.z[k]);
+      const vfloat<N> rmaxX = (bmaxX - vfloat<N>(ray.org.x[k])) * vfloat<N>(ray.rdir.x[k]);
+      const vfloat<N> rmaxY = (bmaxY - vfloat<N>(ray.org.y[k])) * vfloat<N>(ray.rdir.y[k]);
+      const vfloat<N> rmaxZ = (bmaxZ - vfloat<N>(ray.org.z[k])) * vfloat<N>(ray.rdir.z[k]);
+      const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
+      const vfloat<N> rmin  =            max(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k]));
+      const vfloat<N> rmax  = round_up  *min(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k]));
+
+      const vbool<N> vmask_first_hit = rmin <= rmax;
+
+      return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
+    }
+
+    template<int N, int K>
+    __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
+                                        const TravRayKStreamRobust<K>& ray, const NearFarPrecalculations& nf)
+    {
+      char *ptr = (char*)&node->lower_x + i*sizeof(float);
+      const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
+      const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
+      const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
+      const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
+      const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
+      const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
+
+      const vfloat<K> rminX = (bminX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> rminY = (bminY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> rminZ = (bminZ - ray.org.z) * ray.rdir.z;
+      const vfloat<K> rmaxX = (bmaxX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> rmaxY = (bmaxY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z;
+
+      const float round_up  = 1.0f+3.0f*float(ulp);
+      const vfloat<K> rmin  =            max(rminX, rminY, rminZ, vfloat<K>(ray.tnear));
+      const vfloat<K> rmax  = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat<K>(ray.tfar));
+
+      const vbool<K> vmask_first_hit = rmin <= rmax;
+
+      return movemask(vmask_first_hit);
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/common/accel.h b/thirdparty/embree/kernels/common/accel.h
new file mode 100644
index 0000000000..cc4ea1805b
--- /dev/null
+++ b/thirdparty/embree/kernels/common/accel.h
@@ -0,0 +1,556 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "point_query.h"
+#include "context.h"
+
+namespace embree
+{
+  class Scene;
+
+  /*! Base class for the acceleration structure data. */
+  class AccelData : public RefCount 
+  {
+    ALIGNED_CLASS_(16);
+  public:
+    enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 };
+
+  public:
+    AccelData (const Type type) 
+      : bounds(empty), type(type) {}
+
+    /*! notifies the acceleration structure about the deletion of some geometry */
+    virtual void deleteGeometry(size_t geomID) {};
+   
+    /*! clears the acceleration structure data */
+    virtual void clear() = 0;
+
+    /*! returns normal bounds */
+    __forceinline BBox3fa getBounds() const {
+      return bounds.bounds();
+    }
+
+    /*! returns bounds for some time */
+    __forceinline BBox3fa getBounds(float t) const {
+      return bounds.interpolate(t);
+    }
+
+    /*! returns linear bounds */
+    __forceinline LBBox3fa getLinearBounds() const {
+      return bounds;
+    }
+
+    /*! checks if acceleration structure is empty */
+    __forceinline bool isEmpty() const {
+      return bounds.bounds0.lower.x == float(pos_inf);
+    }
+
+  public:
+    LBBox3fa bounds; // linear bounds
+    Type type;
+  };
+
+  /*! Base class for all intersectable and buildable acceleration structures. */
+  class Accel : public AccelData
+  {
+     ALIGNED_CLASS_(16);
+  public:
+
+    struct Intersectors;
+
+    /*! Type of collide function */
+    typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr);
+
+    /*! Type of point query function */
+    typedef bool(*PointQueryFunc)(Intersectors* This,          /*!< this pointer to accel */
+                                  PointQuery* query,        /*!< point query for lookup */
+                                  PointQueryContext* context); /*!< point query context */
+
+    /*! Type of intersect function pointer for single rays. */
+    typedef void (*IntersectFunc)(Intersectors* This,  /*!< this pointer to accel */
+                                  RTCRayHit& ray,      /*!< ray to intersect */
+                                  IntersectContext* context);
+    
+    /*! Type of intersect function pointer for ray packets of size 4. */
+    typedef void (*IntersectFunc4)(const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHit4& ray,    /*!< ray packet to intersect */
+                                   IntersectContext* context);
+    
+    /*! Type of intersect function pointer for ray packets of size 8. */
+    typedef void (*IntersectFunc8)(const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHit8& ray,    /*!< ray packet to intersect */
+                                   IntersectContext* context);
+    
+    /*! Type of intersect function pointer for ray packets of size 16. */
+    typedef void (*IntersectFunc16)(const void* valid,  /*!< pointer to valid mask */
+                                    Intersectors* This, /*!< this pointer to accel */
+                                    RTCRayHit16& ray,   /*!< ray packet to intersect */
+                                    IntersectContext* context);
+
+    /*! Type of intersect function pointer for ray packets of size N. */
+    typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHitN** ray,   /*!< ray stream to intersect */
+                                   const size_t N,     /*!< number of rays in stream */
+                                   IntersectContext* context /*!< layout flags */);
+    
+    
+    /*! Type of occlusion function pointer for single rays. */
+    typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */
+                                  RTCRay& ray,        /*!< ray to test occlusion */
+                                  IntersectContext* context);
+    
+    /*! Type of occlusion function pointer for ray packets of size 4. */
+    typedef void (*OccludedFunc4) (const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRay4& ray,       /*!< ray packet to test occlusion. */
+                                   IntersectContext* context);
+    
+    /*! Type of occlusion function pointer for ray packets of size 8. */
+    typedef void (*OccludedFunc8) (const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRay8& ray,       /*!< ray packet to test occlusion. */
+                                   IntersectContext* context);
+    
+    /*! Type of occlusion function pointer for ray packets of size 16. */
+    typedef void (*OccludedFunc16) (const void* valid,  /*!< pointer to valid mask */
+                                    Intersectors* This, /*!< this pointer to accel */
+                                    RTCRay16& ray,      /*!< ray packet to test occlusion. */
+                                    IntersectContext* context);
+
+    /*! Type of intersect function pointer for ray packets of size N. */
+    typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */
+                                  RTCRayN** ray,      /*!< ray stream to test occlusion */
+                                  const size_t N,     /*!< number of rays in stream */
+                                  IntersectContext* context /*!< layout flags */);
+    typedef void (*ErrorFunc) ();
+
+    struct Collider
+    {
+      Collider (ErrorFunc error = nullptr) 
+      : collide((CollideFunc)error), name(nullptr) {}
+
+      Collider (CollideFunc collide, const char* name)
+      : collide(collide), name(name) {}
+
+      operator bool() const { return name; }
+
+    public:
+      CollideFunc collide;  
+      const char* name;
+    };
+    
+    struct Intersector1
+    {
+      Intersector1 (ErrorFunc error = nullptr)
+      : intersect((IntersectFunc)error), occluded((OccludedFunc)error), name(nullptr) {}
+      
+      Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name)
+      : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {}
+      
+      Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name)
+      : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {}
+
+      operator bool() const { return name; }
+
+    public:
+      static const char* type;
+      IntersectFunc intersect;
+      OccludedFunc occluded;
+      PointQueryFunc pointQuery;
+      const char* name;
+    };
+    
+    struct Intersector4 
+    {
+      Intersector4 (ErrorFunc error = nullptr)
+      : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {}
+
+      Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name)
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; }
+      
+    public:
+      static const char* type;
+      IntersectFunc4 intersect;
+      OccludedFunc4 occluded;
+      const char* name;
+    };
+    
+    struct Intersector8 
+    {
+      Intersector8 (ErrorFunc error = nullptr)
+      : intersect((IntersectFunc8)error), occluded((OccludedFunc8)error), name(nullptr) {}
+
+      Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name)
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; }
+      
+    public:
+      static const char* type;
+      IntersectFunc8 intersect;
+      OccludedFunc8 occluded;
+      const char* name;
+    };
+    
+    struct Intersector16 
+    {
+      Intersector16 (ErrorFunc error = nullptr)
+      : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {}
+
+      Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name)
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; }
+      
+    public:
+      static const char* type;
+      IntersectFunc16 intersect;
+      OccludedFunc16 occluded;
+      const char* name;
+    };
+
+    struct IntersectorN 
+    {
+      IntersectorN (ErrorFunc error = nullptr)
+      : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
+
+      IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name)
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; }
+      
+    public:
+      static const char* type;
+      IntersectFuncN intersect;
+      OccludedFuncN occluded;
+      const char* name;
+    };
+   
+    struct Intersectors 
+    {
+      Intersectors() 
+      : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {}
+
+      Intersectors (ErrorFunc error) 
+      : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {}
+
+      void print(size_t ident) 
+      {
+        if (collider.name) {
+          for (size_t i=0; i<ident; i++) std::cout << " ";
+          std::cout << "collider  = " << collider.name << std::endl;
+        }
+        if (intersector1.name) {
+          for (size_t i=0; i<ident; i++) std::cout << " ";
+          std::cout << "intersector1  = " << intersector1.name << std::endl;
+        }
+        if (intersector4.name) {
+          for (size_t i=0; i<ident; i++) std::cout << " ";
+          std::cout << "intersector4  = " << intersector4.name << std::endl;
+        }
+        if (intersector8.name) {
+          for (size_t i=0; i<ident; i++) std::cout << " ";
+          std::cout << "intersector8  = " << intersector8.name << std::endl;
+        }
+        if (intersector16.name) {
+          for (size_t i=0; i<ident; i++) std::cout << " ";
+          std::cout << "intersector16 = " << intersector16.name << std::endl;
+        }
+        if (intersectorN.name) {
+          for (size_t i=0; i<ident; i++) std::cout << " ";
+          std::cout << "intersectorN = " << intersectorN.name << std::endl;
+        }        
+      }
+
+      void select(bool filter)
+      {
+        if (intersector4_filter) {
+          if (filter) intersector4 = intersector4_filter;
+          else        intersector4 = intersector4_nofilter;
+        }
+        if (intersector8_filter) {
+          if (filter) intersector8 = intersector8_filter;
+          else        intersector8 = intersector8_nofilter;
+        }
+        if (intersector16_filter) {
+          if (filter) intersector16 = intersector16_filter;
+          else         intersector16 = intersector16_nofilter;
+        }
+        if (intersectorN_filter) {
+          if (filter) intersectorN = intersectorN_filter;
+          else        intersectorN = intersectorN_nofilter;
+        }        
+      }
+
+      __forceinline bool pointQuery (PointQuery* query, PointQueryContext* context) {
+        assert(intersector1.pointQuery);
+        return intersector1.pointQuery(this,query,context);
+      }
+
+      /*! collides two scenes */
+      __forceinline void collide (Accel* scene0, Accel* scene1, RTCCollideFunc callback, void* userPtr) {
+        assert(collider.collide);
+        collider.collide(scene0->intersectors.ptr,scene1->intersectors.ptr,callback,userPtr);
+      }
+
+      /*! Intersects a single ray with the scene. */
+      __forceinline void intersect (RTCRayHit& ray, IntersectContext* context) {
+        assert(intersector1.intersect);
+        intersector1.intersect(this,ray,context);
+      }
+
+      /*! Intersects a packet of 4 rays with the scene. */
+      __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, IntersectContext* context) {
+        assert(intersector4.intersect);
+        intersector4.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a packet of 8 rays with the scene. */
+      __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, IntersectContext* context) {
+        assert(intersector8.intersect);
+        intersector8.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a packet of 16 rays with the scene. */
+      __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, IntersectContext* context) {
+        assert(intersector16.intersect);
+        intersector16.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a stream of N rays in SOA layout with the scene. */
+      __forceinline void intersectN (RTCRayHitN** rayN, const size_t N, IntersectContext* context)
+      {
+        assert(intersectorN.intersect);
+        intersectorN.intersect(this,rayN,N,context);
+      }
+      
+#if defined(__SSE__)
+      __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
+        const vint<4> mask = valid.mask32();
+        intersect4(&mask,(RTCRayHit4&)ray,context);
+      }
+#endif
+#if defined(__AVX__)
+      __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, IntersectContext* context) {
+        const vint<8> mask = valid.mask32();
+        intersect8(&mask,(RTCRayHit8&)ray,context);
+      }
+#endif
+#if defined(__AVX512F__)
+      __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, IntersectContext* context) {
+        const vint<16> mask = valid.mask32();
+        intersect16(&mask,(RTCRayHit16&)ray,context);
+      }
+#endif
+      
+      template<int K>
+      __forceinline void intersectN (RayHitK<K>** rayN, const size_t N, IntersectContext* context)
+      {
+        intersectN((RTCRayHitN**)rayN,N,context);
+      }
+
+      /*! Tests if single ray is occluded by the scene. */
+      __forceinline void occluded (RTCRay& ray, IntersectContext* context) {
+        assert(intersector1.occluded);
+        intersector1.occluded(this,ray,context);
+      }
+      
+      /*! Tests if a packet of 4 rays is occluded by the scene. */
+      __forceinline void occluded4 (const void* valid, RTCRay4& ray, IntersectContext* context) {
+        assert(intersector4.occluded);
+        intersector4.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a packet of 8 rays is occluded by the scene. */
+      __forceinline void occluded8 (const void* valid, RTCRay8& ray, IntersectContext* context) {
+        assert(intersector8.occluded);
+        intersector8.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a packet of 16 rays is occluded by the scene. */
+      __forceinline void occluded16 (const void* valid, RTCRay16& ray, IntersectContext* context) {
+        assert(intersector16.occluded);
+        intersector16.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a stream of N rays in SOA layout is occluded by the scene. */
+      __forceinline void occludedN (RTCRayN** rayN, const size_t N, IntersectContext* context)
+      {
+        assert(intersectorN.occluded);
+        intersectorN.occluded(this,rayN,N,context);
+      }
+      
+#if defined(__SSE__)
+      __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
+        const vint<4> mask = valid.mask32();
+        occluded4(&mask,(RTCRay4&)ray,context);
+      }
+#endif
+#if defined(__AVX__)
+      __forceinline void occluded(const vbool8& valid, RayK<8>& ray, IntersectContext* context) {
+        const vint<8> mask = valid.mask32();
+        occluded8(&mask,(RTCRay8&)ray,context);
+      }
+#endif
+#if defined(__AVX512F__)
+      __forceinline void occluded(const vbool16& valid, RayK<16>& ray, IntersectContext* context) {
+        const vint<16> mask = valid.mask32();
+        occluded16(&mask,(RTCRay16&)ray,context);
+      }
+#endif
+
+      template<int K>
+      __forceinline void occludedN (RayK<K>** rayN, const size_t N, IntersectContext* context)
+      {
+        occludedN((RTCRayN**)rayN,N,context);
+      }
+
+      /*! Tests if single ray is occluded by the scene. */
+      __forceinline void intersect(RTCRay& ray, IntersectContext* context) {
+        occluded(ray, context);
+      }
+
+      /*! Tests if a packet of K rays is occluded by the scene. */
+      template<int K>
+      __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, IntersectContext* context) {
+        occluded(valid, ray, context);
+      }
+
+      /*! Tests if a packet of N rays in SOA layout is occluded by the scene. */
+      template<int K>
+      __forceinline void intersectN(RayK<K>** rayN, const size_t N, IntersectContext* context) {
+        occludedN(rayN, N, context);
+      }
+      
+    public:
+      AccelData* ptr;
+      void* leafIntersector;
+      Collider collider;
+      Intersector1 intersector1;
+      Intersector4 intersector4;
+      Intersector4 intersector4_filter;
+      Intersector4 intersector4_nofilter;
+      Intersector8 intersector8;
+      Intersector8 intersector8_filter;
+      Intersector8 intersector8_nofilter;
+      Intersector16 intersector16;
+      Intersector16 intersector16_filter;
+      Intersector16 intersector16_nofilter;
+      IntersectorN intersectorN;
+      IntersectorN intersectorN_filter;
+      IntersectorN intersectorN_nofilter;      
+    };
+  
+  public:
+
+    /*! Construction */
+    Accel (const AccelData::Type type) 
+      : AccelData(type) {}
+    
+    /*! Construction */
+    Accel (const AccelData::Type type, const Intersectors& intersectors) 
+      : AccelData(type), intersectors(intersectors) {}
+
+    /*! Virtual destructor */
+    virtual ~Accel() {}
+
+    /*! makes the acceleration structure immutable */
+    virtual void immutable () {}
+    
+    /*! build acceleration structure */
+    virtual void build () = 0;
+
+  public:
+    Intersectors intersectors;
+  };
+
+#define DEFINE_COLLIDER(symbol,collider)                                \
+  Accel::Collider symbol() {                                            \
+    return Accel::Collider((Accel::CollideFunc)collider::collide,       \
+                           TOSTRING(isa) "::" TOSTRING(symbol));        \
+  }
+
+#define DEFINE_INTERSECTOR1(symbol,intersector)                               \
+  Accel::Intersector1 symbol() {                                              \
+    return Accel::Intersector1((Accel::IntersectFunc )intersector::intersect, \
+                               (Accel::OccludedFunc  )intersector::occluded,  \
+                               (Accel::PointQueryFunc)intersector::pointQuery,\
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  
+#define DEFINE_INTERSECTOR4(symbol,intersector)                               \
+  Accel::Intersector4 symbol() {                                              \
+    return Accel::Intersector4((Accel::IntersectFunc4)intersector::intersect, \
+                               (Accel::OccludedFunc4)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  
+#define DEFINE_INTERSECTOR8(symbol,intersector)                               \
+  Accel::Intersector8 symbol() {                                              \
+    return Accel::Intersector8((Accel::IntersectFunc8)intersector::intersect, \
+                               (Accel::OccludedFunc8)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+
+#define DEFINE_INTERSECTOR16(symbol,intersector)                                \
+  Accel::Intersector16 symbol() {                                               \
+    return Accel::Intersector16((Accel::IntersectFunc16)intersector::intersect, \
+                                (Accel::OccludedFunc16)intersector::occluded,   \
+                                TOSTRING(isa) "::" TOSTRING(symbol));           \
+  }
+
+#define DEFINE_INTERSECTORN(symbol,intersector)                               \
+  Accel::IntersectorN symbol() {                                              \
+    return Accel::IntersectorN((Accel::IntersectFuncN)intersector::intersect, \
+                               (Accel::OccludedFuncN)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+
+  /* ray stream filter interface */
+  typedef void (*intersectStreamAOS_func)(Scene* scene, RTCRayHit*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
+  typedef void (*intersectStreamAOP_func)(Scene* scene, RTCRayHit** _rayN, const size_t N, IntersectContext* context);
+  typedef void (*intersectStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+  typedef void (*intersectStreamSOP_func)(Scene* scene, const RTCRayHitNp* rayN, const size_t N, IntersectContext* context);
+
+  typedef void (*occludedStreamAOS_func)(Scene* scene, RTCRay*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
+  typedef void (*occludedStreamAOP_func)(Scene* scene, RTCRay** _rayN, const size_t N, IntersectContext* context);
+  typedef void (*occludedStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+  typedef void (*occludedStreamSOP_func)(Scene* scene, const RTCRayNp* rayN, const size_t N, IntersectContext* context);
+
+  struct RayStreamFilterFuncs
+  {
+    RayStreamFilterFuncs()
+    : intersectAOS(nullptr), intersectAOP(nullptr), intersectSOA(nullptr), intersectSOP(nullptr),
+      occludedAOS(nullptr),  occludedAOP(nullptr),  occludedSOA(nullptr),  occludedSOP(nullptr) {}
+
+    RayStreamFilterFuncs(void (*ptr) ())
+    : intersectAOS((intersectStreamAOS_func) ptr), intersectAOP((intersectStreamAOP_func) ptr), intersectSOA((intersectStreamSOA_func) ptr), intersectSOP((intersectStreamSOP_func) ptr),
+      occludedAOS((occludedStreamAOS_func) ptr),   occludedAOP((occludedStreamAOP_func) ptr),   occludedSOA((occludedStreamSOA_func) ptr),   occludedSOP((occludedStreamSOP_func) ptr) {}
+
+    RayStreamFilterFuncs(intersectStreamAOS_func intersectAOS, intersectStreamAOP_func intersectAOP, intersectStreamSOA_func intersectSOA, intersectStreamSOP_func intersectSOP,
+                         occludedStreamAOS_func  occludedAOS,  occludedStreamAOP_func  occludedAOP,  occludedStreamSOA_func  occludedSOA,  occludedStreamSOP_func  occludedSOP)
+    : intersectAOS(intersectAOS), intersectAOP(intersectAOP), intersectSOA(intersectSOA), intersectSOP(intersectSOP),
+      occludedAOS(occludedAOS),   occludedAOP(occludedAOP),   occludedSOA(occludedSOA),   occludedSOP(occludedSOP) {}
+
+  public:
+    intersectStreamAOS_func intersectAOS;
+    intersectStreamAOP_func intersectAOP;
+    intersectStreamSOA_func intersectSOA;
+    intersectStreamSOP_func intersectSOP;
+
+    occludedStreamAOS_func occludedAOS;
+    occludedStreamAOP_func occludedAOP;
+    occludedStreamSOA_func occludedSOA;
+    occludedStreamSOP_func occludedSOP;
+  }; 
+
+  typedef RayStreamFilterFuncs (*RayStreamFilterFuncsType)();
+}
diff --git a/thirdparty/embree/kernels/common/accelinstance.h b/thirdparty/embree/kernels/common/accelinstance.h
new file mode 100644
index 0000000000..c63ef998bd
--- /dev/null
+++ b/thirdparty/embree/kernels/common/accelinstance.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+#include "builder.h"
+
+namespace embree
+{
+  class AccelInstance : public Accel
+  {
+  public:
+    AccelInstance (AccelData* accel, Builder* builder, Intersectors& intersectors)
+      : Accel(AccelData::TY_ACCEL_INSTANCE,intersectors), accel(accel), builder(builder) {}
+
+    void immutable () {
+      builder.reset(nullptr);
+    }
+
+  public:
+    void build () {
+      if (builder) builder->build();
+      bounds = accel->bounds;
+    }
+
+    void deleteGeometry(size_t geomID) {
+      if (accel  ) accel->deleteGeometry(geomID);
+      if (builder) builder->deleteGeometry(geomID);
+    }
+    
+    void clear() {
+      if (accel) accel->clear();
+      if (builder) builder->clear();
+    }
+
+  private:
+    std::unique_ptr<AccelData> accel;
+    std::unique_ptr<Builder> builder;
+  };
+}
diff --git a/thirdparty/embree/kernels/common/acceln.cpp b/thirdparty/embree/kernels/common/acceln.cpp
new file mode 100644
index 0000000000..32a27c560a
--- /dev/null
+++ b/thirdparty/embree/kernels/common/acceln.cpp
@@ -0,0 +1,232 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "acceln.h"
+#include "ray.h"
+#include "../../include/embree3/rtcore_ray.h"
+#include "../../common/algorithms/parallel_for.h"
+
+namespace embree
+{
+  AccelN::AccelN()
+    : Accel(AccelData::TY_ACCELN), accels() {}
+
+  AccelN::~AccelN() 
+  {
+    for (size_t i=0; i<accels.size(); i++)
+      delete accels[i];
+  }
+
+  void AccelN::accels_add(Accel* accel) 
+  {
+    assert(accel);
+    accels.push_back(accel);
+  }
+
+  void AccelN::accels_init() 
+  {
+    for (size_t i=0; i<accels.size(); i++)
+      delete accels[i];
+    
+    accels.clear();
+  }
+
+  bool AccelN::pointQuery (Accel::Intersectors* This_in, PointQuery* query, PointQueryContext* context)
+  {
+    bool changed = false;
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++)
+      if (!This->accels[i]->isEmpty())
+        changed |= This->accels[i]->intersectors.pointQuery(query,context);
+    return changed;
+  }
+
+  void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context) 
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++)
+      if (!This->accels[i]->isEmpty())
+        This->accels[i]->intersectors.intersect(ray,context);
+  }
+
+  void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context) 
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++)
+      if (!This->accels[i]->isEmpty())
+        This->accels[i]->intersectors.intersect4(valid,ray,context);
+  }
+
+  void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context) 
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++)
+      if (!This->accels[i]->isEmpty())
+        This->accels[i]->intersectors.intersect8(valid,ray,context);
+  }
+
+  void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context) 
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++)
+      if (!This->accels[i]->isEmpty())
+        This->accels[i]->intersectors.intersect16(valid,ray,context);
+  }
+
+  void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context)
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++)
+      if (!This->accels[i]->isEmpty())
+        This->accels[i]->intersectors.intersectN(ray,N,context);
+  }
+
+  void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context) 
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++) {
+      if (This->accels[i]->isEmpty()) continue;
+      This->accels[i]->intersectors.occluded(ray,context); 
+      if (ray.tfar < 0.0f) break; 
+    }
+  }
+
+  void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context) 
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++) {
+      if (This->accels[i]->isEmpty()) continue;
+      This->accels[i]->intersectors.occluded4(valid,ray,context);
+#if defined(__SSE2__)
+      vbool4 valid0 = asBool(((vint4*)valid)[0]);
+      vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
+      if (unlikely(none(valid0 & hit0))) break;
+#endif
+    }
+  }
+
+  void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) 
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++) {
+      if (This->accels[i]->isEmpty()) continue;
+      This->accels[i]->intersectors.occluded8(valid,ray,context);
+#if defined(__SSE2__) // FIXME: use higher ISA
+      vbool4 valid0 = asBool(((vint4*)valid)[0]);
+      vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
+      vbool4 valid1 = asBool(((vint4*)valid)[1]);
+      vbool4 hit1   = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+      if (unlikely((none((valid0 & hit0) | (valid1 & hit1))))) break;
+#endif
+    }
+  }
+
+  void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context) 
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    for (size_t i=0; i<This->accels.size(); i++) {
+      if (This->accels[i]->isEmpty()) continue;
+      This->accels[i]->intersectors.occluded16(valid,ray,context);
+#if defined(__SSE2__) // FIXME: use higher ISA
+      vbool4 valid0 = asBool(((vint4*)valid)[0]);
+      vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
+      vbool4 valid1 = asBool(((vint4*)valid)[1]);
+      vbool4 hit1   = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+      vbool4 valid2 = asBool(((vint4*)valid)[2]);
+      vbool4 hit2   = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero);
+      vbool4 valid3 = asBool(((vint4*)valid)[3]);
+      vbool4 hit3   = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero);
+      if (unlikely((none((valid0 & hit0) | (valid1 & hit1) | (valid2 & hit2) | (valid3 & hit3))))) break;
+#endif
+    }
+  }
+
+  void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context)
+  {
+    AccelN* This = (AccelN*)This_in->ptr;
+    size_t M = N;
+    for (size_t i=0; i<This->accels.size(); i++)
+      if (!This->accels[i]->isEmpty())
+        This->accels[i]->intersectors.occludedN(ray,M,context);
+  }
+
+  void AccelN::accels_print(size_t ident)
+  {
+    for (size_t i=0; i<accels.size(); i++)
+    {
+      for (size_t j=0; j<ident; j++) std::cout << " "; 
+      std::cout << "accels[" << i << "]" << std::endl;
+      accels[i]->intersectors.print(ident+2);
+    }
+  }
+
+  void AccelN::accels_immutable()
+  {
+    for (size_t i=0; i<accels.size(); i++)
+      accels[i]->immutable();
+  }
+  
+  void AccelN::accels_build () 
+  {
+    /* reduce memory consumption */
+    accels.shrink_to_fit();
+    
+    /* build all acceleration structures in parallel */
+    parallel_for (accels.size(), [&] (size_t i) { 
+        accels[i]->build();
+      });
+
+    /* create list of non-empty acceleration structures */
+    bool valid1 = true;
+    bool valid4 = true;
+    bool valid8 = true;
+    bool valid16 = true;
+    for (size_t i=0; i<accels.size(); i++) {
+      valid1 &= (bool) accels[i]->intersectors.intersector1;
+      valid4 &= (bool) accels[i]->intersectors.intersector4;
+      valid8 &= (bool) accels[i]->intersectors.intersector8;
+      valid16 &= (bool) accels[i]->intersectors.intersector16;
+    }
+
+    if (accels.size() == 1) {
+      type = accels[0]->type; // FIXME: should just assign entire Accel
+      bounds = accels[0]->bounds;
+      intersectors = accels[0]->intersectors;
+    }
+    else 
+    {
+      type = AccelData::TY_ACCELN;
+      intersectors.ptr = this;
+      intersectors.intersector1  = Intersector1(&intersect,&occluded,&pointQuery,valid1 ? "AccelN::intersector1": nullptr);
+      intersectors.intersector4  = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr);
+      intersectors.intersector8  = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr);
+      intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr);
+      intersectors.intersectorN  = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN");
+
+      /*! calculate bounds */
+      bounds = empty;
+      for (size_t i=0; i<accels.size(); i++) 
+        bounds.extend(accels[i]->bounds);
+    }
+  }
+
+  void AccelN::accels_select(bool filter)
+  {
+    for (size_t i=0; i<accels.size(); i++) 
+      accels[i]->intersectors.select(filter);
+  }
+
+  void AccelN::accels_deleteGeometry(size_t geomID) 
+  {
+    for (size_t i=0; i<accels.size(); i++) 
+      accels[i]->deleteGeometry(geomID);
+  }
+
+  void AccelN::accels_clear()
+  {
+    for (size_t i=0; i<accels.size(); i++) {
+      accels[i]->clear();
+    }
+  }
+}
+
diff --git a/thirdparty/embree/kernels/common/acceln.h b/thirdparty/embree/kernels/common/acceln.h
new file mode 100644
index 0000000000..0445b2e811
--- /dev/null
+++ b/thirdparty/embree/kernels/common/acceln.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+
+namespace embree
+{
+  /*! merges N acceleration structures together, by processing them in order */
+  class AccelN : public Accel
+  {
+  public:
+    AccelN ();
+    ~AccelN();
+
+  public:
+    void accels_add(Accel* accel);
+    void accels_init();
+
+  public:
+    static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
+
+  public:
+    static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context);
+    static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context);
+    static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context);
+    static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context);
+    static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context);
+
+  public:
+    static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context);
+    static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context);
+    static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context);
+    static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context);
+    static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context);
+
+  public:
+    void accels_print(size_t ident);
+    void accels_immutable();
+    void accels_build ();
+    void accels_select(bool filter);
+    void accels_deleteGeometry(size_t geomID);
+    void accels_clear ();
+
+  public:
+    std::vector<Accel*> accels;
+  };
+}
diff --git a/thirdparty/embree/kernels/common/accelset.cpp b/thirdparty/embree/kernels/common/accelset.cpp
new file mode 100644
index 0000000000..8c18f31776
--- /dev/null
+++ b/thirdparty/embree/kernels/common/accelset.cpp
@@ -0,0 +1,17 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "accelset.h"
+#include "scene.h"
+
+namespace embree
+{
+  AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps) 
+    : Geometry(device,gtype,(unsigned int)numItems,(unsigned int)numTimeSteps), boundsFunc(nullptr) {}
+
+  AccelSet::IntersectorN::IntersectorN (ErrorFunc error) 
+    : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
+  
+  AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name)
+    : intersect(intersect), occluded(occluded), name(name) {}
+}
diff --git a/thirdparty/embree/kernels/common/accelset.h b/thirdparty/embree/kernels/common/accelset.h
new file mode 100644
index 0000000000..90b184a07b
--- /dev/null
+++ b/thirdparty/embree/kernels/common/accelset.h
@@ -0,0 +1,248 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "builder.h"
+#include "geometry.h"
+#include "ray.h"
+#include "hit.h"
+
+namespace embree
+{
+  struct IntersectFunctionNArguments;
+  struct OccludedFunctionNArguments;
+  
+  typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+  typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+  
+  struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments
+  {
+    IntersectContext* internal_context;
+    Geometry* geometry;
+    ReportIntersectionFunc report;
+  };
+
+  struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments
+  {
+    IntersectContext* internal_context;
+    Geometry* geometry;
+    ReportOcclusionFunc report;
+  };
+
+  /*! Base class for set of acceleration structures. */
+  class AccelSet : public Geometry
+  {
+  public:
+    typedef RTCIntersectFunctionN IntersectFuncN;  
+    typedef RTCOccludedFunctionN OccludedFuncN;
+    typedef void (*ErrorFunc) ();
+
+      struct IntersectorN
+      {
+        IntersectorN (ErrorFunc error = nullptr) ;
+        IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name);
+        
+        operator bool() const { return name; }
+        
+      public:
+        static const char* type;
+        IntersectFuncN intersect;
+        OccludedFuncN occluded; 
+        const char* name;
+      };
+      
+    public:
+      
+      /*! construction */
+      AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps);
+      
+      /*! makes the acceleration structure immutable */
+      virtual void immutable () {}
+      
+      /*! build accel */
+      virtual void build () = 0;
+
+      /*! check if the i'th primitive is valid between the specified time range */
+      __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+      {
+        for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+          if (!isvalid_non_empty(bounds(i,itime))) return false;
+        
+        return true;
+      }
+
+      /*! Calculates the bounds of an item */
+      __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const
+      {
+        BBox3fa box;
+        assert(i < size());
+        RTCBoundsFunctionArguments args;
+        args.geometryUserPtr = userPtr;
+        args.primID = (unsigned int)i;
+        args.timeStep = (unsigned int)itime;
+        args.bounds_o = (RTCBounds*)&box;
+        boundsFunc(&args);
+        return box;
+      }
+
+      /*! calculates the linear bounds of the i'th item at the itime'th time segment */
+      __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const
+      {
+        BBox3fa box[2];
+        assert(i < size());
+        RTCBoundsFunctionArguments args;
+        args.geometryUserPtr = userPtr;
+        args.primID = (unsigned int)i;
+        args.timeStep = (unsigned int)(itime+0);
+        args.bounds_o = (RTCBounds*)&box[0];
+        boundsFunc(&args);
+        args.timeStep = (unsigned int)(itime+1);
+        args.bounds_o = (RTCBounds*)&box[1];
+        boundsFunc(&args);
+        return LBBox3fa(box[0],box[1]);
+      }
+
+      /*! calculates the build bounds of the i'th item, if it's valid */
+      __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+      {
+        const BBox3fa b = bounds(i);
+        if (bbox) *bbox = b;
+        return isvalid_non_empty(b);
+      }
+
+      /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */
+      __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+      {
+        const LBBox3fa bounds = linearBounds(i,itime);
+        bbox = bounds.bounds0; // use bounding box of first timestep to build BVH
+        return isvalid_non_empty(bounds);
+      }
+
+      /*! calculates the linear bounds of the i'th primitive for the specified time range */
+      __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+        return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+      }
+      
+      /*! calculates the linear bounds of the i'th primitive for the specified time range */
+      __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const  {
+        if (!valid(i, timeSegmentRange(time_range))) return false;
+        bbox = linearBounds(i, time_range);
+        return true;
+      }
+
+      /* gets version info of topology */
+      unsigned int getTopologyVersion() const {
+        return numPrimitives;
+      }
+    
+      /* returns true if topology changed */
+      bool topologyChanged(unsigned int otherVersion) const {
+        return numPrimitives != otherVersion;
+      }
+
+  public:
+
+      /*! Intersects a single ray with the scene. */
+      __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+      {
+        assert(primID < size());
+        assert(intersectorN.intersect);
+        
+        int mask = -1;
+        IntersectFunctionNArguments args;
+        args.valid = &mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.rayhit = (RTCRayHitN*)&ray;
+        args.N = 1;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+        
+        intersectorN.intersect(&args);
+      }
+
+      /*! Tests if single ray is occluded by the scene. */
+      __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.occluded);
+        
+        int mask = -1;
+        OccludedFunctionNArguments args;
+        args.valid = &mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.ray = (RTCRayN*)&ray;
+        args.N = 1;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+        
+        intersectorN.occluded(&args);
+      }
+   
+      /*! Intersects a packet of K rays with the scene. */
+      template<int K>
+        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+      {
+        assert(primID < size());
+        assert(intersectorN.intersect);
+        
+        vint<K> mask = valid.mask32();
+        IntersectFunctionNArguments args;
+        args.valid = (int*)&mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.rayhit = (RTCRayHitN*)&ray;
+        args.N = K;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+         
+        intersectorN.intersect(&args);
+      }
+
+      /*! Tests if a packet of K rays is occluded by the scene. */
+      template<int K>
+        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.occluded);
+        
+        vint<K> mask = valid.mask32();
+        OccludedFunctionNArguments args;
+        args.valid = (int*)&mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.ray = (RTCRayN*)&ray;
+        args.N = K;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+        
+        intersectorN.occluded(&args);
+      }
+
+    public:
+      RTCBoundsFunction boundsFunc;
+      IntersectorN intersectorN;
+  };
+  
+#define DEFINE_SET_INTERSECTORN(symbol,intersector)                     \
+  AccelSet::IntersectorN symbol() {                                     \
+    return AccelSet::IntersectorN(intersector::intersect, \
+                                  intersector::occluded, \
+                                  TOSTRING(isa) "::" TOSTRING(symbol)); \
+  }
+}
diff --git a/thirdparty/embree/kernels/common/alloc.cpp b/thirdparty/embree/kernels/common/alloc.cpp
new file mode 100644
index 0000000000..1a0e1aeed3
--- /dev/null
+++ b/thirdparty/embree/kernels/common/alloc.cpp
@@ -0,0 +1,79 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "../../common/sys/thread.h"
+
+namespace embree
+{
+  __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr;
+  SpinLock FastAllocator::s_thread_local_allocators_lock;
+  std::vector<std::unique_ptr<FastAllocator::ThreadLocal2>> FastAllocator::s_thread_local_allocators;
+   
+  struct fast_allocator_regression_test : public RegressionTest
+  {
+    BarrierSys barrier;
+    std::atomic<size_t> numFailed;
+    std::unique_ptr<FastAllocator> alloc;
+
+    fast_allocator_regression_test() 
+      : RegressionTest("fast_allocator_regression_test"), numFailed(0)
+    {
+      registerRegressionTest(this);
+    }
+
+    static void thread_alloc(fast_allocator_regression_test* This)
+    {
+      FastAllocator::CachedAllocator threadalloc = This->alloc->getCachedAllocator();
+
+      size_t* ptrs[1000];
+      for (size_t j=0; j<1000; j++)
+      {
+        This->barrier.wait();
+        for (size_t i=0; i<1000; i++) {
+          ptrs[i] = (size_t*) threadalloc.malloc0(sizeof(size_t)+(i%32));
+          *ptrs[i] = size_t(threadalloc.talloc0) + i;
+        }
+        for (size_t i=0; i<1000; i++) {
+          if (*ptrs[i] != size_t(threadalloc.talloc0) + i) 
+            This->numFailed++;
+        }
+        This->barrier.wait();
+      }
+    }
+    
+    bool run ()
+    {
+      alloc = make_unique(new FastAllocator(nullptr,false));
+      numFailed.store(0);
+
+      size_t numThreads = getNumberOfLogicalThreads();
+      barrier.init(numThreads+1);
+
+      /* create threads */
+      std::vector<thread_t> threads;
+      for (size_t i=0; i<numThreads; i++)
+        threads.push_back(createThread((thread_func)thread_alloc,this));
+
+      /* run test */ 
+      for (size_t i=0; i<1000; i++)
+      {
+        alloc->reset();
+        barrier.wait();
+        barrier.wait();
+      }
+     
+      /* destroy threads */
+      for (size_t i=0; i<numThreads; i++)
+        join(threads[i]);
+
+      alloc = nullptr;
+
+      return numFailed == 0;
+    }
+  };
+
+  fast_allocator_regression_test fast_allocator_regression;
+}
+
+
diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h
new file mode 100644
index 0000000000..4458e35c24
--- /dev/null
+++ b/thirdparty/embree/kernels/common/alloc.h
@@ -0,0 +1,958 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "primref.h"
+
+namespace embree
+{
+  class FastAllocator
+  {
+    /*! maximum supported alignment */
+    static const size_t maxAlignment = 64;
+
+    /*! maximum allocation size */
+
+    /* default settings */
+    //static const size_t defaultBlockSize = 4096;
+#define maxAllocationSize size_t(2*1024*1024-maxAlignment)
+
+    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;
+
+  public:
+
+    struct ThreadLocal2;
+    enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };
+
+    /*! Per thread structure holding the current memory block. */
+    struct __aligned(64) ThreadLocal
+    {
+      ALIGNED_CLASS_(64);
+    public:
+
+      /*! Constructor for usage with ThreadLocalData */
+      __forceinline ThreadLocal (ThreadLocal2* parent) 
+	: parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}
+
+      /*! initialize allocator */
+      void init(FastAllocator* alloc) 
+      {
+        ptr = nullptr;
+	cur = end = 0;
+        bytesUsed = 0;
+        bytesWasted = 0;
+        allocBlockSize = 0;
+        if (alloc) allocBlockSize = alloc->defaultBlockSize;
+      }
+
+      /* Allocate aligned memory from the threads memory block. */
+      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) 
+      {
+        /* bind the thread local allocator to the proper FastAllocator*/
+        parent->bind(alloc);
+
+        assert(align <= maxAlignment);
+	bytesUsed += bytes;
+
+        /* try to allocate in local block */
+	size_t ofs = (align - cur) & (align-1);
+        cur += bytes + ofs;
+        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+	cur -= bytes + ofs;
+        
+        /* if allocation is too large allocate with parent allocator */
+        if (4*bytes > allocBlockSize) {
+          return alloc->malloc(bytes,maxAlignment,false);
+	}
+
+        /* get new partial block if allocation failed */
+        size_t blockSize = allocBlockSize;
+        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
+ 	bytesWasted += end-cur;
+	cur = 0; end = blockSize;
+
+        /* retry allocation */
+	ofs = (align - cur) & (align-1);
+        cur += bytes + ofs;
+        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+	cur -= bytes + ofs;
+
+        /* get new full block if allocation failed */
+        blockSize = allocBlockSize;
+        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
+	bytesWasted += end-cur;
+	cur = 0; end = blockSize;
+
+        /* retry allocation */
+	ofs = (align - cur) & (align-1);
+        cur += bytes + ofs;
+        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+	cur -= bytes + ofs;
+
+        /* should never happen as large allocations get handled specially above */
+        assert(false);
+        return nullptr;
+      }
+
+      
+      /*! returns amount of used bytes */
+      __forceinline size_t getUsedBytes() const { return bytesUsed; }
+  
+      /*! returns amount of free bytes */
+      __forceinline size_t getFreeBytes() const { return end-cur; }
+      
+      /*! returns amount of wasted bytes */
+      __forceinline size_t getWastedBytes() const { return bytesWasted; }
+  
+    private:
+      ThreadLocal2* parent;
+      char*  ptr;            //!< pointer to memory block
+      size_t cur;            //!< current location of the allocator
+      size_t end;            //!< end of the memory block
+      size_t allocBlockSize; //!< block size for allocations
+      size_t bytesUsed;      //!< number of total bytes allocated
+      size_t bytesWasted;    //!< number of bytes wasted
+    };
+
+    /*! Two thread local structures. */
+    struct __aligned(64) ThreadLocal2
+    {
+      ALIGNED_CLASS_(64);
+    public:
+
+      __forceinline ThreadLocal2()
+        : alloc(nullptr), alloc0(this), alloc1(this) {}
+
+      /*! bind to fast allocator */
+      __forceinline void bind(FastAllocator* alloc_i) 
+      {
+        assert(alloc_i);
+        if (alloc.load() == alloc_i) return;
+        Lock<SpinLock> lock(mutex);
+        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
+        if (alloc.load()) {
+          alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
+          alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
+          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
+        }
+        alloc0.init(alloc_i);
+        alloc1.init(alloc_i);
+        alloc.store(alloc_i);
+        alloc_i->join(this);
+      }
+
+      /*! unbind to fast allocator */
+      void unbind(FastAllocator* alloc_i) 
+      {
+        assert(alloc_i);
+        if (alloc.load() != alloc_i) return;
+        Lock<SpinLock> lock(mutex);
+        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
+        alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
+        alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
+        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
+        alloc0.init(nullptr);
+        alloc1.init(nullptr);
+        alloc.store(nullptr);
+      }
+
+    public:
+      SpinLock mutex;        //!< required as unbind is called from other threads
+      std::atomic<FastAllocator*> alloc;  //!< parent allocator
+      ThreadLocal alloc0;
+      ThreadLocal alloc1;
+    };
+
+    FastAllocator (Device* device, bool osAllocation) 
+      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
+        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
+        primrefarray(device,0)
+    {
+      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
+      {
+        threadUsedBlocks[i] = nullptr;
+        threadBlocks[i] = nullptr;
+        assert(!slotMutex[i].isLocked());
+      }
+    }
+
+    ~FastAllocator () {
+      clear();
+    }
+
+    /*! returns the device attached to this allocator */
+    Device* getDevice() {
+      return device;
+    }
+
+    void share(mvector<PrimRef>& primrefarray_i) {
+      primrefarray = std::move(primrefarray_i);
+    }
+
+    void unshare(mvector<PrimRef>& primrefarray_o)
+    {
+      reset(); // this removes blocks that are allocated inside the shared primref array
+      primrefarray_o = std::move(primrefarray);
+    }
+
+    /*! returns first fast thread local allocator */
+    __forceinline ThreadLocal* _threadLocal() {
+      return &threadLocal2()->alloc0;
+    }
+
+    void setOSallocation(bool flag)
+    {
+      atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
+    }
+
+  private:
+
+    /*! returns both fast thread local allocators */
+    __forceinline ThreadLocal2* threadLocal2() 
+    {
+      ThreadLocal2* alloc = thread_local_allocator2;
+      if (alloc == nullptr) {
+        thread_local_allocator2 = alloc = new ThreadLocal2;
+        Lock<SpinLock> lock(s_thread_local_allocators_lock);
+        s_thread_local_allocators.push_back(make_unique(alloc));
+      }
+      return alloc;
+    }
+
+  public:
+
+    __forceinline void join(ThreadLocal2* alloc)
+    {
+      Lock<SpinLock> lock(thread_local_allocators_lock);
+      thread_local_allocators.push_back(alloc);
+    }
+
+  public:
+
+    struct CachedAllocator
+    {
+      __forceinline CachedAllocator(void* ptr)
+        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr) 
+      {
+        assert(ptr == nullptr);
+      }
+
+      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
+        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}
+
+      __forceinline operator bool () const {
+        return alloc != nullptr;
+      }
+
+      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
+        return talloc0->malloc(alloc,bytes,align);
+      }
+
+      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
+        return talloc0->malloc(alloc,bytes,align);
+      }
+
+      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
+        return talloc1->malloc(alloc,bytes,align);
+      }
+
+    public:
+      FastAllocator* alloc;
+      ThreadLocal* talloc0;
+      ThreadLocal* talloc1;
+    };
+
+    __forceinline CachedAllocator getCachedAllocator() {
+      return CachedAllocator(this,threadLocal2());
+    }
+
+    /*! Builder interface to create thread local allocator */
+    struct Create
+    {
+    public:
+      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
+      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator();  }
+
+    private:
+      FastAllocator* allocator;
+    };
+
+    void internal_fix_used_blocks()
+    {
+      /* move thread local blocks to global block list */
+      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
+      {
+        while (threadBlocks[i].load() != nullptr) {
+          Block* nextUsedBlock = threadBlocks[i].load()->next;
+          threadBlocks[i].load()->next = usedBlocks.load();
+          usedBlocks = threadBlocks[i].load();
+          threadBlocks[i] = nextUsedBlock;
+        }
+        threadBlocks[i] = nullptr;
+      }
+    }
+
+    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
+    static const size_t mainAllocOverheadStatic  = 20;  //! 20 means 5% allocation overhead through unfilled main alloc blocks
+    static const size_t mainAllocOverheadDynamic = 8;  //! 20 means 12.5% allocation overhead through unfilled main alloc blocks
+
+    /* calculates a single threaded threshold for the builders such
+     * that for small scenes the overhead of partly allocated blocks
+     * per thread is low */
+    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
+    {
+      if (numPrimitives == 0 || bytesEstimated == 0) 
+        return defaultThreshold;
+
+      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
+      const size_t single_mode_factor = use_single_mode ? 1 : 2;
+      const size_t threadCount = TaskScheduler::threadCount();
+      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;
+
+      /* if we do not have to limit number of threads use optimal thresdhold */
+      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
+        return defaultThreshold;
+
+      /* otherwise limit number of threads by calculating proper single thread threshold */
+      else {
+        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
+        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive)); 
+      }
+    }
+
+    __forceinline size_t alignSize(size_t i) {
+      return (i+127)/128*128;
+    }
+
+    /*! initializes the grow size */
+    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) 
+    {
+      /* we do not need single thread local allocator mode */
+      use_single_mode = false;
+     
+      /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */
+      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
+      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
+      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);
+
+      /* if we reached the maxAllocationSize for growSize, we can
+       * increase the number of allocation slots by still guaranteeing
+       * the mainAllocationOverhead */
+      slotMask = 0x0;
+
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */
+
+      /* set the thread local alloc block size */
+      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;
+      
+      /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
+#if 0 // we do not do this as a block size of 4160 if for some reason best for KNL
+      const size_t threadCount = TaskScheduler::threadCount();
+      const size_t single_mode_factor = use_single_mode ? 1 : 2;
+      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
+      if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
+        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);
+
+      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
+        else
+#endif
+        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);
+
+      if (bytesEstimated == 0) {
+        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
+        defaultBlockSize = defaultBlockSizeSwitch;
+      }
+      log2_grow_size_scale = 0;
+      
+      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
+      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
+      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
+      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
+      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
+      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
+      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
+    }
+
+    /*! initializes the allocator */
+    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
+    {
+      internal_fix_used_blocks();
+      /* distribute the allocation to multiple thread block slots */
+      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
+      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
+      if (bytesReserve == 0) bytesReserve = bytesAllocate;
+      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
+      estimatedSize = bytesEstimate;
+      initGrowSizeAndNumSlots(bytesEstimate,true);
+    }
+
+    /*! initializes the allocator */
+    void init_estimate(size_t bytesEstimate)
+    {
+      internal_fix_used_blocks();
+      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
+      /* single allocator mode ? */
+      estimatedSize = bytesEstimate;
+      //initGrowSizeAndNumSlots(bytesEstimate,false);
+      initGrowSizeAndNumSlots(bytesEstimate,false);
+
+    }
+
+    /*! frees state not required after build */
+    __forceinline void cleanup()
+    {
+      internal_fix_used_blocks();
+
+      /* unbind all thread local allocators */
+      for (auto alloc : thread_local_allocators) alloc->unbind(this);
+      thread_local_allocators.clear();
+    }
+
+    /*! resets the allocator, memory blocks get reused */
+    void reset ()
+    {
+      internal_fix_used_blocks();
+
+      bytesUsed.store(0);
+      bytesFree.store(0);
+      bytesWasted.store(0);
+
+      /* reset all used blocks and move them to begin of free block list */
+      while (usedBlocks.load() != nullptr) {
+        usedBlocks.load()->reset_block();
+        Block* nextUsedBlock = usedBlocks.load()->next;
+        usedBlocks.load()->next = freeBlocks.load();
+        freeBlocks = usedBlocks.load();
+        usedBlocks = nextUsedBlock;
+      }
+
+      /* remove all shared blocks as they are re-added during build */
+      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));
+
+      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
+      {
+        threadUsedBlocks[i] = nullptr;
+        threadBlocks[i] = nullptr;
+      }
+      
+      /* unbind all thread local allocators */
+      for (auto alloc : thread_local_allocators) alloc->unbind(this);
+      thread_local_allocators.clear();
+    }
+
+    /*! frees all allocated memory */
+    __forceinline void clear()
+    {
+      cleanup();
+      bytesUsed.store(0);
+      bytesFree.store(0);
+      bytesWasted.store(0);
+      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr;
+      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr;
+      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
+        threadUsedBlocks[i] = nullptr;
+        threadBlocks[i] = nullptr;
+      }
+      primrefarray.clear();
+    }
+
+    __forceinline size_t incGrowSizeScale()
+    {
+      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
+      return size_t(1) << min(size_t(16),scale);
+    }
+
+    /*! thread safe allocation of memory */
+    void* malloc(size_t& bytes, size_t align, bool partial)
+    {
+      assert(align <= maxAlignment);
+
+      while (true)
+      {
+        /* allocate using current block */
+        size_t threadID = TaskScheduler::threadID();
+        size_t slot = threadID & slotMask;
+	Block* myUsedBlocks = threadUsedBlocks[slot];
+        if (myUsedBlocks) {
+          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
+          if (ptr) return ptr;
+        }
+
+        /* throw error if allocation is too large */
+        if (bytes > maxAllocationSize)
+          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");
+
+        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
+        if (likely(freeBlocks.load() == nullptr))
+        {
+          Lock<SpinLock> lock(slotMutex[slot]);
+          if (myUsedBlocks == threadUsedBlocks[slot]) {
+            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
+            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
+            assert(allocSize >= bytes);
+            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
+            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
+          }
+          continue;
+        }
+
+        /* if this fails allocate new block */
+        {
+          Lock<SpinLock> lock(mutex);
+	  if (myUsedBlocks == threadUsedBlocks[slot])
+	  {
+            if (freeBlocks.load() != nullptr) {
+	      Block* nextFreeBlock = freeBlocks.load()->next;
+	      freeBlocks.load()->next = usedBlocks;
+	      __memory_barrier();
+	      usedBlocks = freeBlocks.load();
+              threadUsedBlocks[slot] = freeBlocks.load();
+	      freeBlocks = nextFreeBlock;
+	    } else {
+              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
+	      usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
+	    }
+          }
+        }
+      }
+    }
+
+    /*! add new block */
+    void addBlock(void* ptr, ssize_t bytes)
+    {
+      Lock<SpinLock> lock(mutex);
+      const size_t sizeof_Header = offsetof(Block,data[0]);
+      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
+      size_t ofs = (size_t) aptr - (size_t) ptr;
+      bytes -= ofs;
+      if (bytes < 4096) return; // ignore empty or very small blocks
+      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
+    }
+
+    /* special allocation only used from morton builder only a single time for each build */
+    void* specialAlloc(size_t bytes)
+    {
+      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
+      return freeBlocks.load()->ptr();
+    }
+
+    struct Statistics
+    {
+      Statistics ()
+      : bytesUsed(0), bytesFree(0), bytesWasted(0) {}
+
+      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
+      : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}
+
+      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
+      : bytesUsed(0), bytesFree(0), bytesWasted(0)
+      {
+        Block* usedBlocks = alloc->usedBlocks.load();
+        Block* freeBlocks = alloc->freeBlocks.load();
+        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
+        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages);
+        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
+        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
+        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
+      }
+
+      std::string str(size_t numPrimitives)
+      {
+        std::stringstream str;
+        str.setf(std::ios::fixed, std::ios::floatfield);
+        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
+            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "            
+            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
+            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
+        return str.str();
+      }
+
+      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
+      {
+        return Statistics(a.bytesUsed+b.bytesUsed,
+                          a.bytesFree+b.bytesFree,
+                          a.bytesWasted+b.bytesWasted);
+      }
+
+      size_t bytesAllocatedTotal() const {
+        return bytesUsed + bytesFree + bytesWasted;
+      }
+
+    public:
+      size_t bytesUsed;
+      size_t bytesFree;
+      size_t bytesWasted;
+    };
+
+    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
+      return Statistics(this,atype,huge_pages);
+    }
+
+    size_t getUsedBytes() {
+      return bytesUsed;
+    }
+
+    size_t getWastedBytes() {
+      return bytesWasted;
+    }
+
+    struct AllStatistics
+    {
+      AllStatistics (FastAllocator* alloc)
+
+      : bytesUsed(alloc->bytesUsed),
+        bytesFree(alloc->bytesFree),
+        bytesWasted(alloc->bytesWasted),
+        stat_all(alloc,ANY_TYPE),
+        stat_malloc(alloc,ALIGNED_MALLOC),
+        stat_4K(alloc,OS_MALLOC,false),
+        stat_2M(alloc,OS_MALLOC,true),
+        stat_shared(alloc,SHARED) {}
+
+      AllStatistics (size_t bytesUsed,
+                     size_t bytesFree,
+                     size_t bytesWasted,
+                     Statistics stat_all,
+                     Statistics stat_malloc,
+                     Statistics stat_4K,
+                     Statistics stat_2M,
+                     Statistics stat_shared)
+
+      : bytesUsed(bytesUsed),
+        bytesFree(bytesFree),
+        bytesWasted(bytesWasted),
+        stat_all(stat_all),
+        stat_malloc(stat_malloc),
+        stat_4K(stat_4K),
+        stat_2M(stat_2M),
+        stat_shared(stat_shared) {}
+
+      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
+      {
+        return AllStatistics(a.bytesUsed+b.bytesUsed,
+                             a.bytesFree+b.bytesFree,
+                             a.bytesWasted+b.bytesWasted,
+                             a.stat_all + b.stat_all,
+                             a.stat_malloc + b.stat_malloc,
+                             a.stat_4K + b.stat_4K,
+                             a.stat_2M + b.stat_2M,
+                             a.stat_shared + b.stat_shared);
+      }
+
+      void print(size_t numPrimitives)
+      {
+        std::stringstream str0;
+        str0.setf(std::ios::fixed, std::ios::floatfield);
+        str0 << "  alloc : " 
+             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+             << "                                                            " 
+             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
+        std::cout << str0.str() << std::endl;
+      
+        std::stringstream str1;
+        str1.setf(std::ios::fixed, std::ios::floatfield);
+        str1 << "  alloc : " 
+             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "            
+             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "            
+             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
+             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
+        std::cout << str1.str() << std::endl;
+     
+        std::cout << "  total : " << stat_all.str(numPrimitives) << std::endl;
+        std::cout << "  4K    : " << stat_4K.str(numPrimitives) << std::endl;
+        std::cout << "  2M    : " << stat_2M.str(numPrimitives) << std::endl;
+        std::cout << "  malloc: " << stat_malloc.str(numPrimitives) << std::endl;
+        std::cout << "  shared: " << stat_shared.str(numPrimitives) << std::endl;
+      }
+
+    private:
+      size_t bytesUsed;
+      size_t bytesFree;
+      size_t bytesWasted;
+      Statistics stat_all;
+      Statistics stat_malloc;
+      Statistics stat_4K;
+      Statistics stat_2M;
+      Statistics stat_shared;
+    };
+
+    void print_blocks()
+    {
+      std::cout << "  estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;
+
+      std::cout << "  used blocks = ";
+      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
+      std::cout << "[END]" << std::endl;
+
+      std::cout << "  free blocks = ";
+      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
+      std::cout << "[END]" << std::endl;
+    }
+
+  private:
+
+    struct Block
+    {
+      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
+      {
+        /* We avoid using os_malloc for small blocks as this could
+         * cause a risk of fragmenting the virtual address space and
+         * reach the limit of vm.max_map_count = 65k under Linux. */
+        if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
+          atype = ALIGNED_MALLOC;
+
+        /* we need to additionally allocate some header */
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        bytesAllocate = sizeof_Header+bytesAllocate;
+        bytesReserve  = sizeof_Header+bytesReserve;
+
+        /* consume full 4k pages with using os_malloc */
+        if (atype == OS_MALLOC) {
+          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+        }
+
+        /* either use alignedMalloc or os_malloc */
+        void *ptr = nullptr;
+        if (atype == ALIGNED_MALLOC)
+        {
+          /* special handling for default block size */
+          if (bytesAllocate == (2*PAGE_SIZE_2M))
+          {
+            const size_t alignment = maxAlignment;
+            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+            ptr = alignedMalloc(bytesAllocate,alignment);
+
+            /* give hint to transparently convert these pages to 2MB pages */
+            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
+            os_advise((void*)(ptr_aligned_begin +              0),PAGE_SIZE_2M); // may fail if no memory mapped before block
+            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
+            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block
+
+            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+          }
+          else
+          {
+            const size_t alignment = maxAlignment;
+            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+            ptr = alignedMalloc(bytesAllocate,alignment);
+            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+          }
+        }
+        else if (atype == OS_MALLOC)
+        {
+          if (device) device->memoryMonitor(bytesAllocate,false);
+          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
+          return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
+        }
+        else
+          assert(false);
+
+        return NULL;
+      }
+
+      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
+      : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
+      {
+        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
+      }
+
+      static Block* remove_shared_blocks(Block* head)
+      {
+        Block** prev_next = &head;
+        for (Block* block = head; block; block = block->next) {
+          if (block->atype == SHARED) *prev_next = block->next;
+          else                         prev_next = &block->next;
+        }
+        return head;
+      }
+
+      void clear_list(MemoryMonitorInterface* device)
+      {
+        Block* block = this;
+        while (block) {
+          Block* next = block->next;
+          block->clear_block(device);
+          block = next;
+        }
+      }
+
+      void clear_block (MemoryMonitorInterface* device)
+      {
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();
+
+        if (atype == ALIGNED_MALLOC) {
+          alignedFree(this);
+          if (device) device->memoryMonitor(-sizeof_Alloced,true);
+        }
+
+        else if (atype == OS_MALLOC) {
+         size_t sizeof_This = sizeof_Header+reserveEnd;
+         os_free(this,sizeof_This,huge_pages);
+         if (device) device->memoryMonitor(-sizeof_Alloced,true);
+        }
+
+        else /* if (atype == SHARED) */ {
+        }
+      }
+
+      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
+      {
+        size_t bytes = bytes_in;
+        assert(align <= maxAlignment);
+        bytes = (bytes+(align-1)) & ~(align-1);
+	if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
+	const size_t i = cur.fetch_add(bytes);
+        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
+        if (unlikely(i > reserveEnd)) return nullptr;
+        bytes_in = bytes = min(bytes,reserveEnd-i);
+        
+	if (i+bytes > allocEnd) {
+          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
+        }
+	return &data[i];
+      }
+
+      void* ptr() {
+        return &data[cur];
+      }
+
+      void reset_block ()
+      {
+        allocEnd = max(allocEnd,(size_t)cur);
+        cur = 0;
+      }
+
+      size_t getBlockUsedBytes() const {
+        return min(size_t(cur),reserveEnd);
+      }
+
+      size_t getBlockFreeBytes() const {
+	return getBlockAllocatedBytes() - getBlockUsedBytes();
+      }
+
+      size_t getBlockAllocatedBytes() const {
+        return min(max(allocEnd,size_t(cur)),reserveEnd);
+      }
+
+      size_t getBlockWastedBytes() const {
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        return sizeof_Header + wasted;
+      }
+
+      size_t getBlockReservedBytes() const {
+        return reserveEnd;
+      }
+  
+      bool hasType(AllocationType atype_i, bool huge_pages_i) const
+      {
+        if      (atype_i == ANY_TYPE ) return true;
+        else if (atype   == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
+        else                           return atype_i == atype;
+      }
+
+      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockUsedBytes();
+        }
+        return bytes;
+      }
+
+      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockFreeBytes();
+        }
+        return bytes;
+      }
+
+      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockWastedBytes();
+        }
+        return bytes;
+      }
+
+      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockAllocatedBytes();
+        }
+        return bytes;
+      }
+
+      void print_list ()
+      {
+        for (const Block* block = this; block; block = block->next)
+          block->print_block();
+      }
+
+      void print_block() const
+      {
+        if (atype == ALIGNED_MALLOC) std::cout << "A";
+        else if (atype == OS_MALLOC) std::cout << "O";
+        else if (atype == SHARED) std::cout << "S";
+        if (huge_pages) std::cout << "H";
+        size_t bytesUsed = getBlockUsedBytes();
+        size_t bytesFree = getBlockFreeBytes();
+        size_t bytesWasted = getBlockWastedBytes();
+        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
+      }
+
+    public:
+      std::atomic<size_t> cur;        //!< current location of the allocator
+      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
+      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
+      Block* next;               //!< pointer to next block in list
+      size_t wasted;             //!< amount of memory wasted through block alignment
+      AllocationType atype;      //!< allocation mode of the block
+      bool huge_pages;           //!< whether the block uses huge pages
+      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
+      char data[1];              //!< here starts memory to use for allocations
+    };
+
+  private:
+    Device* device;
+    SpinLock mutex;
+    size_t slotMask;
+    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+    std::atomic<Block*> usedBlocks;
+    std::atomic<Block*> freeBlocks;
+
+    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+
+    bool use_single_mode;
+    size_t defaultBlockSize;
+    size_t estimatedSize;
+    size_t growSize;
+    size_t maxGrowSize;
+    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
+    std::atomic<size_t> bytesUsed;
+    std::atomic<size_t> bytesFree;
+    std::atomic<size_t> bytesWasted;
+    static __thread ThreadLocal2* thread_local_allocator2;
+    static SpinLock s_thread_local_allocators_lock;
+    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
+    SpinLock thread_local_allocators_lock;
+    std::vector<ThreadLocal2*> thread_local_allocators;
+    AllocationType atype;
+    mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
+  };
+}
diff --git a/thirdparty/embree/kernels/common/buffer.h b/thirdparty/embree/kernels/common/buffer.h
new file mode 100644
index 0000000000..793012c04d
--- /dev/null
+++ b/thirdparty/embree/kernels/common/buffer.h
@@ -0,0 +1,263 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+
+namespace embree
+{
+  /*! Implements an API data buffer object. This class may or may not own the data. */
+  class Buffer : public RefCount
+  {
+  public:
+    /*! Buffer construction */
+    Buffer() 
+      : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {}
+
+    /*! Buffer construction */
+    Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr)
+      : device(device), numBytes(numBytes_in)
+    {
+      device->refInc();
+      
+      if (ptr_in)
+      {
+        shared = true;
+        ptr = (char*)ptr_in;
+      }
+      else
+      {
+        shared = false;
+        alloc();
+      }
+    }
+    
+    /*! Buffer destruction */
+    ~Buffer() {
+      free();
+      device->refDec();
+    }
+    
+    /*! this class is not copyable */
+  private:
+    Buffer(const Buffer& other) DELETED; // do not implement
+    Buffer& operator =(const Buffer& other) DELETED; // do not implement
+    
+  public:
+    /* inits and allocates the buffer */
+    void create(Device* device_in, size_t numBytes_in)
+    {
+      init(device_in, numBytes_in);
+      alloc();
+    }
+    
+    /* inits the buffer */
+    void init(Device* device_in, size_t numBytes_in)
+    {
+      free();
+      device = device_in;
+      ptr = nullptr;
+      numBytes = numBytes_in;
+      shared = false;
+    }
+
+    /*! sets shared buffer */
+    void set(Device* device_in, void* ptr_in, size_t numBytes_in)
+    {
+      free();
+      device = device_in;
+      ptr = (char*)ptr_in;
+      if (numBytes_in != (size_t)-1)
+        numBytes = numBytes_in;
+      shared = true;
+    }
+    
+    /*! allocated buffer */
+    void alloc()
+    {
+      if (device)
+        device->memoryMonitor(this->bytes(), false);
+      size_t b = (this->bytes()+15) & ssize_t(-16);
+      ptr = (char*)alignedMalloc(b,16);
+    }
+    
+    /*! frees the buffer */
+    void free()
+    {
+      if (shared) return;
+      alignedFree(ptr); 
+      if (device)
+        device->memoryMonitor(-ssize_t(this->bytes()), true);
+      ptr = nullptr;
+    }
+    
+    /*! gets buffer pointer */
+    void* data()
+    {
+      /* report error if buffer is not existing */
+      if (!device)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified");
+      
+      /* return buffer */
+      return ptr;
+    }
+
+    /*! returns pointer to first element */
+    __forceinline char* getPtr() const {
+      return ptr;
+    }
+
+    /*! returns the number of bytes of the buffer */
+    __forceinline size_t bytes() const { 
+      return numBytes;
+    }
+    
+    /*! returns true of the buffer is not empty */
+    __forceinline operator bool() const { 
+      return ptr; 
+    }
+
+  public:
+    Device* device;  //!< device to report memory usage to
+    char* ptr;       //!< pointer to buffer data
+    size_t numBytes; //!< number of bytes in the buffer
+    bool shared;     //!< set if memory is shared with application
+  };
+
+  /*! An untyped contiguous range of a buffer. This class does not own the buffer content. */
+  class RawBufferView
+  {
+  public:
+    /*! Buffer construction */
+    RawBufferView()
+      : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {}
+
+  public:
+    /*! sets the buffer view */
+    void set(const Ref<Buffer>& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in)
+    {
+      if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes))
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds");
+
+      ptr_ofs = buffer_in->ptr + offset_in;
+      stride = stride_in;
+      num = num_in;
+      format = format_in;
+      modCounter++;
+      modified = true;
+      buffer = buffer_in;
+    }
+
+    /*! returns pointer to the first element */
+    __forceinline char* getPtr() const {
+      return ptr_ofs;
+    }
+
+    /*! returns pointer to the i'th element */
+    __forceinline char* getPtr(size_t i) const
+    {
+      assert(i<num);
+      return ptr_ofs + i*stride;
+    }
+
+    /*! returns the number of elements of the buffer */
+    __forceinline size_t size() const { 
+      return num; 
+    }
+
+    /*! returns the number of bytes of the buffer */
+    __forceinline size_t bytes() const { 
+      return num*stride; 
+    }
+    
+    /*! returns the buffer stride */
+    __forceinline unsigned getStride() const
+    {
+      assert(stride <= unsigned(inf));
+      return unsigned(stride);
+    }
+
+    /*! return the buffer format */
+    __forceinline RTCFormat getFormat() const {
+      return format;
+    }
+
+    /*! mark buffer as modified or unmodified */
+    __forceinline void setModified() {
+      modCounter++;
+      modified = true;
+    }
+
+    /*! mark buffer as modified or unmodified */
+    __forceinline bool isModified(unsigned int otherModCounter) const {
+      return modCounter > otherModCounter;
+    }
+
+     /*! mark buffer as modified or unmodified */
+    __forceinline bool isLocalModified() const {
+      return modified;
+    }
+
+    /*! clear local modified flag */
+    __forceinline void clearLocalModified() {
+      modified = false;
+    }
+
+    /*! returns true of the buffer is not empty */
+    __forceinline operator bool() const { 
+      return ptr_ofs; 
+    }
+
+    /*! checks padding to 16 byte check, fails hard */
+    __forceinline void checkPadding16() const
+    {
+      if (ptr_ofs && num)
+        volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable?
+    }
+
+  public:
+    char* ptr_ofs;      //!< base pointer plus offset
+    size_t stride;      //!< stride of the buffer in bytes
+    size_t num;         //!< number of elements in the buffer
+    RTCFormat format;   //!< format of the buffer
+    unsigned int modCounter; //!< version ID of this buffer
+    bool modified;      //!< local modified data
+    int userData;       //!< special data
+    Ref<Buffer> buffer; //!< reference to the parent buffer
+  };
+
+  /*! A typed contiguous range of a buffer. This class does not own the buffer content. */
+  template<typename T>
+  class BufferView : public RawBufferView
+  {
+  public:
+    typedef T value_type;
+
+    /*! access to the ith element of the buffer */
+    __forceinline       T& operator [](size_t i)       { assert(i<num); return *(T*)(ptr_ofs + i*stride); }
+    __forceinline const T& operator [](size_t i) const { assert(i<num); return *(T*)(ptr_ofs + i*stride); }
+  };
+
+  template<>
+  class BufferView<Vec3fa> : public RawBufferView
+  {
+  public:
+    typedef Vec3fa value_type;
+
+    /*! access to the ith element of the buffer */
+    __forceinline const Vec3fa operator [](size_t i) const
+    {
+      assert(i<num);
+      return Vec3fa(vfloat4::loadu((float*)(ptr_ofs + i*stride)));
+    }
+    
+    /*! writes the i'th element */
+    __forceinline void store(size_t i, const Vec3fa& v)
+    {
+      assert(i<num);
+      vfloat4::storeu((float*)(ptr_ofs + i*stride), (vfloat4)v);
+    }
+  };
+}
diff --git a/thirdparty/embree/kernels/common/builder.h b/thirdparty/embree/kernels/common/builder.h
new file mode 100644
index 0000000000..07fe7b069b
--- /dev/null
+++ b/thirdparty/embree/kernels/common/builder.h
@@ -0,0 +1,60 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "accel.h"
+
+namespace embree
+{
+#define MODE_HIGH_QUALITY (1<<8)
+
+  /*! virtual interface for all hierarchy builders */
+  class Builder : public RefCount {
+  public:
+
+    static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+    /*! initiates the hierarchy builder */
+    virtual void build() = 0;
+
+    /*! notifies the builder about the deletion of some geometry */
+    virtual void deleteGeometry(size_t geomID) {};
+
+    /*! clears internal builder state */
+    virtual void clear() = 0;
+  };
+
+  /*! virtual interface for progress monitor class */
+  struct BuildProgressMonitor {
+    virtual void operator() (size_t dn) const = 0;
+  };
+
+  /*! build the progress monitor interface from a closure */
+  template<typename Closure>
+    struct ProgressMonitorClosure : BuildProgressMonitor
+  {
+  public:
+    ProgressMonitorClosure (const Closure& closure) : closure(closure) {}
+    void operator() (size_t dn) const { closure(dn); }
+  private:
+    const Closure closure;
+  };
+  template<typename Closure> __forceinline const ProgressMonitorClosure<Closure> BuildProgressMonitorFromClosure(const Closure& closure) {
+    return ProgressMonitorClosure<Closure>(closure);
+  }
+
+  struct LineSegments;
+  struct TriangleMesh;
+  struct QuadMesh;
+  struct UserGeometry;
+
+  class Scene;
+
+  typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder);
+  typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+  typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+  typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+
+}
diff --git a/thirdparty/embree/kernels/common/context.h b/thirdparty/embree/kernels/common/context.h
new file mode 100644
index 0000000000..ccd88bdeac
--- /dev/null
+++ b/thirdparty/embree/kernels/common/context.h
@@ -0,0 +1,131 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "rtcore.h"
+#include "point_query.h"
+
+namespace embree
+{
+  class Scene;
+
+  struct IntersectContext
+  {
+  public:
+    __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context)
+      : scene(scene), user(user_context) {}
+
+    __forceinline bool hasContextFilter() const {
+      return user->filter != nullptr;
+    }
+
+    __forceinline bool isCoherent() const {
+      return embree::isCoherent(user->flags);
+    }
+
+    __forceinline bool isIncoherent() const {
+      return embree::isIncoherent(user->flags);
+    }
+    
+  public:
+    Scene* scene;
+    RTCIntersectContext* user;
+  };
+
+  template<int M, typename Geometry>
+      __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v)
+    {
+#if RTC_MIN_WIDTH
+      const vfloat<M> d = length(Vec3vf<M>(v) - ray_org);
+      const vfloat<M> r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w);
+      return Vec4vf<M>(v.x,v.y,v.z,r);
+#else
+      return v;
+#endif
+    }
+
+    template<typename Geometry>
+    __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v)
+  {
+#if RTC_MIN_WIDTH
+    const float d = length(Vec3fa(v) - ray_org);
+    const float r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w);
+    return Vec3ff(v.x,v.y,v.z,r);
+#else
+    return v;
+#endif
+  }
+  
+  enum PointQueryType
+  {
+    POINT_QUERY_TYPE_UNDEFINED = 0,
+    POINT_QUERY_TYPE_SPHERE = 1,
+    POINT_QUERY_TYPE_AABB = 2,
+  };
+
+  typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
+  
+  struct PointQueryContext
+  {
+  public:
+    __forceinline PointQueryContext(Scene* scene, 
+                                    PointQuery* query_ws, 
+                                    PointQueryType query_type,
+                                    PointQueryFunction func, 
+                                    RTCPointQueryContext* userContext,
+                                    float similarityScale,
+                                    void* userPtr)
+      : scene(scene)
+      , query_ws(query_ws)
+      , query_type(query_type)
+      , func(func)
+      , userContext(userContext)
+      , similarityScale(similarityScale)
+      , userPtr(userPtr) 
+      , primID(RTC_INVALID_GEOMETRY_ID)
+      , geomID(RTC_INVALID_GEOMETRY_ID)
+      , query_radius(query_ws->radius)
+    { 
+      if (query_type == POINT_QUERY_TYPE_AABB) {
+        assert(similarityScale == 0.f);
+        updateAABB();
+      }
+      if (userContext->instStackSize == 0) {
+        assert(similarityScale == 1.f);
+      }
+    }
+
+  public:
+    __forceinline void updateAABB() 
+    {
+      if (likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) {
+        query_radius = Vec3fa(query_ws->radius);
+        return;
+      }
+
+      const AffineSpace3fa m = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]);
+      BBox3fa bbox(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius));
+      bbox = xfmBounds(m, bbox);
+      query_radius = 0.5f * (bbox.upper - bbox.lower);
+    }
+
+public:
+    Scene* scene;
+
+    PointQuery* query_ws; // the original world space point query 
+    PointQueryType query_type;
+    PointQueryFunction func;
+    RTCPointQueryContext* userContext;
+    const float similarityScale;
+
+    void* userPtr;
+
+    unsigned int primID;
+    unsigned int geomID;
+
+    Vec3fa query_radius;  // used if the query is converted to an AABB internally
+  };
+}
+
diff --git a/thirdparty/embree/kernels/common/default.h b/thirdparty/embree/kernels/common/default.h
new file mode 100644
index 0000000000..f15d61b768
--- /dev/null
+++ b/thirdparty/embree/kernels/common/default.h
@@ -0,0 +1,268 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+#include "../../common/sys/thread.h"
+#include "../../common/sys/alloc.h"
+#include "../../common/sys/ref.h"
+#include "../../common/sys/intrinsics.h"
+#include "../../common/sys/atomic.h"
+#include "../../common/sys/mutex.h"
+#include "../../common/sys/vector.h"
+#include "../../common/sys/array.h"
+#include "../../common/sys/string.h"
+#include "../../common/sys/regression.h"
+#include "../../common/sys/vector.h"
+
+#include "../../common/math/math.h"
+#include "../../common/math/transcendental.h"
+#include "../../common/simd/simd.h"
+#include "../../common/math/vec2.h"
+#include "../../common/math/vec3.h"
+#include "../../common/math/vec4.h"
+#include "../../common/math/vec2fa.h"
+#include "../../common/math/vec3fa.h"
+#include "../../common/math/interval.h"
+#include "../../common/math/bbox.h"
+#include "../../common/math/obbox.h"
+#include "../../common/math/lbbox.h"
+#include "../../common/math/linearspace2.h"
+#include "../../common/math/linearspace3.h"
+#include "../../common/math/affinespace.h"
+#include "../../common/math/range.h"
+#include "../../common/lexers/tokenstream.h"
+
+#include "../../common/tasking/taskscheduler.h"
+
+#define COMMA ,
+
+#include "../config.h"
+#include "isa.h"
+#include "stat.h"
+#include "profile.h"
+#include "rtcore.h"
+#include "vector.h"
+#include "state.h"
+#include "instance_stack.h"
+
+#include <vector>
+#include <map>
+#include <algorithm>
+#include <functional>
+#include <utility>
+#include <sstream>
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec2 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec2vf  = Vec2<vfloat<N>>;
+  template<int N> using Vec2vd  = Vec2<vdouble<N>>;
+  template<int N> using Vec2vr  = Vec2<vreal<N>>;
+  template<int N> using Vec2vi  = Vec2<vint<N>>;
+  template<int N> using Vec2vl  = Vec2<vllong<N>>;
+  template<int N> using Vec2vb  = Vec2<vbool<N>>;
+  template<int N> using Vec2vbf = Vec2<vboolf<N>>;
+  template<int N> using Vec2vbd = Vec2<vboold<N>>;
+
+  typedef Vec2<vfloat4>  Vec2vf4;
+  typedef Vec2<vdouble4> Vec2vd4;
+  typedef Vec2<vreal4>   Vec2vr4;
+  typedef Vec2<vint4>    Vec2vi4;
+  typedef Vec2<vllong4>  Vec2vl4;
+  typedef Vec2<vbool4>   Vec2vb4;
+  typedef Vec2<vboolf4>  Vec2vbf4;
+  typedef Vec2<vboold4>  Vec2vbd4;
+
+  typedef Vec2<vfloat8>  Vec2vf8;
+  typedef Vec2<vdouble8> Vec2vd8;
+  typedef Vec2<vreal8>   Vec2vr8;
+  typedef Vec2<vint8>    Vec2vi8;
+  typedef Vec2<vllong8>  Vec2vl8;
+  typedef Vec2<vbool8>   Vec2vb8;
+  typedef Vec2<vboolf8>  Vec2vbf8;
+  typedef Vec2<vboold8>  Vec2vbd8;
+
+  typedef Vec2<vfloat16>  Vec2vf16;
+  typedef Vec2<vdouble16> Vec2vd16;
+  typedef Vec2<vreal16>   Vec2vr16;
+  typedef Vec2<vint16>    Vec2vi16;
+  typedef Vec2<vllong16>  Vec2vl16;
+  typedef Vec2<vbool16>   Vec2vb16;
+  typedef Vec2<vboolf16>  Vec2vbf16;
+  typedef Vec2<vboold16>  Vec2vbd16;
+
+  typedef Vec2<vfloatx>  Vec2vfx;
+  typedef Vec2<vdoublex> Vec2vdx;
+  typedef Vec2<vrealx>   Vec2vrx;
+  typedef Vec2<vintx>    Vec2vix;
+  typedef Vec2<vllongx>  Vec2vlx;
+  typedef Vec2<vboolx>   Vec2vbx;
+  typedef Vec2<vboolfx>  Vec2vbfx;
+  typedef Vec2<vbooldx>  Vec2vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec3 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec3vf  = Vec3<vfloat<N>>;
+  template<int N> using Vec3vd  = Vec3<vdouble<N>>;
+  template<int N> using Vec3vr  = Vec3<vreal<N>>;
+  template<int N> using Vec3vi  = Vec3<vint<N>>;
+  template<int N> using Vec3vl  = Vec3<vllong<N>>;
+  template<int N> using Vec3vb  = Vec3<vbool<N>>;
+  template<int N> using Vec3vbf = Vec3<vboolf<N>>;
+  template<int N> using Vec3vbd = Vec3<vboold<N>>;
+
+  typedef Vec3<vfloat4>  Vec3vf4;
+  typedef Vec3<vdouble4> Vec3vd4;
+  typedef Vec3<vreal4>   Vec3vr4;
+  typedef Vec3<vint4>    Vec3vi4;
+  typedef Vec3<vllong4>  Vec3vl4;
+  typedef Vec3<vbool4>   Vec3vb4;
+  typedef Vec3<vboolf4>  Vec3vbf4;
+  typedef Vec3<vboold4>  Vec3vbd4;
+
+  typedef Vec3<vfloat8>  Vec3vf8;
+  typedef Vec3<vdouble8> Vec3vd8;
+  typedef Vec3<vreal8>   Vec3vr8;
+  typedef Vec3<vint8>    Vec3vi8;
+  typedef Vec3<vllong8>  Vec3vl8;
+  typedef Vec3<vbool8>   Vec3vb8;
+  typedef Vec3<vboolf8>  Vec3vbf8;
+  typedef Vec3<vboold8>  Vec3vbd8;
+
+  typedef Vec3<vfloat16>  Vec3vf16;
+  typedef Vec3<vdouble16> Vec3vd16;
+  typedef Vec3<vreal16>   Vec3vr16;
+  typedef Vec3<vint16>    Vec3vi16;
+  typedef Vec3<vllong16>  Vec3vl16;
+  typedef Vec3<vbool16>   Vec3vb16;
+  typedef Vec3<vboolf16>  Vec3vbf16;
+  typedef Vec3<vboold16>  Vec3vbd16;
+
+  typedef Vec3<vfloatx>  Vec3vfx;
+  typedef Vec3<vdoublex> Vec3vdx;
+  typedef Vec3<vrealx>   Vec3vrx;
+  typedef Vec3<vintx>    Vec3vix;
+  typedef Vec3<vllongx>  Vec3vlx;
+  typedef Vec3<vboolx>   Vec3vbx;
+  typedef Vec3<vboolfx>  Vec3vbfx;
+  typedef Vec3<vbooldx>  Vec3vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec4 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec4vf  = Vec4<vfloat<N>>;
+  template<int N> using Vec4vd  = Vec4<vdouble<N>>;
+  template<int N> using Vec4vr  = Vec4<vreal<N>>;
+  template<int N> using Vec4vi  = Vec4<vint<N>>;
+  template<int N> using Vec4vl  = Vec4<vllong<N>>;
+  template<int N> using Vec4vb  = Vec4<vbool<N>>;
+  template<int N> using Vec4vbf = Vec4<vboolf<N>>;
+  template<int N> using Vec4vbd = Vec4<vboold<N>>;
+
+  typedef Vec4<vfloat4>  Vec4vf4;
+  typedef Vec4<vdouble4> Vec4vd4;
+  typedef Vec4<vreal4>   Vec4vr4;
+  typedef Vec4<vint4>    Vec4vi4;
+  typedef Vec4<vllong4>  Vec4vl4;
+  typedef Vec4<vbool4>   Vec4vb4;
+  typedef Vec4<vboolf4>  Vec4vbf4;
+  typedef Vec4<vboold4>  Vec4vbd4;
+
+  typedef Vec4<vfloat8>  Vec4vf8;
+  typedef Vec4<vdouble8> Vec4vd8;
+  typedef Vec4<vreal8>   Vec4vr8;
+  typedef Vec4<vint8>    Vec4vi8;
+  typedef Vec4<vllong8>  Vec4vl8;
+  typedef Vec4<vbool8>   Vec4vb8;
+  typedef Vec4<vboolf8>  Vec4vbf8;
+  typedef Vec4<vboold8>  Vec4vbd8;
+
+  typedef Vec4<vfloat16>  Vec4vf16;
+  typedef Vec4<vdouble16> Vec4vd16;
+  typedef Vec4<vreal16>   Vec4vr16;
+  typedef Vec4<vint16>    Vec4vi16;
+  typedef Vec4<vllong16>  Vec4vl16;
+  typedef Vec4<vbool16>   Vec4vb16;
+  typedef Vec4<vboolf16>  Vec4vbf16;
+  typedef Vec4<vboold16>  Vec4vbd16;
+
+  typedef Vec4<vfloatx>  Vec4vfx;
+  typedef Vec4<vdoublex> Vec4vdx;
+  typedef Vec4<vrealx>   Vec4vrx;
+  typedef Vec4<vintx>    Vec4vix;
+  typedef Vec4<vllongx>  Vec4vlx;
+  typedef Vec4<vboolx>   Vec4vbx;
+  typedef Vec4<vboolfx>  Vec4vbfx;
+  typedef Vec4<vbooldx>  Vec4vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Other shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using BBox3vf = BBox<Vec3vf<N>>;
+  typedef BBox<Vec3vf4>  BBox3vf4;
+  typedef BBox<Vec3vf8>  BBox3vf8;
+  typedef BBox<Vec3vf16> BBox3vf16;
+
+  /* calculate time segment itime and fractional time ftime */
+  __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime)
+  {
+    const float timeScaled = time * numTimeSegments;
+    const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f);
+    ftime = timeScaled - itimef;
+    return int(itimef);
+  }
+
+  __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime)
+  {
+    const float timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments;
+    const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f);
+    ftime = timeScaled - itimef;
+    return int(itimef);
+  }
+
+  template<int N>
+  __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+  {
+    const vfloat<N> timeScaled = time * numTimeSegments;
+    const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f);
+    ftime = timeScaled - itimef;
+    return vint<N>(itimef);
+  }
+
+  template<int N>
+    __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& start_time, const vfloat<N>& end_time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+  {
+    const vfloat<N> timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments;
+    const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f);
+    ftime = timeScaled - itimef;
+    return vint<N>(itimef);
+  }
+
+  /* calculate overlapping time segment range */
+  __forceinline range<int> getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments)
+  {
+    const float round_up   = 1.0f+2.0f*float(ulp); // corrects inaccuracies to precisely match time step
+    const float round_down = 1.0f-2.0f*float(ulp);
+    const int itime_lower = (int)max(floor(round_up  *time_range.lower*numTimeSegments), 0.0f);
+    const int itime_upper = (int)min(ceil (round_down*time_range.upper*numTimeSegments), numTimeSegments);
+    return make_range(itime_lower, itime_upper);
+  }
+
+  /* calculate overlapping time segment range */
+  __forceinline range<int> getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments)
+  {
+    const float lower = (range.lower-time_range.lower)/time_range.size();
+    const float upper = (range.upper-time_range.lower)/time_range.size();
+    return getTimeSegmentRange(BBox1f(lower,upper),numTimeSegments);
+  }
+}
diff --git a/thirdparty/embree/kernels/common/device.cpp b/thirdparty/embree/kernels/common/device.cpp
new file mode 100644
index 0000000000..068e0c2983
--- /dev/null
+++ b/thirdparty/embree/kernels/common/device.cpp
@@ -0,0 +1,556 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "device.h"
+#include "../hash.h"
+#include "scene_triangle_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_subdiv_mesh.h"
+
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+#include "../geometry/cylinder.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+
+#include "../../common/tasking/taskscheduler.h"
+#include "../../common/sys/alloc.h"
+
+namespace embree
+{
+  /*! some global variables that can be set via rtcSetParameter1i for debugging purposes */
+  ssize_t Device::debug_int0 = 0;
+  ssize_t Device::debug_int1 = 0;
+  ssize_t Device::debug_int2 = 0;
+  ssize_t Device::debug_int3 = 0;
+
+  DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs);
+
+  static MutexSys g_mutex;
+  static std::map<Device*,size_t> g_cache_size_map;
+  static std::map<Device*,size_t> g_num_threads_map;
+
+  Device::Device (const char* cfg)
+  {
+    /* check that CPU supports lowest ISA */
+    if (!hasISA(ISA)) {
+      throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR);
+    }
+
+    /* set default frequency level for detected CPU */
+    switch (getCPUModel()) {
+    case CPU::UNKNOWN:         frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_KABY_LAKE:  frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::XEON_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::XEON_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_HASWELL:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_HASWELL:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::SANDY_BRIDGE:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::NEHALEM:         frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE2:           frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE1:           frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::XEON_PHI_KNIGHTS_MILL   : frequency_level = FREQUENCY_SIMD512; break;
+    case CPU::XEON_PHI_KNIGHTS_LANDING: frequency_level = FREQUENCY_SIMD512; break;
+    case CPU::ARM:             frequency_level = FREQUENCY_SIMD128; break;
+    }
+
+    /* initialize global state */
+#if defined(EMBREE_CONFIG)
+    State::parseString(EMBREE_CONFIG);
+#endif
+    State::parseString(cfg);
+    State::verify();
+
+    /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */    
+    if (!checkISASupport()) {
+      throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA");
+    }    
+    
+    /*! do some internal tests */
+    assert(isa::Cylinder::verify());
+
+    /*! enable huge page support if desired */
+#if defined(__WIN32__)
+    if (State::enable_selockmemoryprivilege)
+      State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3));
+#endif
+    State::hugepages_success &= os_init(State::hugepages,State::verbosity(3));
+    
+    /*! set tessellation cache size */
+    setCacheSize( State::tessellation_cache_size );
+
+    /*! enable some floating point exceptions to catch bugs */
+    if (State::float_exceptions)
+    {
+      int exceptions = _MM_MASK_MASK;
+      //exceptions &= ~_MM_MASK_INVALID;
+      exceptions &= ~_MM_MASK_DENORM;
+      exceptions &= ~_MM_MASK_DIV_ZERO;
+      //exceptions &= ~_MM_MASK_OVERFLOW;
+      //exceptions &= ~_MM_MASK_UNDERFLOW;
+      //exceptions &= ~_MM_MASK_INEXACT;
+      _MM_SET_EXCEPTION_MASK(exceptions);
+    }
+    
+    /* print info header */
+    if (State::verbosity(1))
+      print();
+    if (State::verbosity(2)) 
+      State::print();
+
+    /* register all algorithms */
+    bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features));
+
+#if defined(EMBREE_TARGET_SIMD8)
+    bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features));
+#endif
+
+    /* setup tasking system */
+    initTaskingSystem(numThreads);
+
+    /* ray stream SOA to AOS conversion */
+#if defined(EMBREE_RAY_PACKETS)
+    RayStreamFilterFuncsType rayStreamFilterFuncs;
+    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(enabled_cpu_features,rayStreamFilterFuncs);
+    rayStreamFilters = rayStreamFilterFuncs();
+#endif
+  }
+
+  Device::~Device ()
+  {
+    setCacheSize(0);
+    exitTaskingSystem();
+  }
+
+  std::string getEnabledTargets()
+  {
+    std::string v;
+#if defined(EMBREE_TARGET_SSE2)
+    v += "SSE2 ";
+#endif
+#if defined(EMBREE_TARGET_SSE42)
+    v += "SSE4.2 ";
+#endif
+#if defined(EMBREE_TARGET_AVX)
+    v += "AVX ";
+#endif
+#if defined(EMBREE_TARGET_AVX2)
+    v += "AVX2 ";
+#endif
+#if defined(EMBREE_TARGET_AVX512)
+    v += "AVX512 ";
+#endif
+    return v;
+  }
+
+  std::string getEmbreeFeatures()
+  {
+    std::string v;
+#if defined(EMBREE_RAY_MASK)
+    v += "raymasks ";
+#endif
+#if defined (EMBREE_BACKFACE_CULLING)
+    v += "backfaceculling ";
+#endif
+#if defined (EMBREE_BACKFACE_CULLING_CURVES)
+    v += "backfacecullingcurves ";
+#endif
+#if defined(EMBREE_FILTER_FUNCTION)
+    v += "intersection_filter ";
+#endif
+#if defined (EMBREE_COMPACT_POLYS)
+    v += "compact_polys ";
+#endif
+    return v;
+  }
+
+  void Device::print()
+  {
+    const int cpu_features = getCPUFeatures();
+    std::cout << std::endl;
+    std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl;
+    std::cout << "  Compiler  : " << getCompilerName() << std::endl;
+    std::cout << "  Build     : ";
+#if defined(DEBUG)
+    std::cout << "Debug " << std::endl;
+#else
+    std::cout << "Release " << std::endl;
+#endif
+    std::cout << "  Platform  : " << getPlatformName() << std::endl;
+    std::cout << "  CPU       : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl;
+    std::cout << "   Threads  : " << getNumberOfLogicalThreads() << std::endl;
+    std::cout << "   ISA      : " << stringOfCPUFeatures(cpu_features) << std::endl;
+    std::cout << "   Targets  : " << supportedTargetList(cpu_features) << std::endl;
+    const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON;
+    const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON;
+    std::cout << "   MXCSR    : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl;
+    std::cout << "  Config" << std::endl;
+    std::cout << "    Threads : " << (numThreads ? toString(numThreads) : std::string("default")) << std::endl;
+    std::cout << "    ISA     : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl;
+    std::cout << "    Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl;
+    std::cout << "              " << getEnabledTargets() << " (compile time enabled)" << std::endl;
+    std::cout << "    Features: " << getEmbreeFeatures() << std::endl;
+    std::cout << "    Tasking : ";
+#if defined(TASKING_TBB)
+    std::cout << "TBB" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " ";
+  #if TBB_INTERFACE_VERSION >= 12002
+    std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " ";
+  #else
+    std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " ";
+  #endif
+#endif
+#if defined(TASKING_INTERNAL)
+    std::cout << "internal_tasking_system ";
+#endif
+#if defined(TASKING_PPL)
+	std::cout << "PPL ";
+#endif
+    std::cout << std::endl;
+
+    /* check of FTZ and DAZ flags are set in CSR */
+    if (!hasFTZ || !hasDAZ) 
+    {
+#if !defined(_DEBUG)
+      if (State::verbosity(1)) 
+#endif
+      {
+        std::cout << std::endl;
+        std::cout << "================================================================================" << std::endl;
+        std::cout << "  WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled "         << std::endl 
+                  << "           in the MXCSR control and status register. This can have a severe "     << std::endl
+                  << "           performance impact. Please enable these modes for each application "   << std::endl
+                  << "           thread the following way:" << std::endl
+                  << std::endl 
+                  << "           #include \"xmmintrin.h\"" << std::endl 
+                  << "           #include \"pmmintrin.h\"" << std::endl 
+                  << std::endl 
+                  << "           _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl 
+                  << "           _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl;
+        std::cout << "================================================================================" << std::endl;
+        std::cout << std::endl;
+      }
+    }
+    std::cout << std::endl;
+  }
+
+  void Device::setDeviceErrorCode(RTCError error)
+  {
+    RTCError* stored_error = errorHandler.error();
+    if (*stored_error == RTC_ERROR_NONE)
+      *stored_error = error;
+  }
+
+  RTCError Device::getDeviceErrorCode()
+  {
+    RTCError* stored_error = errorHandler.error();
+    RTCError error = *stored_error;
+    *stored_error = RTC_ERROR_NONE;
+    return error;
+  }
+
+  void Device::setThreadErrorCode(RTCError error)
+  {
+    RTCError* stored_error = g_errorHandler.error();
+    if (*stored_error == RTC_ERROR_NONE)
+      *stored_error = error;
+  }
+
+  RTCError Device::getThreadErrorCode()
+  {
+    RTCError* stored_error = g_errorHandler.error();
+    RTCError error = *stored_error;
+    *stored_error = RTC_ERROR_NONE;
+    return error;
+  }
+
+  void Device::process_error(Device* device, RTCError error, const char* str)
+  { 
+    /* store global error code when device construction failed */
+    if (!device)
+      return setThreadErrorCode(error);
+
+    /* print error when in verbose mode */
+    if (device->verbosity(1)) 
+    {
+      switch (error) {
+      case RTC_ERROR_NONE         : std::cerr << "Embree: No error"; break;
+      case RTC_ERROR_UNKNOWN    : std::cerr << "Embree: Unknown error"; break;
+      case RTC_ERROR_INVALID_ARGUMENT : std::cerr << "Embree: Invalid argument"; break;
+      case RTC_ERROR_INVALID_OPERATION: std::cerr << "Embree: Invalid operation"; break;
+      case RTC_ERROR_OUT_OF_MEMORY    : std::cerr << "Embree: Out of memory"; break;
+      case RTC_ERROR_UNSUPPORTED_CPU  : std::cerr << "Embree: Unsupported CPU"; break;
+      default                   : std::cerr << "Embree: Invalid error code"; break;                   
+      };
+      if (str) std::cerr << ", (" << str << ")";
+      std::cerr << std::endl;
+    }
+
+    /* call user specified error callback */
+    if (device->error_function) 
+      device->error_function(device->error_function_userptr,error,str); 
+
+    /* record error code */
+    device->setDeviceErrorCode(error);
+  }
+
+  void Device::memoryMonitor(ssize_t bytes, bool post)
+  {
+    if (State::memory_monitor_function && bytes != 0) {
+      if (!State::memory_monitor_function(State::memory_monitor_userptr,bytes,post)) {
+        if (bytes > 0) { // only throw exception when we allocate memory to never throw inside a destructor
+          throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination");
+        }
+      }
+    }
+  }
+
+  size_t getMaxNumThreads()
+  {
+    size_t maxNumThreads = 0;
+    for (std::map<Device*,size_t>::iterator i=g_num_threads_map.begin(); i != g_num_threads_map.end(); i++)
+      maxNumThreads = max(maxNumThreads, (*i).second);
+    if (maxNumThreads == 0)
+      maxNumThreads = std::numeric_limits<size_t>::max();
+    return maxNumThreads;
+  }
+
+  size_t getMaxCacheSize()
+  {
+    size_t maxCacheSize = 0;
+    for (std::map<Device*,size_t>::iterator i=g_cache_size_map.begin(); i!= g_cache_size_map.end(); i++)
+      maxCacheSize = max(maxCacheSize, (*i).second);
+    return maxCacheSize;
+  }
+ 
+  void Device::setCacheSize(size_t bytes) 
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    Lock<MutexSys> lock(g_mutex);
+    if (bytes == 0) g_cache_size_map.erase(this);
+    else            g_cache_size_map[this] = bytes;
+    
+    size_t maxCacheSize = getMaxCacheSize();
+    resizeTessellationCache(maxCacheSize);
+#endif
+  }
+
+  void Device::initTaskingSystem(size_t numThreads) 
+  {
+    Lock<MutexSys> lock(g_mutex);
+    if (numThreads == 0) 
+      g_num_threads_map[this] = std::numeric_limits<size_t>::max();
+    else 
+      g_num_threads_map[this] = numThreads;
+
+    /* create task scheduler */
+    size_t maxNumThreads = getMaxNumThreads();
+    TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads);
+#if USE_TASK_ARENA
+    const size_t nThreads = min(maxNumThreads,TaskScheduler::threadCount());
+    const size_t uThreads = min(max(numUserThreads,(size_t)1),nThreads);
+    arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads));
+#endif
+  }
+
+  void Device::exitTaskingSystem() 
+  {
+    Lock<MutexSys> lock(g_mutex);
+    g_num_threads_map.erase(this);
+
+    /* terminate tasking system */
+    if (g_num_threads_map.size() == 0) {
+      TaskScheduler::destroy();
+    } 
+    /* or configure new number of threads */
+    else {
+      size_t maxNumThreads = getMaxNumThreads();
+      TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads);
+    }
+#if USE_TASK_ARENA
+    arena.reset();
+#endif
+  }
+
+  void Device::setProperty(const RTCDeviceProperty prop, ssize_t val)
+  {
+    /* hidden internal properties */
+    switch ((size_t)prop)
+    {
+    case 1000000: debug_int0 = val; return;
+    case 1000001: debug_int1 = val; return;
+    case 1000002: debug_int2 = val; return;
+    case 1000003: debug_int3 = val; return;
+    }
+
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property");
+  }
+
+  ssize_t Device::getProperty(const RTCDeviceProperty prop)
+  {
+    size_t iprop = (size_t)prop;
+
+    /* get name of internal regression test */
+    if (iprop >= 2000000 && iprop < 3000000)
+    {
+      RegressionTest* test = getRegressionTest(iprop-2000000);
+      if (test) return (ssize_t) test->name.c_str();
+      else      return 0;
+    }
+
+    /* run internal regression test */
+    if (iprop >= 3000000 && iprop < 4000000)
+    {
+      RegressionTest* test = getRegressionTest(iprop-3000000);
+      if (test) return test->run();
+      else      return 0;
+    }
+
+    /* documented properties */
+    switch (prop) 
+    {
+    case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR;
+    case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR;
+    case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH;
+    case RTC_DEVICE_PROPERTY_VERSION      : return RTC_VERSION;
+
+#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED:  return hasISA(SSE2);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED:  return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED:  return hasISA(AVX);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED:  return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 1;
+#else
+    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 0;
+#endif
+    
+#if defined(EMBREE_RAY_MASK)
+    case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING)
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING_CURVES)
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_COMPACT_POLYS)
+    case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+    case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+    case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0;
+#endif
+
+#if defined(TASKING_INTERNAL)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0;
+#endif
+
+#if defined(TASKING_TBB)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1;
+#endif
+
+#if defined(TASKING_PPL)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2;
+#endif
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0;
+#endif
+        
+#if defined(EMBREE_GEOMETRY_QUAD)
+    case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE)
+    case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_POINT)
+    case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(TASKING_PPL)
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0;
+#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0;
+#else
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1;
+#endif
+
+#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+    case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0;
+#endif
+
+    default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break;
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/common/device.h b/thirdparty/embree/kernels/common/device.h
new file mode 100644
index 0000000000..21c42c654d
--- /dev/null
+++ b/thirdparty/embree/kernels/common/device.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "state.h"
+#include "accel.h"
+
+namespace embree
+{
+  class BVH4Factory;
+  class BVH8Factory;
+
+  class Device : public State, public MemoryMonitorInterface
+  {
+    ALIGNED_CLASS_(16);
+
+  public:
+
+    /*! Device construction */
+    Device (const char* cfg);
+
+    /*! Device destruction */
+    virtual ~Device ();
+
+    /*! prints info about the device */
+    void print();
+
+    /*! sets the error code */
+    void setDeviceErrorCode(RTCError error);
+
+    /*! returns and clears the error code */
+    RTCError getDeviceErrorCode();
+
+    /*! sets the error code */
+    static void setThreadErrorCode(RTCError error);
+
+    /*! returns and clears the error code */
+    static RTCError getThreadErrorCode();
+
+    /*! processes error codes, do not call directly */
+    static void process_error(Device* device, RTCError error, const char* str);
+
+    /*! invokes the memory monitor callback */
+    void memoryMonitor(ssize_t bytes, bool post);
+
+    /*! sets the size of the software cache. */
+    void setCacheSize(size_t bytes);
+
+    /*! sets a property */
+    void setProperty(const RTCDeviceProperty prop, ssize_t val);
+
+    /*! gets a property */
+    ssize_t getProperty(const RTCDeviceProperty prop);
+
+  private:
+
+    /*! initializes the tasking system */
+    void initTaskingSystem(size_t numThreads);
+
+    /*! shuts down the tasking system */
+    void exitTaskingSystem();
+
+    /*! some variables that can be set via rtcSetParameter1i for debugging purposes */
+  public:
+    static ssize_t debug_int0;
+    static ssize_t debug_int1;
+    static ssize_t debug_int2;
+    static ssize_t debug_int3;
+
+  public:
+    std::unique_ptr<BVH4Factory> bvh4_factory;
+#if defined(EMBREE_TARGET_SIMD8)
+    std::unique_ptr<BVH8Factory> bvh8_factory;
+#endif
+    
+#if USE_TASK_ARENA
+    std::unique_ptr<tbb::task_arena> arena;
+#endif
+    
+    /* ray streams filter */
+    RayStreamFilterFuncs rayStreamFilters;
+  };
+}
diff --git a/thirdparty/embree/kernels/common/geometry.cpp b/thirdparty/embree/kernels/common/geometry.cpp
new file mode 100644
index 0000000000..d8d3f65a5c
--- /dev/null
+++ b/thirdparty/embree/kernels/common/geometry.cpp
@@ -0,0 +1,259 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "geometry.h"
+#include "scene.h"
+
+namespace embree
+{
+  const char* Geometry::gtype_names[Geometry::GTY_END] =
+  {
+    "flat_linear_curve",
+    "round_linear_curve",
+    "oriented_linear_curve",
+    "",
+    "flat_bezier_curve",
+    "round_bezier_curve",
+    "oriented_bezier_curve",
+    "",
+    "flat_bspline_curve",
+    "round_bspline_curve",
+    "oriented_bspline_curve",
+    "",
+    "flat_hermite_curve",
+    "round_hermite_curve",
+    "oriented_hermite_curve",
+    "",
+    "flat_catmull_rom_curve",
+    "round_catmull_rom_curve",
+    "oriented_catmull_rom_curve",
+    "",    
+    "triangles",
+    "quads",
+    "grid",
+    "subdivs",
+    "",
+    "sphere",
+    "disc",
+    "oriented_disc",
+    "",
+    "usergeom",
+    "instance_cheap",
+    "instance_expensive",
+  };
+     
+  Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) 
+    : device(device), userPtr(nullptr),
+      numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f),
+      mask(-1),
+      gtype(gtype),
+      gsubtype(GTY_SUBTYPE_DEFAULT),
+      quality(RTC_BUILD_QUALITY_MEDIUM),
+      state((unsigned)State::MODIFIED),
+      enabled(true),
+      intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr)
+  {
+    device->refInc();
+  }
+
+  Geometry::~Geometry()
+  {
+    device->refDec();
+  }
+
+  void Geometry::setNumPrimitives(unsigned int numPrimitives_in)
+  {      
+    if (numPrimitives_in == numPrimitives) return;
+    
+    numPrimitives = numPrimitives_in;
+    
+    Geometry::update();
+  }
+
+  void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in)
+  {
+    if (numTimeSteps_in == numTimeSteps) {
+      return;
+    }
+    
+    numTimeSteps = numTimeSteps_in;
+    fnumTimeSegments = float(numTimeSteps_in-1);
+    
+    Geometry::update();
+  }
+
+  void Geometry::setTimeRange (const BBox1f range)
+  {
+    time_range = range;
+    Geometry::update();
+  }
+  
+  void Geometry::update()
+  {
+    ++modCounter_; // FIXME: required?
+    state = (unsigned)State::MODIFIED;
+  }
+  
+  void Geometry::commit() 
+  {
+    ++modCounter_;
+    state = (unsigned)State::COMMITTED;
+  }
+
+  void Geometry::preCommit()
+  {
+    if (State::MODIFIED == (State)state)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed");
+  }
+
+  void Geometry::postCommit()
+  {
+  }
+
+  void Geometry::enable () 
+  {
+    if (isEnabled()) 
+      return;
+
+    enabled = true;
+    ++modCounter_;
+  }
+
+  void Geometry::disable () 
+  {
+    if (isDisabled()) 
+      return;
+    
+    enabled = false;
+    ++modCounter_;
+  }
+
+  void Geometry::setUserData (void* ptr)
+  {
+    userPtr = ptr;
+  }
+  
+  void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter) 
+  {
+    if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH)))
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); 
+
+    intersectionFilterN = filter;
+  }
+
+  void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter) 
+  {
+    if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH)))
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); 
+
+    occlusionFilterN = filter;
+  }
+  
+  void Geometry::setPointQueryFunction (RTCPointQueryFunction func) 
+  {
+    pointQueryFunc = func;
+  }
+
+  void Geometry::interpolateN(const RTCInterpolateNArguments* const args)
+  {
+    const void* valid_i = args->valid;
+    const unsigned* primIDs = args->primIDs;
+    const float* u = args->u;
+    const float* v = args->v;
+    unsigned int N = args->N;
+    RTCBufferType bufferType = args->bufferType;
+    unsigned int bufferSlot = args->bufferSlot;
+    float* P = args->P;
+    float* dPdu = args->dPdu;
+    float* dPdv = args->dPdv;
+    float* ddPdudu = args->ddPdudu;
+    float* ddPdvdv = args->ddPdvdv;
+    float* ddPdudv = args->ddPdudv;
+    unsigned int valueCount = args->valueCount;
+
+    if (valueCount > 256) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex");
+    const int* valid = (const int*) valid_i;
+ 
+    __aligned(64) float P_tmp[256];
+    __aligned(64) float dPdu_tmp[256];
+    __aligned(64) float dPdv_tmp[256];
+    __aligned(64) float ddPdudu_tmp[256];
+    __aligned(64) float ddPdvdv_tmp[256];
+    __aligned(64) float ddPdudv_tmp[256];
+
+    float* Pt = P ? P_tmp : nullptr;
+    float* dPdut = nullptr, *dPdvt = nullptr;
+    if (dPdu) { dPdut = dPdu_tmp; dPdvt = dPdv_tmp; }
+    float* ddPdudut = nullptr, *ddPdvdvt = nullptr, *ddPdudvt = nullptr;
+    if (ddPdudu) { ddPdudut = ddPdudu_tmp; ddPdvdvt = ddPdvdv_tmp; ddPdudvt = ddPdudv_tmp; }
+    
+    for (unsigned int i=0; i<N; i++)
+    {
+      if (valid && !valid[i]) continue;
+
+      RTCInterpolateArguments iargs;
+      iargs.primID = primIDs[i];
+      iargs.u = u[i];
+      iargs.v = v[i];
+      iargs.bufferType = bufferType;
+      iargs.bufferSlot = bufferSlot;
+      iargs.P = Pt;
+      iargs.dPdu = dPdut;
+      iargs.dPdv = dPdvt;
+      iargs.ddPdudu = ddPdudut;
+      iargs.ddPdvdv = ddPdvdvt;
+      iargs.ddPdudv = ddPdudvt;
+      iargs.valueCount = valueCount;
+      interpolate(&iargs);
+      
+      if (likely(P)) {
+        for (unsigned int j=0; j<valueCount; j++) 
+          P[j*N+i] = Pt[j];
+      }
+      if (likely(dPdu)) 
+      {
+        for (unsigned int j=0; j<valueCount; j++) {
+          dPdu[j*N+i] = dPdut[j];
+          dPdv[j*N+i] = dPdvt[j];
+        }
+      }
+      if (likely(ddPdudu)) 
+      {
+        for (unsigned int j=0; j<valueCount; j++) {
+          ddPdudu[j*N+i] = ddPdudut[j];
+          ddPdvdv[j*N+i] = ddPdvdvt[j];
+          ddPdudv[j*N+i] = ddPdudvt[j];
+        }
+      }
+    }
+  }
+    
+  bool Geometry::pointQuery(PointQuery* query, PointQueryContext* context)
+  {
+    assert(context->primID < size());
+   
+    RTCPointQueryFunctionArguments args;
+    args.query           = (RTCPointQuery*)context->query_ws;
+    args.userPtr         = context->userPtr;
+    args.primID          = context->primID;
+    args.geomID          = context->geomID;
+    args.context         = context->userContext;
+    args.similarityScale = context->similarityScale;
+    
+    bool update = false;
+    if(context->func)  update |= context->func(&args);
+    if(pointQueryFunc) update |= pointQueryFunc(&args);
+
+    if (update && context->userContext->instStackSize > 0)
+    {
+      // update point query
+      if (context->query_type == POINT_QUERY_TYPE_AABB) {
+        context->updateAABB();
+      } else {
+        assert(context->similarityScale > 0.f);
+        query->radius = context->query_ws->radius * context->similarityScale;
+      }
+    }
+    return update;
+  }
+}
diff --git a/thirdparty/embree/kernels/common/geometry.h b/thirdparty/embree/kernels/common/geometry.h
new file mode 100644
index 0000000000..2f9f2e7c94
--- /dev/null
+++ b/thirdparty/embree/kernels/common/geometry.h
@@ -0,0 +1,582 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "buffer.h"
+#include "../common/point_query.h"
+#include "../builders/priminfo.h"
+
+namespace embree
+{
+  class Scene;
+  class Geometry;
+
+  struct GeometryCounts 
+  {
+    __forceinline GeometryCounts()
+      : numFilterFunctions(0),
+        numTriangles(0), numMBTriangles(0), 
+        numQuads(0), numMBQuads(0), 
+        numBezierCurves(0), numMBBezierCurves(0), 
+        numLineSegments(0), numMBLineSegments(0), 
+        numSubdivPatches(0), numMBSubdivPatches(0), 
+        numUserGeometries(0), numMBUserGeometries(0), 
+        numInstancesCheap(0), numMBInstancesCheap(0), 
+        numInstancesExpensive(0), numMBInstancesExpensive(0), 
+        numGrids(0), numMBGrids(0), 
+        numPoints(0), numMBPoints(0) {}
+
+    __forceinline size_t size() const {
+      return    numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints
+              + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints;
+    }
+
+    __forceinline unsigned int enabledGeometryTypesMask() const
+    {
+      unsigned int mask = 0;
+      if (numTriangles) mask |= 1 << 0;
+      if (numQuads) mask |= 1 << 1;
+      if (numBezierCurves+numLineSegments) mask |= 1 << 2;
+      if (numSubdivPatches) mask |= 1 << 3;
+      if (numUserGeometries) mask |= 1 << 4;
+      if (numInstancesCheap) mask |= 1 << 5;
+      if (numInstancesExpensive) mask |= 1 << 6;
+      if (numGrids) mask |= 1 << 7;
+      if (numPoints) mask |= 1 << 8;
+
+      unsigned int maskMB = 0;
+      if (numMBTriangles) maskMB |= 1 << 0;
+      if (numMBQuads) maskMB |= 1 << 1;
+      if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2;
+      if (numMBSubdivPatches) maskMB |= 1 << 3;
+      if (numMBUserGeometries) maskMB |= 1 << 4;
+      if (numMBInstancesCheap) maskMB |= 1 << 5;
+      if (numMBInstancesExpensive) maskMB |= 1 << 6;
+      if (numMBGrids) maskMB |= 1 << 7;
+      if (numMBPoints) maskMB |= 1 << 8;
+      
+      return (mask<<8) + maskMB;
+    }
+
+    __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const
+    {
+      GeometryCounts ret;
+      ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions;
+      ret.numTriangles = numTriangles + rhs.numTriangles;
+      ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles;
+      ret.numQuads = numQuads + rhs.numQuads;
+      ret.numMBQuads = numMBQuads + rhs.numMBQuads;
+      ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves;
+      ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves;
+      ret.numLineSegments = numLineSegments + rhs.numLineSegments;
+      ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments;
+      ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches;
+      ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches;
+      ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries;
+      ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries;
+      ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap;
+      ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap;
+      ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive;
+      ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive;
+      ret.numGrids = numGrids + rhs.numGrids;
+      ret.numMBGrids = numMBGrids + rhs.numMBGrids;
+      ret.numPoints = numPoints + rhs.numPoints;
+      ret.numMBPoints = numMBPoints + rhs.numMBPoints;
+
+      return ret;
+    }
+
+    size_t numFilterFunctions;       //!< number of geometries with filter functions enabled
+    size_t numTriangles;             //!< number of enabled triangles
+    size_t numMBTriangles;           //!< number of enabled motion blured triangles
+    size_t numQuads;                 //!< number of enabled quads
+    size_t numMBQuads;               //!< number of enabled motion blurred quads
+    size_t numBezierCurves;          //!< number of enabled curves
+    size_t numMBBezierCurves;        //!< number of enabled motion blurred curves
+    size_t numLineSegments;          //!< number of enabled line segments
+    size_t numMBLineSegments;        //!< number of enabled line motion blurred segments
+    size_t numSubdivPatches;         //!< number of enabled subdivision patches
+    size_t numMBSubdivPatches;       //!< number of enabled motion blured subdivision patches
+    size_t numUserGeometries;        //!< number of enabled user geometries
+    size_t numMBUserGeometries;      //!< number of enabled motion blurred user geometries
+    size_t numInstancesCheap;        //!< number of enabled cheap instances
+    size_t numMBInstancesCheap;      //!< number of enabled motion blurred cheap instances
+    size_t numInstancesExpensive;    //!< number of enabled expensive instances
+    size_t numMBInstancesExpensive;  //!< number of enabled motion blurred expensive instances
+    size_t numGrids;                 //!< number of enabled grid geometries
+    size_t numMBGrids;               //!< number of enabled motion blurred grid geometries
+    size_t numPoints;                //!< number of enabled points
+    size_t numMBPoints;              //!< number of enabled motion blurred points
+  };
+
+  /*! Base class all geometries are derived from */
+  class Geometry : public RefCount
+  {
+    friend class Scene;
+  public:
+
+    /*! type of geometry */
+    enum GType
+    {
+      GTY_FLAT_LINEAR_CURVE = 0,
+      GTY_ROUND_LINEAR_CURVE = 1,
+      GTY_ORIENTED_LINEAR_CURVE = 2,
+      GTY_CONE_LINEAR_CURVE = 3,
+      
+      GTY_FLAT_BEZIER_CURVE = 4,
+      GTY_ROUND_BEZIER_CURVE = 5,
+      GTY_ORIENTED_BEZIER_CURVE = 6,
+      
+      GTY_FLAT_BSPLINE_CURVE = 8,
+      GTY_ROUND_BSPLINE_CURVE = 9,
+      GTY_ORIENTED_BSPLINE_CURVE = 10,
+
+      GTY_FLAT_HERMITE_CURVE = 12,
+      GTY_ROUND_HERMITE_CURVE = 13,
+      GTY_ORIENTED_HERMITE_CURVE = 14,
+      
+      GTY_FLAT_CATMULL_ROM_CURVE = 16,
+      GTY_ROUND_CATMULL_ROM_CURVE = 17,
+      GTY_ORIENTED_CATMULL_ROM_CURVE = 18,      
+
+      GTY_TRIANGLE_MESH = 20,
+      GTY_QUAD_MESH = 21,
+      GTY_GRID_MESH = 22,
+      GTY_SUBDIV_MESH = 23,
+
+      GTY_SPHERE_POINT = 25,
+      GTY_DISC_POINT = 26,
+      GTY_ORIENTED_DISC_POINT = 27,
+      
+      GTY_USER_GEOMETRY = 29,
+      GTY_INSTANCE_CHEAP = 30,
+      GTY_INSTANCE_EXPENSIVE = 31,
+      GTY_END = 32,
+
+      GTY_BASIS_LINEAR = 0,
+      GTY_BASIS_BEZIER = 4,
+      GTY_BASIS_BSPLINE = 8,
+      GTY_BASIS_HERMITE = 12,
+      GTY_BASIS_CATMULL_ROM = 16,
+      GTY_BASIS_MASK = 28,
+
+      GTY_SUBTYPE_FLAT_CURVE = 0,
+      GTY_SUBTYPE_ROUND_CURVE = 1,
+      GTY_SUBTYPE_ORIENTED_CURVE = 2,
+      GTY_SUBTYPE_MASK = 3,
+    };
+
+    enum GSubType
+    {
+      GTY_SUBTYPE_DEFAULT= 0,
+      GTY_SUBTYPE_INSTANCE_LINEAR = 0,
+      GTY_SUBTYPE_INSTANCE_QUATERNION = 1
+    };
+
+    enum GTypeMask
+    {
+      MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE,
+      MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE,
+      MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE,
+      MTY_ORIENTED_LINEAR_CURVE = 1ul << GTY_ORIENTED_LINEAR_CURVE,
+      
+      MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE,
+      MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE,
+      MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE,
+      
+      MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE,
+      MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE,
+      MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE,
+
+      MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE,
+      MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE,
+      MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE,
+
+      MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE,
+      MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE,
+      MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE,
+
+      MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE,
+      
+      MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE |
+                   MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE |
+                   MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE |
+                   MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE,
+
+      MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT,
+      MTY_DISC_POINT = 1ul << GTY_DISC_POINT,
+      MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT,
+
+      MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT,
+
+      MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS,
+
+      MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH,
+      MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH,
+      MTY_GRID_MESH = 1ul << GTY_GRID_MESH,
+      MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH,
+      MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY,
+
+      MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP,
+      MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE,
+      MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE
+    };
+
+    static const char* gtype_names[GTY_END];
+
+    enum class State : unsigned {
+      MODIFIED = 0,
+      COMMITTED = 1,
+    };
+
+  public:
+    
+    /*! Geometry constructor */
+    Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps);
+
+    /*! Geometry destructor */
+    virtual ~Geometry();
+
+  public:
+
+    /*! tests if geometry is enabled */
+    __forceinline bool isEnabled() const { return enabled; }
+
+    /*! tests if geometry is disabled */
+    __forceinline bool isDisabled() const { return !isEnabled(); }
+
+    /*! tests if that geometry has some filter function set */
+    __forceinline bool hasFilterFunctions () const {
+      return (intersectionFilterN  != nullptr) || (occlusionFilterN  != nullptr);
+    }
+
+    /*! returns geometry type */
+    __forceinline GType getType() const { return gtype; }
+
+    /*! returns curve type */
+    __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); }
+
+    /*! returns curve basis */
+    __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); }
+
+    /*! returns geometry type mask */
+    __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1 << gtype); }
+
+    /*! returns number of primitives */
+    __forceinline size_t size() const { return numPrimitives; }
+
+    /*! sets the number of primitives */
+    virtual void setNumPrimitives(unsigned int numPrimitives_in);
+
+    /*! sets number of time steps */
+    virtual void setNumTimeSteps (unsigned int numTimeSteps_in);
+
+    /*! sets motion blur time range */
+    void setTimeRange (const BBox1f range);
+
+    /*! sets number of vertex attributes */
+    virtual void setVertexAttributeCount (unsigned int N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! sets number of topologies */
+    virtual void setTopologyCount (unsigned int N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! sets the build quality */
+    void setBuildQuality(RTCBuildQuality quality_in)
+    {
+      this->quality = quality_in;
+      Geometry::update();
+    }
+
+    /* calculate time segment itime and fractional time ftime */
+    __forceinline int timeSegment(float time, float& ftime) const {
+      return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime);
+    }
+
+    template<int N>
+      __forceinline vint<N> timeSegment(const vfloat<N>& time, vfloat<N>& ftime) const {
+      return getTimeSegment<N>(time,vfloat<N>(time_range.lower),vfloat<N>(time_range.upper),vfloat<N>(fnumTimeSegments),ftime);
+    }
+    
+    /* calculate overlapping time segment range */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,fnumTimeSegments);
+    }
+
+    /* returns time that corresponds to time step */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<(int)numTimeSteps);
+      return time_range.lower + time_range.size()*float(i)/fnumTimeSegments;
+    }
+    
+    /*! for all geometries */
+  public:
+
+    /*! Enable geometry. */
+    virtual void enable();
+
+    /*! Update geometry. */
+    void update();
+    
+    /*! commit of geometry */
+    virtual void commit();
+
+    /*! Update geometry buffer. */
+    virtual void updateBuffer(RTCBufferType type, unsigned int slot) {
+      update(); // update everything for geometries not supporting this call
+    }
+    
+    /*! Disable geometry. */
+    virtual void disable();
+
+    /*! Verify the geometry */
+    virtual bool verify() { return true; }
+
+    /*! called before every build */
+    virtual void preCommit();
+  
+    /*! called after every build */
+    virtual void postCommit();
+
+    virtual void addElementsToCount (GeometryCounts & counts) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    };
+
+    /*! sets constant tessellation rate for the geometry */
+    virtual void setTessellationRate(float N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Sets the maximal curve radius scale allowed by min-width feature. */
+    virtual void setMaxRadiusScale(float s) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set user data pointer. */
+    virtual void setUserData(void* ptr);
+      
+    /*! Get user data pointer. */
+    __forceinline void* getUserData() const {
+      return userPtr;
+    }
+
+    /*! interpolates user data to the specified u/v location */
+    virtual void interpolate(const RTCInterpolateArguments* const args) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! interpolates user data to the specified u/v locations */
+    virtual void interpolateN(const RTCInterpolateNArguments* const args);
+
+    /* point query api */
+    bool pointQuery(PointQuery* query, PointQueryContext* context);
+
+    /*! for subdivision surfaces only */
+  public:
+    virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set displacement function. */
+    virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getFirstHalfEdge(unsigned int faceID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getFace(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    virtual unsigned int getNextHalfEdge(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! get fast access to first vertex buffer if applicable */
+    virtual float * getCompactVertexArray () const {
+      return nullptr;
+    }
+
+    /*! Returns the modified counter - how many times the geo has been modified */
+    __forceinline unsigned int getModCounter () const {
+      return modCounter_;
+    }
+
+    /*! for triangle meshes and bezier curves only */
+  public:
+
+
+    /*! Sets ray mask. */
+    virtual void setMask(unsigned mask) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Sets specified buffer. */
+    virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Gets specified buffer. */
+    virtual void* getBuffer(RTCBufferType type, unsigned int slot) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
+    /*! Set intersection filter function for ray packets of size N. */
+    virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN);
+
+    /*! Set occlusion filter function for ray packets of size N. */
+    virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN);
+
+    /*! for instances only */
+  public:
+
+    /*! Sets the instanced scene */
+    virtual void setInstancedScene(const Ref<Scene>& scene) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
+    /*! Sets transformation of the instance */
+    virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Sets transformation of the instance */
+    virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Returns the transformation of the instance */
+    virtual AffineSpace3fa getTransform(float time) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! for user geometries only */
+  public:
+
+    /*! Set bounds function. */
+    virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set intersect function for ray packets of size N. */
+    virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Set occlusion function for ray packets of size N. */
+    virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Set point query function. */
+    void setPointQueryFunction(RTCPointQueryFunction func);
+
+    /*! returns number of time segments */
+    __forceinline unsigned numTimeSegments () const {
+      return numTimeSteps-1;
+    }
+
+  public:
+
+    virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); 
+    }
+
+    virtual PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); 
+    }
+
+    virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); 
+    }
+
+    virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); 
+    }
+
+    virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); 
+    }
+    
+    virtual Vec3fa computeDirection(unsigned int primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); 
+    }
+
+    virtual Vec3fa computeDirection(unsigned int primID, size_t time) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); 
+    }
+
+    virtual BBox3fa vbounds(size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+    
+    virtual BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+
+    virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+
+    virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+    
+    virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+
+    virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+    
+  public:
+    __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; }
+    __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; }
+
+  public:
+    Device* device;             //!< device this geometry belongs to
+
+    void* userPtr;              //!< user pointer
+    unsigned int numPrimitives; //!< number of primitives of this geometry
+    
+    unsigned int numTimeSteps;  //!< number of time steps
+    float fnumTimeSegments;     //!< number of time segments (precalculation)
+    BBox1f time_range;          //!< motion blur time range
+    
+    unsigned int mask;             //!< for masking out geometry
+    unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when geo is modified
+    
+    struct {
+      GType gtype : 8;                //!< geometry type
+      GSubType gsubtype : 8;          //!< geometry subtype
+      RTCBuildQuality quality : 3;    //!< build quality for geometry
+      unsigned state : 2;
+      bool enabled : 1;              //!< true if geometry is enabled
+    };
+       
+    RTCFilterFunctionN intersectionFilterN;
+    RTCFilterFunctionN occlusionFilterN;
+    RTCPointQueryFunction pointQueryFunc;
+  };
+}
diff --git a/thirdparty/embree/kernels/common/hit.h b/thirdparty/embree/kernels/common/hit.h
new file mode 100644
index 0000000000..fd1a9d6391
--- /dev/null
+++ b/thirdparty/embree/kernels/common/hit.h
@@ -0,0 +1,114 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "instance_stack.h"
+
+namespace embree
+{
+  /* Hit structure for K hits */
+  template<int K>
+    struct HitK
+  {
+    /* Default construction does nothing */
+    __forceinline HitK() {}
+
+    /* Constructs a hit */
+    __forceinline HitK(const RTCIntersectContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng)
+      : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+      instance_id_stack::copy_UV<K>(context->instID, instID);
+    }
+
+    /* Returns the size of the hit */
+    static __forceinline size_t size() { return K; }
+
+  public:
+    Vec3vf<K> Ng;  // geometry normal
+    vfloat<K> u;         // barycentric u coordinate of hit
+    vfloat<K> v;         // barycentric v coordinate of hit
+    vuint<K> primID;      // primitive ID
+    vuint<K> geomID;      // geometry ID
+    vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance ID
+  };
+
+  /* Specialization for a single hit */
+  template<>
+    struct __aligned(16) HitK<1>
+  {
+     /* Default construction does nothing */
+    __forceinline HitK() {}
+
+    /* Constructs a hit */
+    __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng)
+      : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID)
+    {
+      instance_id_stack::copy_UU(context->instID, instID);
+    }
+
+    /* Returns the size of the hit */
+    static __forceinline size_t size() { return 1; }
+
+  public:
+    Vec3<float> Ng;  // geometry normal
+    float u;         // barycentric u coordinate of hit
+    float v;         // barycentric v coordinate of hit
+    unsigned int primID;      // primitive ID
+    unsigned int geomID;      // geometry ID
+    unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance ID
+  };
+
+  /* Shortcuts */
+  typedef HitK<1>  Hit;
+  typedef HitK<4>  Hit4;
+  typedef HitK<8>  Hit8;
+  typedef HitK<16> Hit16;
+
+  /* Outputs hit to stream */
+  template<int K>
+  __forceinline embree_ostream operator<<(embree_ostream cout, const HitK<K>& ray)
+  {
+    cout << "{ " << embree_endl
+         << "  Ng = " << ray.Ng <<  embree_endl
+         << "  u = " << ray.u <<  embree_endl
+         << "  v = " << ray.v << embree_endl
+         << "  primID = " << ray.primID <<  embree_endl
+         << "  geomID = " << ray.geomID << embree_endl
+         << "  instID =";
+    for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    {
+      cout << " " << ray.instID[l];
+    }
+    cout << embree_endl;
+    return cout << "}";
+  }
+
+  template<typename Hit>
+    __forceinline void copyHitToRay(RayHit& ray, const Hit& hit)
+  {
+    ray.Ng   = hit.Ng;
+    ray.u    = hit.u;
+    ray.v    = hit.v;
+    ray.primID = hit.primID;
+    ray.geomID = hit.geomID;
+    instance_id_stack::copy_UU(hit.instID, ray.instID);
+  }
+
+  template<int K>
+    __forceinline void copyHitToRay(const vbool<K> &mask, RayHitK<K> &ray, const HitK<K> &hit)
+  {
+    vfloat<K>::storeu(mask,&ray.Ng.x, hit.Ng.x);
+    vfloat<K>::storeu(mask,&ray.Ng.y, hit.Ng.y);
+    vfloat<K>::storeu(mask,&ray.Ng.z, hit.Ng.z);
+    vfloat<K>::storeu(mask,&ray.u, hit.u);
+    vfloat<K>::storeu(mask,&ray.v, hit.v);
+    vuint<K>::storeu(mask,&ray.primID, hit.primID);
+    vuint<K>::storeu(mask,&ray.geomID, hit.geomID);
+    instance_id_stack::copy_VV<K>(hit.instID, ray.instID, mask);
+  }
+}
diff --git a/thirdparty/embree/kernels/common/instance_stack.h b/thirdparty/embree/kernels/common/instance_stack.h
new file mode 100644
index 0000000000..d3c0a643f1
--- /dev/null
+++ b/thirdparty/embree/kernels/common/instance_stack.h
@@ -0,0 +1,179 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore.h"
+
+namespace embree {
+namespace instance_id_stack {
+
+static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0, 
+              "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0.");
+
+/*******************************************************************************
+ * Instance ID stack manipulation.
+ * This is used from the instance intersector.
+ ******************************************************************************/
+
+/* 
+ * Push an instance to the stack. 
+ */
+RTC_FORCEINLINE bool push(RTCIntersectContext* context, 
+                          unsigned instanceId)
+{
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT;
+  /* We assert here because instances are silently dropped when the stack is full. 
+     This might be quite hard to find in production. */
+  assert(spaceAvailable); 
+  if (likely(spaceAvailable))
+    context->instID[context->instStackSize++] = instanceId;
+  return spaceAvailable;
+#else
+  const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID);
+  assert(spaceAvailable); 
+  if (likely(spaceAvailable))
+    context->instID[0] = instanceId;
+  return spaceAvailable;
+#endif
+}
+
+
+/* 
+ * Pop the last instance pushed to the stack. 
+ * Do not call on an empty stack. 
+ */
+RTC_FORCEINLINE void pop(RTCIntersectContext* context)
+{
+  assert(context);
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  assert(context->instStackSize > 0);
+  context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID;
+#else
+  assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID);
+  context->instID[0] = RTC_INVALID_GEOMETRY_ID;
+#endif
+}
+
+/*
+ * Optimized instance id stack copy.
+ * The copy() functions will either copy full
+ * stacks or copy only until the last valid element has been copied, depending
+ * on RTC_MAX_INSTANCE_LEVEL_COUNT.
+ */
+RTC_FORCEINLINE void copy_UU(const unsigned* src, unsigned* tgt)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  tgt[0] = src[0];
+  
+#else
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
+    tgt[l] = src[l];
+    if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4)
+      if (src[l] == RTC_INVALID_GEOMETRY_ID)
+        break;
+  }
+#endif
+}
+
+template <int K>
+RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  tgt[0] = src[0];
+
+#else
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
+    tgt[l] = src[l];
+    if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4)
+      if (src[l] == RTC_INVALID_GEOMETRY_ID)
+        break;
+  }
+#endif
+}
+
+template <int K>
+RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt, size_t j)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  tgt[0][j] = src[0];
+
+#else
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
+    tgt[l][j] = src[l];
+    if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4)
+      if (src[l] == RTC_INVALID_GEOMETRY_ID)
+        break;
+  }
+#endif
+}
+
+template <int K>
+RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  vuint<K>::store(mask, tgt, src[0]);
+
+#else
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
+    vuint<K>::store(mask, tgt + l, src[l]);
+    if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4)
+      if (src[l] == RTC_INVALID_GEOMETRY_ID)
+        break;
+  }
+#endif
+}
+
+template <int K>
+RTC_FORCEINLINE void copy_VU(const vuint<K>* src, unsigned* tgt, size_t i)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  tgt[0] = src[0][i];
+
+#else
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
+    tgt[l] = src[l][i];
+    if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4)
+      if (src[l][i] == RTC_INVALID_GEOMETRY_ID)
+        break;
+  }
+#endif
+}
+
+template <int K>
+RTC_FORCEINLINE void copy_VV(const vuint<K>* src, vuint<K>* tgt, size_t i, size_t j)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  tgt[0][j] = src[0][i];
+
+#else
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
+    tgt[l][j] = src[l][i];
+    if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4)
+      if (src[l][i] == RTC_INVALID_GEOMETRY_ID)
+        break;
+  }
+#endif
+}
+
+template <int K>
+RTC_FORCEINLINE void copy_VV(const vuint<K>* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  vuint<K>::store(mask, tgt, src[0]);
+
+#else
+  vbool<K> done = !mask;
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
+    vuint<K>::store(mask, tgt + l, src[l]);
+    if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) {
+      done |= src[l] == RTC_INVALID_GEOMETRY_ID;
+      if (all(done)) break;
+    }
+  }
+#endif
+}
+
+} // namespace instance_id_stack
+} // namespace embree
diff --git a/thirdparty/embree/kernels/common/isa.h b/thirdparty/embree/kernels/common/isa.h
new file mode 100644
index 0000000000..ae6556336c
--- /dev/null
+++ b/thirdparty/embree/kernels/common/isa.h
@@ -0,0 +1,246 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+
+namespace embree
+{
+#define DEFINE_SYMBOL2(type,name)               \
+  typedef type (*name##Func)();                 \
+  name##Func name;
+  
+#define DECLARE_SYMBOL2(type,name)                                       \
+  namespace sse2   { extern type name(); }                           \
+  namespace sse42  { extern type name(); }                           \
+  namespace avx    { extern type name(); }                           \
+  namespace avx2   { extern type name(); }                           \
+  namespace avx512 { extern type name(); }                           \
+  void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal error in ISA selection for " TOSTRING(name)); } \
+  type name##_error() { return type(name##_error2); }                   \
+  type name##_zero() { return type(nullptr); }
+
+#define DECLARE_ISA_FUNCTION(type,symbol,args)                            \
+  namespace sse2   { extern type symbol(args); }                       \
+  namespace sse42  { extern type symbol(args); }                       \
+  namespace avx    { extern type symbol(args); }                       \
+  namespace avx2   { extern type symbol(args); }                       \
+  namespace avx512 { extern type symbol(args); }                     \
+  inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \
+  typedef type (*symbol##Ty)(args);                                       \
+  
+#define DEFINE_ISA_FUNCTION(type,symbol,args)   \
+  typedef type (*symbol##Func)(args);           \
+  symbol##Func symbol;
+  
+#define ZERO_SYMBOL(features,intersector)                      \
+  intersector = intersector##_zero;
+
+#define INIT_SYMBOL(features,intersector)                      \
+  intersector = decltype(intersector)(intersector##_error);
+
+#define SELECT_SYMBOL_DEFAULT(features,intersector) \
+  intersector = isa::intersector;
+
+#if defined(__SSE__)
+#if !defined(EMBREE_TARGET_SIMD4)
+#define EMBREE_TARGET_SIMD4
+#endif
+#endif
+
+#if defined(EMBREE_TARGET_SSE42)
+#define SELECT_SYMBOL_SSE42(features,intersector) \
+  if ((features & SSE42) == SSE42) intersector = sse42::intersector;
+#else
+#define SELECT_SYMBOL_SSE42(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX) || defined(__AVX__)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif
+#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target
+#define SELECT_SYMBOL_AVX(features,intersector)                 \
+  if ((features & ISA) == ISA) intersector = isa::intersector;
+#else
+#define SELECT_SYMBOL_AVX(features,intersector)                 \
+  if ((features & AVX) == AVX) intersector = avx::intersector;
+#endif
+#else
+#define SELECT_SYMBOL_AVX(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX2)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif
+#define SELECT_SYMBOL_AVX2(features,intersector) \
+  if ((features & AVX2) == AVX2) intersector = avx2::intersector;
+#else
+#define SELECT_SYMBOL_AVX2(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX512)
+#if !defined(EMBREE_TARGET_SIMD16)
+#define EMBREE_TARGET_SIMD16
+#endif
+#define SELECT_SYMBOL_AVX512(features,intersector) \
+  if ((features & AVX512) == AVX512) intersector = avx512::intersector;
+#else
+#define SELECT_SYMBOL_AVX512(features,intersector)
+#endif
+
+#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);            \
+  SELECT_SYMBOL_SSE42(features,intersector);                                  
+  
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                \
+  SELECT_SYMBOL_SSE42(features,intersector);                  \
+  SELECT_SYMBOL_AVX(features,intersector);                        
+  
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                     \
+  SELECT_SYMBOL_SSE42(features,intersector);                       \
+  SELECT_SYMBOL_AVX(features,intersector);                         \
+  SELECT_SYMBOL_AVX2(features,intersector);                       
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                          \
+  SELECT_SYMBOL_SSE42(features,intersector);                            \
+  SELECT_SYMBOL_AVX(features,intersector);                              \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                   \
+  SELECT_SYMBOL_AVX(features,intersector);                                       \
+  SELECT_SYMBOL_AVX2(features,intersector);                                      \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                         \
+  SELECT_SYMBOL_AVX(features,intersector);                             \
+  SELECT_SYMBOL_AVX2(features,intersector);                            \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                         \
+  SELECT_SYMBOL_SSE42(features,intersector);                                           \
+  SELECT_SYMBOL_AVX(features,intersector);                                             \
+  SELECT_SYMBOL_AVX2(features,intersector);                                            \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                               \
+  SELECT_SYMBOL_SSE42(features,intersector);                                 \
+  SELECT_SYMBOL_AVX(features,intersector);                                   \
+  SELECT_SYMBOL_AVX2(features,intersector);                                  \
+  SELECT_SYMBOL_AVX512(features,intersector);
+  
+#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);          \
+  SELECT_SYMBOL_AVX(features,intersector);                        
+  
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);               \
+  SELECT_SYMBOL_AVX(features,intersector);                   \
+  SELECT_SYMBOL_AVX2(features,intersector);                       
+  
+#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);
+  
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                              \
+  SELECT_SYMBOL_AVX(features,intersector);                                  \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                        \
+  SELECT_SYMBOL_AVX512(features,intersector);
+  
+#define SELECT_SYMBOL_INIT_AVX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                 \
+  SELECT_SYMBOL_AVX(features,intersector);                                
+  
+#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \
+  INIT_SYMBOL(features,intersector);                      \
+  SELECT_SYMBOL_AVX(features,intersector);                \
+  SELECT_SYMBOL_AVX2(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                \
+  SELECT_SYMBOL_AVX(features,intersector);                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                         \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \
+  INIT_SYMBOL(features,intersector);                            \
+  SELECT_SYMBOL_SSE42(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                      \
+  SELECT_SYMBOL_AVX2(features,intersector);
+  
+#define SELECT_SYMBOL_INIT_AVX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                           \
+  SELECT_SYMBOL_AVX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX512(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                     \
+  SELECT_SYMBOL_AVX(features,intersector);                               \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                \
+  SELECT_SYMBOL_AVX(features,intersector);                          \
+  SELECT_SYMBOL_AVX2(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                          \
+  SELECT_SYMBOL_AVX(features,intersector);                                    \
+  SELECT_SYMBOL_AVX2(features,intersector);                                   \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                                \
+  SELECT_SYMBOL_SSE42(features,intersector);                                        \
+  SELECT_SYMBOL_AVX(features,intersector);                                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                                         \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512(features,intersector) \
+  ZERO_SYMBOL(features,intersector);                                    \
+  SELECT_SYMBOL_SSE42(features,intersector);                            \
+  SELECT_SYMBOL_AVX(features,intersector);                              \
+  SELECT_SYMBOL_AVX2(features,intersector);                             \
+  SELECT_SYMBOL_AVX512(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                   \
+  SELECT_SYMBOL_AVX(features,intersector);                                       \
+  SELECT_SYMBOL_AVX2(features,intersector);                                      \
+  SELECT_SYMBOL_AVX512(features,intersector);
+  
+#define SELECT_SYMBOL_INIT_AVX512(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512(features,intersector);
+  
+#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_SSE42(features,intersector);               \
+  SELECT_SYMBOL_AVX(features,intersector);                 \
+  SELECT_SYMBOL_AVX2(features,intersector);
+
+  struct VerifyMultiTargetLinking {
+    static __noinline int getISA(int depth = 5) { 
+      if (depth == 0) return ISA; 
+      else return getISA(depth-1); 
+    }
+  };
+  namespace sse2   { int getISA(); };
+  namespace sse42  { int getISA(); };
+  namespace avx    { int getISA(); };
+  namespace avx2   { int getISA(); };
+  namespace avx512 { int getISA(); };
+}
diff --git a/thirdparty/embree/kernels/common/motion_derivative.h b/thirdparty/embree/kernels/common/motion_derivative.h
new file mode 100644
index 0000000000..c619d6a675
--- /dev/null
+++ b/thirdparty/embree/kernels/common/motion_derivative.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/math/affinespace.h"
+#include "../../common/math/interval.h"
+
+#include <functional>
+
+namespace embree {
+
+#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f
+
+static void motion_derivative_coefficients(const float *p, float *coeff);
+
+struct MotionDerivativeCoefficients
+{
+  float theta;
+  float coeffs[3*8*7];
+
+  MotionDerivativeCoefficients() {}
+
+  // xfm0 and xfm1 are interpret as quaternion decomposition
+  MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, AffineSpace3ff const& xfm1)
+  {
+    // cosTheta of the two quaternions
+    const float cosTheta = min(1.f, max(-1.f,
+                           xfm0.l.vx.w * xfm1.l.vx.w
+                         + xfm0.l.vy.w * xfm1.l.vy.w
+                         + xfm0.l.vz.w * xfm1.l.vz.w
+                         + xfm0.p.w * xfm1.p.w));
+
+    theta = std::acos(cosTheta);
+    Vec4f qperp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w);
+    if (cosTheta < 0.995f) {
+      // compute perpendicular quaternion
+      qperp.x = xfm1.p.w    - cosTheta * xfm0.p.w;
+      qperp.y = xfm1.l.vx.w - cosTheta * xfm0.l.vx.w;
+      qperp.z = xfm1.l.vy.w - cosTheta * xfm0.l.vy.w;
+      qperp.w = xfm1.l.vz.w - cosTheta * xfm0.l.vz.w;
+      qperp = normalize(qperp);
+    }
+    const float p[33] = {
+      theta,
+      xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0
+      xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1
+      xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0
+      qperp.x, qperp.y, qperp.z, qperp.w,
+      xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0
+                   xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y,
+                                xfm0.l.vz.z, xfm0.p.z,
+      xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1
+                   xfm1.l.vy.y, xfm1.l.vz.y, xfm1.p.y,
+                                xfm1.l.vz.z, xfm1.p.z
+    };
+    motion_derivative_coefficients(p, coeffs);
+  }
+};
+
+struct MotionDerivative
+{
+  float twoTheta;
+  float c[8];
+
+  MotionDerivative(MotionDerivativeCoefficients const& mdc,
+                    int dim, Vec3fa const& p0, Vec3fa const& p1)
+    : twoTheta(2.f*mdc.theta)
+  {
+    const float p[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z };
+    for (int i = 0; i < 8; ++i) {
+      c[i] = 0;
+      for (int j = 0; j < 7; ++j) {
+        c[i] += mdc.coeffs[8*7*dim + i*7 + j] * p[j];
+      }
+    }
+  }
+
+  template<typename T>
+  struct EvalMotionDerivative
+  {
+    MotionDerivative const& md;
+    float offset;
+
+    EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {}
+
+    T operator()(T const& time) const {
+      return md.c[0] + md.c[1] * time
+          + (md.c[2] + md.c[3] * time + md.c[4] * time * time) * cos(md.twoTheta * time)
+          + (md.c[5] + md.c[6] * time + md.c[7] * time * time) * sin(md.twoTheta * time)
+          + offset;
+    }
+  };
+
+  unsigned int findRoots(
+    Interval1f const& interval,
+    float offset,
+    float* roots,
+    unsigned int maxNumRoots)
+  {
+    unsigned int numRoots = 0;
+    EvalMotionDerivative<Interval1f> eval(*this, offset);
+    findRoots(eval, interval, numRoots, roots, maxNumRoots);
+    return numRoots;
+  }
+
+  template<typename Eval>
+  static void findRoots(
+
+    Eval const& eval,
+    Interval1f const& interval,
+    unsigned int& numRoots,
+    float* roots,
+    unsigned int maxNumRoots)
+  {
+    Interval1f range = eval(interval);
+    if (range.lower > 0 || range.upper < 0 || range.lower >= range.upper) return;
+
+    const float split = 0.5f * (interval.upper + interval.lower);
+    if (interval.upper-interval.lower < 1e-7f || abs(split-interval.lower) < 1e-7f ||  abs(split-interval.upper) < 1e-7f)
+    {
+      // check if the root already exists
+      for (unsigned int k = 0; k < numRoots && k < maxNumRoots; ++k) {
+        if (abs(roots[k]-split) < MOTION_DERIVATIVE_ROOT_EPSILON)
+        return;
+      }
+      if (numRoots < maxNumRoots) {
+        roots[numRoots++] = split;
+      }
+      if (numRoots > maxNumRoots) {
+        printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS
+        return;
+      }
+      return;
+    }
+
+    findRoots(eval, Interval1f(interval.lower, split), numRoots, roots, maxNumRoots);
+    findRoots(eval, Interval1f(split, interval.upper), numRoots, roots, maxNumRoots);
+  }
+};
+
+/******************************************************************************
+ *                       Code generated with sympy 1.4                        *
+ *              See http://www.sympy.org/ for more information.               *
+ *                                                                            *
+ * see                                                                        *
+ *                                                                            *
+ *     scripts/generate_motion_derivative_coefficients.py                     *
+ *                                                                            *
+ * for how this code is generated                                             *
+ *                                                                            *
+ ******************************************************************************/
+static void motion_derivative_coefficients(const float *p, float *coeff)
+{
+   coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27];
+   coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24];
+   coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25];
+   coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26];
+   coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15];
+   coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16];
+   coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17];
+   coeff[7] = 0;
+   coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24];
+   coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25];
+   coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26];
+   coeff[11] = 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24];
+   coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25];
+   coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26];
+   coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27];
+   coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24];
+   coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25];
+   coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26];
+   coeff[18] = (-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15];
+   coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16];
+   coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17];
+   coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0];
+   coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24];
+   coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25];
+   coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26];
+   coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24];
+   coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25];
+   coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26];
+   coeff[28] = 0;
+   coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0];
+   coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0];
+   coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0];
+   coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0];
+   coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0];
+   coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0];
+   coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27];
+   coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24];
+   coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25];
+   coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26];
+   coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15];
+   coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16];
+   coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17];
+   coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0];
+   coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24];
+   coeff[44] = -4*p[0]*p[7]*p[10]*p[19] + 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25];
+   coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26];
+   coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24];
+   coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25];
+   coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26];
+   coeff[49] = 0;
+   coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0];
+   coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0];
+   coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0];
+   coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0];
+   coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0];
+   coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0];
+   coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30];
+   coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24];
+   coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28];
+   coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29];
+   coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15];
+   coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19];
+   coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20];
+   coeff[63] = 0;
+   coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+   coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28];
+   coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29];
+   coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+   coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28];
+   coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29];
+   coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30];
+   coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24];
+   coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28];
+   coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29];
+   coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15];
+   coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19];
+   coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20];
+   coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0];
+   coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+   coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28];
+   coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29];
+   coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+   coeff[82] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28];
+   coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29];
+   coeff[84] = 0;
+   coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0];
+   coeff[86] = 2*(p[7]*p[11]*p[19] - p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0];
+   coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0];
+   coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0];
+   coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0];
+   coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0];
+   coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30];
+   coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24];
+   coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28];
+   coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29];
+   coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15];
+   coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19];
+   coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20];
+   coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0];
+   coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24];
+   coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28];
+   coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29];
+   coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24];
+   coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 2*p[10]*p[14]*p[28];
+   coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29];
+   coeff[105] = 0;
+   coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0];
+   coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0];
+   coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0];
+   coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0];
+   coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0];
+   coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0];
+   coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32];
+   coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24];
+   coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28];
+   coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31];
+   coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15];
+   coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19];
+   coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22];
+   coeff[119] = 0;
+   coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+   coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+   coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] + 2*p[22] - 2*p[31];
+   coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+   coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+   coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31];
+   coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30];
+   coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24];
+   coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28];
+   coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29];
+   coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15];
+   coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19];
+   coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + p[13]*p[13]*p[22] - p[13]*p[14]*p[20];
+   coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0];
+   coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+   coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+   coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29];
+   coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+   coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+   coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29];
+   coeff[140] = 0;
+   coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0];
+   coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0];
+   coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0];
+   coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0];
+   coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0];
+   coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0];
+   coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32];
+   coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24];
+   coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28];
+   coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31];
+   coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15];
+   coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19];
+   coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22];
+   coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0];
+   coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24];
+   coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28];
+   coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31];
+   coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24];
+   coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28];
+   coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 2*p[10]*p[14]*p[31];
+   coeff[161] = 0;
+   coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0];
+   coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0];
+   coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0];
+   coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0];
+   coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0];
+   coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + p[13]*p[14]*p[29])*p[0];
+}
+
+} // namespace embree
diff --git a/thirdparty/embree/kernels/common/point_query.h b/thirdparty/embree/kernels/common/point_query.h
new file mode 100644
index 0000000000..7d55c91fff
--- /dev/null
+++ b/thirdparty/embree/kernels/common/point_query.h
@@ -0,0 +1,136 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /* Point query structure for closest point query */
+  template<int K>
+  struct RTC_ALIGN(16) PointQueryK 
+  {
+    /* Default construction does nothing */
+    __forceinline PointQueryK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     * has to be smaller than far */
+    __forceinline PointQueryK(const Vec3vf<K>& p, const vfloat<K>& radius = inf, const vfloat<K>& time = zero)
+      : p(p), time(time), radius(radius) {}
+
+    /* Returns the size of the ray */
+    static __forceinline size_t size() { return K; }
+
+    /* Calculates if this is a valid ray that does not cause issues during traversal */
+    __forceinline vbool<K> valid() const
+    {
+      const vbool<K> vx = (abs(p.x) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vy = (abs(p.y) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vz = (abs(p.z) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vn = radius >= vfloat<K>(0);
+      const vbool<K> vf = abs(time) < vfloat<K>(inf);
+      return vx & vy & vz & vn & vf;
+    }
+
+    __forceinline void get(PointQueryK<1>* ray) const;
+    __forceinline void get(size_t i, PointQueryK<1>& ray) const;
+    __forceinline void set(const PointQueryK<1>* ray);
+    __forceinline void set(size_t i, const PointQueryK<1>& ray);
+
+    Vec3vf<K> p;      // location of the query point
+    vfloat<K> time;   // time for motion blur
+    vfloat<K> radius; // radius for the point query
+  };
+  
+  /* Specialization for a single point query */
+  template<>
+  struct RTC_ALIGN(16) PointQueryK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline PointQueryK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     *  has to be smaller than far */
+    __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero)
+      : p(p), time(time), radius(radius) {}
+
+    /* Calculates if this is a valid ray that does not cause issues during traversal */
+    __forceinline bool valid() const {
+      return all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius))) && abs(time) < float(inf);
+    }
+
+    Vec3f p;  
+    float time;
+    float radius;
+  };
+  
+  /* Converts point query packet to single point query */
+  template<int K>
+  __forceinline void PointQueryK<K>::get(PointQueryK<1>* query) const
+  {
+    for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose
+    {
+      query[i].p.x    = p.x[i]; 
+      query[i].p.y    = p.y[i]; 
+      query[i].p.z    = p.z[i];
+      query[i].time   = time[i];
+      query[i].radius = radius[i]; 
+    }
+  }
+
+  /* Extracts a single point query out of a point query packet*/
+  template<int K>
+  __forceinline void PointQueryK<K>::get(size_t i, PointQueryK<1>& query) const
+  {
+    query.p.x    = p.x[i]; 
+    query.p.y    = p.y[i]; 
+    query.p.z    = p.z[i];
+    query.radius = radius[i];  
+    query.time   = time[i];  
+  }
+
+  /* Converts single point query to point query packet */
+  template<int K>
+  __forceinline void PointQueryK<K>::set(const PointQueryK<1>* query)
+  {
+    for (size_t i = 0; i < K; i++)
+    {
+      p.x[i]    = query[i].p.x;
+      p.y[i]    = query[i].p.y;
+      p.z[i]    = query[i].p.z;
+      radius[i] = query[i].radius; 
+      time[i]   = query[i].time; 
+    }
+  }
+
+  /* inserts a single point query into a point query packet element */
+  template<int K>
+  __forceinline void PointQueryK<K>::set(size_t i, const PointQueryK<1>& query)
+  {
+    p.x[i]    = query.p.x;
+    p.y[i]    = query.p.y;
+    p.z[i]    = query.p.z;
+    radius[i] = query.radius; 
+    time[i]   = query.time; 
+  }
+
+  /* Shortcuts */
+  typedef PointQueryK<1>  PointQuery;
+  typedef PointQueryK<4>  PointQuery4;
+  typedef PointQueryK<8>  PointQuery8;
+  typedef PointQueryK<16> PointQuery16;
+  struct PointQueryN;
+
+  /* Outputs point query to stream */
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const PointQueryK<K>& query)
+  {
+    cout << "{ " << embree_endl
+        << "  p = "    << query.p      << embree_endl
+        << "  r = "    << query.radius << embree_endl
+        << "  time = " << query.time   << embree_endl
+        << "}";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree/kernels/common/primref.h b/thirdparty/embree/kernels/common/primref.h
new file mode 100644
index 0000000000..d61763487b
--- /dev/null
+++ b/thirdparty/embree/kernels/common/primref.h
@@ -0,0 +1,138 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! A primitive reference stores the bounds of the primitive and its ID. */
+  struct __aligned(32) PrimRef 
+  {
+    __forceinline PrimRef () {}
+
+#if defined(__AVX__)
+    __forceinline PrimRef(const PrimRef& v) { 
+      vfloat8::store((float*)this,vfloat8::load((float*)&v));
+    }
+    __forceinline PrimRef& operator=(const PrimRef& v) { 
+      vfloat8::store((float*)this,vfloat8::load((float*)&v)); return *this;
+    }
+#endif
+
+    __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID) 
+    {
+      lower = Vec3fx(bounds.lower, geomID);
+      upper = Vec3fx(bounds.upper, primID);
+    }
+
+    __forceinline PrimRef (const BBox3fa& bounds, size_t id) 
+    {
+#if defined(__64BIT__)
+      lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF));
+      upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF));
+#else
+      lower = Vec3fx(bounds.lower, (unsigned)id);
+      upper = Vec3fx(bounds.upper, (unsigned)0);
+#endif
+    }
+
+    /*! calculates twice the center of the primitive */
+    __forceinline const Vec3fa center2() const {
+      return lower+upper;
+    }
+    
+    /*! return the bounding box of the primitive */
+    __forceinline const BBox3fa bounds() const {
+      return BBox3fa(lower,upper);
+    }
+
+    /*! size for bin heuristic is 1 */
+    __forceinline unsigned size() const { 
+      return 1;
+    }
+
+    /*! returns bounds and centroid used for binning */
+    __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const 
+    {
+      bounds_o = bounds();
+      center_o = embree::center2(bounds_o);
+    }
+
+    __forceinline unsigned& geomIDref() {  // FIXME: remove !!!!!!!
+      return lower.u;
+    }
+    __forceinline unsigned& primIDref() {  // FIXME: remove !!!!!!!
+      return upper.u;
+    }
+    
+    /*! returns the geometry ID */
+    __forceinline unsigned geomID() const { 
+      return lower.a;
+    }
+
+    /*! returns the primitive ID */
+    __forceinline unsigned primID() const { 
+      return upper.a;
+    }
+
+    /*! returns an size_t sized ID */
+    __forceinline size_t ID() const { 
+#if defined(__64BIT__)
+      return size_t(lower.u) + (size_t(upper.u) << 32);
+#else
+      return size_t(lower.u);
+#endif
+    }
+
+    /*! special function for operator< */
+    __forceinline uint64_t ID64() const {
+      return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+    }
+    
+    /*! allows sorting the primrefs by ID */
+    friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) {
+      return p0.ID64() < p1.ID64();
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) {
+      return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }";
+    }
+
+  public:
+    Vec3fx lower;     //!< lower bounds and geomID
+    Vec3fx upper;     //!< upper bounds and primID
+  };
+
+  /*! fast exchange for PrimRefs */
+  __forceinline void xchg(PrimRef& a, PrimRef& b)
+  {
+#if defined(__AVX__)
+    const vfloat8 aa = vfloat8::load((float*)&a);
+    const vfloat8 bb = vfloat8::load((float*)&b);
+    vfloat8::store((float*)&a,bb);
+    vfloat8::store((float*)&b,aa);
+#else
+    std::swap(a,b);
+#endif
+  }
+
+  /************************************************************************************/
+  /************************************************************************************/
+  /************************************************************************************/
+  /************************************************************************************/
+  
+  struct SubGridBuildData {
+    unsigned short sx,sy;
+    unsigned int primID;
+    
+    __forceinline SubGridBuildData() {};
+    __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID) : sx(sx), sy(sy), primID(primID) {};
+    
+    __forceinline size_t x() const { return (size_t)sx & 0x7fff; }
+    __forceinline size_t y() const { return (size_t)sy & 0x7fff; }
+    
+  };
+}
diff --git a/thirdparty/embree/kernels/common/primref_mb.h b/thirdparty/embree/kernels/common/primref_mb.h
new file mode 100644
index 0000000000..fb08a05003
--- /dev/null
+++ b/thirdparty/embree/kernels/common/primref_mb.h
@@ -0,0 +1,262 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+#define MBLUR_BIN_LBBOX 1
+
+namespace embree
+{
+#if MBLUR_BIN_LBBOX
+
+  /*! A primitive reference stores the bounds of the primitive and its ID. */
+  struct PrimRefMB
+  {
+    typedef LBBox3fa BBox;
+
+    __forceinline PrimRefMB () {}
+
+    __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+      lbounds.bounds0.lower.a = geomID;
+      lbounds.bounds0.upper.a = primID;
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+
+    __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__64BIT__)
+      lbounds.bounds0.lower.a = id & 0xFFFFFFFF;
+      lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF;
+#else
+      lbounds.bounds0.lower.a = id;
+      lbounds.bounds0.upper.a = 0;
+#endif
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+    
+    __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__64BIT__)
+      lbounds.bounds0.lower.u = id & 0xFFFFFFFF;
+      lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;
+#else
+      lbounds.bounds0.lower.u = id;
+      lbounds.bounds0.upper.u = 0;
+#endif
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+
+    /*! returns bounds for binning */
+    __forceinline LBBox3fa bounds() const {
+      return lbounds;
+    }
+
+    /*! returns the number of time segments of this primref */
+    __forceinline unsigned size() const {
+      return lbounds.bounds1.lower.a;
+    }
+
+    __forceinline unsigned totalTimeSegments() const {
+      return lbounds.bounds1.upper.a;
+    }
+
+     /* calculate overlapping time segment range */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,float(totalTimeSegments()));
+    }
+
+     /* returns time that corresponds to time step */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<=(int)totalTimeSegments());
+      return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments());
+    }
+    
+    /*! checks if time range overlaps */
+    __forceinline bool time_range_overlap(const BBox1f& range) const
+    {
+      if (0.9999f*time_range.upper <= range.lower) return false;
+      if (1.0001f*time_range.lower >= range.upper) return false;
+      return true;
+    }
+
+    /*! returns center for binning */
+    __forceinline Vec3fa binCenter() const {
+      return center2(lbounds.interpolate(0.5f));
+    }
+
+    /*! returns bounds and centroid used for binning */
+    __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const
+    {
+      bounds_o = bounds();
+      center_o = binCenter();
+    }
+
+    /*! returns the geometry ID */
+    __forceinline unsigned geomID() const {
+      return lbounds.bounds0.lower.a;
+    }
+
+    /*! returns the primitive ID */
+    __forceinline unsigned primID() const {
+      return lbounds.bounds0.upper.a;
+    }
+
+    /*! returns an size_t sized ID */
+    __forceinline size_t ID() const {
+#if defined(__64BIT__)
+      return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32);
+#else
+      return size_t(lbounds.bounds0.lower.u);
+#endif
+    }
+
+    /*! special function for operator< */
+    __forceinline uint64_t ID64() const {
+      return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+    }
+
+    /*! allows sorting the primrefs by ID */
+    friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) {
+      return p0.ID64() < p1.ID64();
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) {
+      return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ",  total_segments = " << ref.totalTimeSegments() << " }";
+    }
+
+  public:
+    LBBox3fx lbounds;
+    BBox1f time_range; // entire geometry time range
+  };
+
+#else
+
+  /*! A primitive reference stores the bounds of the primitive and its ID. */
+  struct __aligned(16) PrimRefMB
+  {
+    typedef BBox3fa BBox;
+
+    __forceinline PrimRefMB () {}
+
+    __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+      : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+      bbox.lower.a = geomID;
+      bbox.upper.a = primID;
+    }
+    
+    __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__64BIT__)
+      bbox.lower.u = id & 0xFFFFFFFF;
+      bbox.upper.u = (id >> 32) & 0xFFFFFFFF;
+#else
+      bbox.lower.u = id;
+      bbox.upper.u = 0;
+#endif
+    }
+    
+    /*! returns bounds for binning */
+    __forceinline BBox3fa bounds() const {
+      return bbox;
+    }
+
+    /*! returns the number of time segments of this primref */
+    __forceinline unsigned int size() const { 
+      return _activeTimeSegments;
+    }
+
+    __forceinline unsigned int totalTimeSegments() const { 
+      return _totalTimeSegments;
+    }
+
+     /* calculate overlapping time segment range */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,float(_totalTimeSegments));
+    }
+
+     /* returns time that corresponds to time step */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<=(int)_totalTimeSegments);
+      return time_range.lower + time_range.size()*float(i)/float(_totalTimeSegments);
+    }
+    
+    /*! checks if time range overlaps */
+    __forceinline bool time_range_overlap(const BBox1f& range) const
+    {
+      if (0.9999f*time_range.upper <= range.lower) return false;
+      if (1.0001f*time_range.lower >= range.upper) return false;
+      return true;
+    }
+
+    /*! returns center for binning */
+    __forceinline Vec3fa binCenter() const {
+      return center2(bounds());
+    }
+
+    /*! returns bounds and centroid used for binning */
+    __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const
+    {
+      bounds_o = bounds();
+      center_o = center2(bounds());
+    }
+
+    /*! returns the geometry ID */
+    __forceinline unsigned int geomID() const { 
+      return bbox.lower.a;
+    }
+
+    /*! returns the primitive ID */
+    __forceinline unsigned int primID() const { 
+      return bbox.upper.a;
+    }
+
+    /*! returns an size_t sized ID */
+    __forceinline size_t ID() const { 
+#if defined(__64BIT__)
+      return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32);
+#else
+      return size_t(bbox.lower.u);
+#endif
+    }
+
+    /*! special function for operator< */
+    __forceinline uint64_t ID64() const {
+      return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+    }
+    
+    /*! allows sorting the primrefs by ID */
+    friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) {
+      return p0.ID64() < p1.ID64();
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) {
+      return cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ",  total_segments = " << ref.totalTimeSegments() << " }";
+    }
+
+  public:
+    BBox3fa bbox; // bounds, geomID, primID
+    unsigned int _activeTimeSegments;
+    unsigned int _totalTimeSegments;
+    BBox1f time_range; // entire geometry time range
+  };
+
+#endif
+}
diff --git a/thirdparty/embree/kernels/common/profile.h b/thirdparty/embree/kernels/common/profile.h
new file mode 100644
index 0000000000..5ef7f6ec0f
--- /dev/null
+++ b/thirdparty/embree/kernels/common/profile.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! helper structure for the implementation of the profile functions below */
+  struct ProfileTimer
+  {
+    static const size_t N = 20;
+    
+    ProfileTimer () {}
+
+    ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0)
+    {
+      for (size_t i=0; i<N; i++) names[i] = nullptr;
+      for (size_t i=0; i<N; i++) dt_fst[i] = 0.0;
+      for (size_t i=0; i<N; i++) dt_min[i] = pos_inf;
+      for (size_t i=0; i<N; i++) dt_avg[i] = 0.0;
+      for (size_t i=0; i<N; i++) dt_max[i] = neg_inf;
+    }
+    
+    __forceinline void begin() 
+    {
+      j=0;
+      t0 = tj = getSeconds();
+    }
+
+    __forceinline void end() {
+      absolute("total");
+      i++;
+    }
+
+    __forceinline void operator() (const char* name) {
+      relative(name);
+    }
+
+    __forceinline void absolute (const char* name) 
+    {
+      const double t1 = getSeconds();
+      const double dt = t1-t0;
+      assert(names[j] == nullptr || names[j] == name);
+      names[j] = name;
+      if (i == 0) dt_fst[j] = dt;
+      if (i>=numSkip) {
+        dt_min[j] = min(dt_min[j],dt);
+        dt_avg[j] = dt_avg[j] + dt;
+        dt_max[j] = max(dt_max[j],dt);
+      }
+      j++;
+      maxJ = max(maxJ,j);
+    }
+
+    __forceinline void relative (const char* name) 
+    {
+      const double t1 = getSeconds();
+      const double dt = t1-tj;
+      tj = t1;
+      assert(names[j] == nullptr || names[j] == name);
+      names[j] = name;
+      if (i == 0) dt_fst[j] = dt;
+      if (i>=numSkip) {
+        dt_min[j] = min(dt_min[j],dt);
+        dt_avg[j] = dt_avg[j] + dt;
+        dt_max[j] = max(dt_max[j],dt);
+      }
+      j++;
+      maxJ = max(maxJ,j);
+    }
+
+    void print(size_t numElements) 
+    {
+      for (size_t k=0; k<N; k++) 
+        dt_avg[k] /= double(i-numSkip);
+
+      printf("  profile [M/s]:\n");
+      for (size_t j=0; j<maxJ; j++)
+        printf("%20s:  fst = %7.2f M/s, min = %7.2f M/s, avg = %7.2f M/s, max = %7.2f M/s\n",
+               names[j],numElements/dt_fst[j]*1E-6,numElements/dt_max[j]*1E-6,numElements/dt_avg[j]*1E-6,numElements/dt_min[j]*1E-6);
+
+      printf("  profile [ms]:\n");
+      for (size_t j=0; j<maxJ; j++) 
+        printf("%20s:  fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n",
+               names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]);
+    }
+
+    void print() 
+    {
+      printf("  profile:\n");
+
+      for (size_t k=0; k<N; k++) 
+        dt_avg[k] /= double(i-numSkip);
+
+      for (size_t j=0; j<maxJ; j++) {
+        printf("%20s:  fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n",
+               names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]);
+      }
+    }
+
+    double avg() {
+      return dt_avg[maxJ-1]/double(i-numSkip);
+    }
+    
+  private:
+    size_t i;
+    size_t j;
+    size_t maxJ;
+    size_t numSkip;
+    double t0;
+    double tj;
+    const char* names[N];
+    double dt_fst[N];
+    double dt_min[N];
+    double dt_avg[N];
+    double dt_max[N];
+  };
+
+  /*! This function executes some code block multiple times and measured sections of it. 
+      Use the following way:
+
+      profile(1,10,1000,[&](ProfileTimer& timer) {
+        // code
+        timer("A");
+        // code 
+        timer("B");
+      });
+  */
+  template<typename Closure>
+    void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) 
+    {
+      ProfileTimer timer(numSkip);
+      
+      for (size_t i=0; i<numSkip+numIter; i++) 
+      {
+        timer.begin();
+	closure(timer);
+        timer.end();
+      }
+      timer.print(numElements);
+    }
+
+  /*! similar as the function above, but the timer object comes externally */
+  template<typename Closure>
+    void profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) 
+    {
+      timer = ProfileTimer(numSkip);
+      
+      for (size_t i=0; i<numSkip+numIter; i++) 
+      {
+        timer.begin();
+	closure(timer);
+        timer.end();
+      }
+      timer.print(numElements);
+    }
+}
diff --git a/thirdparty/embree/kernels/common/ray.h b/thirdparty/embree/kernels/common/ray.h
new file mode 100644
index 0000000000..7b951cc1e8
--- /dev/null
+++ b/thirdparty/embree/kernels/common/ray.h
@@ -0,0 +1,1517 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "instance_stack.h"
+
+// FIXME: if ray gets seperated into ray* and hit, uload4 needs to be adjusted
+
+namespace embree
+{
+  static const size_t MAX_INTERNAL_STREAM_SIZE = 32;
+
+  /* Ray structure for K rays */
+  template<int K>
+  struct RayK
+  {
+    /* Default construction does nothing */
+    __forceinline RayK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     * has to be smaller than far */
+    __forceinline RayK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+                       const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+                       const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+      : org(org), dir(dir), _tnear(tnear), tfar(tfar), _time(time), mask(mask), id(id), flags(flags) {}
+
+    /* Returns the size of the ray */
+    static __forceinline size_t size() { return K; }
+
+    /* Calculates if this is a valid ray that does not cause issues during traversal */
+    __forceinline vbool<K> valid() const
+    {
+      const vbool<K> vx = (abs(org.x) <= vfloat<K>(FLT_LARGE)) & (abs(dir.x) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vy = (abs(org.y) <= vfloat<K>(FLT_LARGE)) & (abs(dir.y) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vz = (abs(org.z) <= vfloat<K>(FLT_LARGE)) & (abs(dir.z) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vn = abs(tnear()) <= vfloat<K>(inf);
+      const vbool<K> vf = abs(tfar) <= vfloat<K>(inf);
+      return vx & vy & vz & vn & vf;
+    }
+
+    __forceinline void get(RayK<1>* ray) const;
+    __forceinline void get(size_t i, RayK<1>& ray) const;
+    __forceinline void set(const RayK<1>* ray);
+    __forceinline void set(size_t i, const RayK<1>& ray);
+
+    __forceinline void copy(size_t dest, size_t source);
+
+    __forceinline vint<K> octant() const
+    {
+      return select(dir.x < 0.0f, vint<K>(1), vint<K>(zero)) |
+             select(dir.y < 0.0f, vint<K>(2), vint<K>(zero)) |
+             select(dir.z < 0.0f, vint<K>(4), vint<K>(zero));
+    }
+
+    /* Ray data */
+    Vec3vf<K> org;    // ray origin
+    vfloat<K> _tnear; // start of ray segment
+    Vec3vf<K> dir;    // ray direction
+    vfloat<K> _time;  // time of this ray for motion blur
+    vfloat<K> tfar;   // end of ray segment
+    vint<K> mask;     // used to mask out objects during traversal
+    vint<K> id;      
+    vint<K> flags;  
+
+    __forceinline vfloat<K>& tnear() { return _tnear; }
+    __forceinline vfloat<K>& time()  { return _time; }
+    __forceinline const vfloat<K>& tnear() const { return _tnear; }
+    __forceinline const vfloat<K>& time()  const { return _time; }
+  };
+
+  /* Ray+hit structure for K rays */
+  template<int K>
+  struct RayHitK : RayK<K>
+  {
+    using RayK<K>::org;
+    using RayK<K>::_tnear;
+    using RayK<K>::dir;
+    using RayK<K>::_time;
+    using RayK<K>::tfar;
+    using RayK<K>::mask;
+    using RayK<K>::id;
+    using RayK<K>::flags;
+
+    using RayK<K>::tnear;
+    using RayK<K>::time;
+
+    /* Default construction does nothing */
+    __forceinline RayHitK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     * has to be smaller than far */
+    __forceinline RayHitK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+                          const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+                          const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+      : RayK<K>(org, dir, tnear, tfar, time, mask, id, flags),
+        geomID(RTC_INVALID_GEOMETRY_ID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+    }
+
+    __forceinline RayHitK(const RayK<K>& ray)
+      : RayK<K>(ray),
+        geomID(RTC_INVALID_GEOMETRY_ID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+    }
+
+    __forceinline RayHitK<K>& operator =(const RayK<K>& ray)
+    {
+      org    = ray.org;
+      _tnear = ray._tnear;
+      dir    = ray.dir;
+      _time  = ray._time;
+      tfar   = ray.tfar;
+      mask   = ray.mask;
+      id     = ray.id;
+      flags  = ray.flags;
+
+      geomID = RTC_INVALID_GEOMETRY_ID;
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+
+      return *this;
+    }
+
+    /* Calculates if the hit is valid */
+    __forceinline void verifyHit(const vbool<K>& valid0) const
+    {
+      vbool<K> valid = valid0 & geomID != vuint<K>(RTC_INVALID_GEOMETRY_ID);
+      const vbool<K> vt = (abs(tfar) <= vfloat<K>(FLT_LARGE)) | (tfar == vfloat<K>(neg_inf));
+      const vbool<K> vu = (abs(u) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vv = (abs(u) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vnx = abs(Ng.x) <= vfloat<K>(FLT_LARGE);
+      const vbool<K> vny = abs(Ng.y) <= vfloat<K>(FLT_LARGE);
+      const vbool<K> vnz = abs(Ng.z) <= vfloat<K>(FLT_LARGE);
+      if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t");
+      if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u");
+      if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v");
+      if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x");
+      if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y");
+      if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z");
+    }
+
+    __forceinline void get(RayHitK<1>* ray) const;
+    __forceinline void get(size_t i, RayHitK<1>& ray) const;
+    __forceinline void set(const RayHitK<1>* ray);
+    __forceinline void set(size_t i, const RayHitK<1>& ray);
+
+    __forceinline void copy(size_t dest, size_t source);
+
+    /* Hit data */
+    Vec3vf<K> Ng;   // geometry normal
+    vfloat<K> u;    // barycentric u coordinate of hit
+    vfloat<K> v;    // barycentric v coordinate of hit
+    vuint<K> primID; // primitive ID
+    vuint<K> geomID; // geometry ID
+    vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+  };
+
+  /* Specialization for a single ray */
+  template<>
+  struct RayK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline RayK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     *  has to be smaller than far */
+    __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+      : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {}
+
+    /* Calculates if this is a valid ray that does not cause issues during traversal */
+    __forceinline bool valid() const {
+      return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf);
+    }
+
+    /* Ray data */
+    Vec3ff org;  // 3 floats for ray origin, 1 float for tnear
+    //float tnear; // start of ray segment
+    Vec3ff dir;  // 3 floats for ray direction, 1 float for time
+    // float time; 
+    float tfar;  // end of ray segment
+    int mask;    // used to mask out objects during traversal
+    int id;      // ray ID
+    int flags;   // ray flags
+
+    __forceinline float& tnear() { return org.w; };
+    __forceinline const float& tnear() const { return org.w; };
+
+    __forceinline float& time() { return dir.w; };
+    __forceinline const float& time() const { return dir.w; };
+
+  };
+
+  template<>
+  struct RayHitK<1> : RayK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline RayHitK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     *  has to be smaller than far */
+    __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+      : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags),
+        geomID(RTC_INVALID_GEOMETRY_ID) {}
+
+    __forceinline RayHitK(const RayK<1>& ray)
+      : RayK<1>(ray),
+        geomID(RTC_INVALID_GEOMETRY_ID) {}
+
+    __forceinline RayHitK<1>& operator =(const RayK<1>& ray)
+    {
+      org    = ray.org;
+      dir    = ray.dir;
+      tfar   = ray.tfar;
+      mask   = ray.mask;
+      id     = ray.id;
+      flags  = ray.flags;
+
+      geomID = RTC_INVALID_GEOMETRY_ID;
+
+      return *this;
+    }
+
+    /* Calculates if the hit is valid */
+    __forceinline void verifyHit() const
+    {
+      if (geomID == RTC_INVALID_GEOMETRY_ID) return;
+      const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf));
+      const bool vu = (abs(u) <= FLT_LARGE);
+      const bool vv = (abs(u) <= FLT_LARGE);
+      const bool vnx = abs(Ng.x) <= FLT_LARGE;
+      const bool vny = abs(Ng.y) <= FLT_LARGE;
+      const bool vnz = abs(Ng.z) <= FLT_LARGE;
+      if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t");
+      if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u");
+      if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v");
+      if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x");
+      if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y");
+      if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z");
+    }
+
+    /* Hit data */
+    Vec3f Ng;            // not normalized geometry normal
+    float u;             // barycentric u coordinate of hit
+    float v;             // barycentric v coordinate of hit
+    unsigned int primID; // primitive ID
+    unsigned int geomID; // geometry ID
+    unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+  };
+
+  /* Converts ray packet to single rays */
+  template<int K>
+  __forceinline void RayK<K>::get(RayK<1>* ray) const
+  {
+    for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose
+    {
+      ray[i].org.x = org.x[i]; ray[i].org.y = org.y[i]; ray[i].org.z = org.z[i]; ray[i].tnear() = tnear()[i];
+      ray[i].dir.x = dir.x[i]; ray[i].dir.y = dir.y[i]; ray[i].dir.z = dir.z[i]; ray[i].time()  = time()[i];
+      ray[i].tfar  = tfar[i];  ray[i].mask = mask[i]; ray[i].id = id[i]; ray[i].flags = flags[i];
+    }
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::get(RayHitK<1>* ray) const
+  {
+    // FIXME: use SIMD transpose
+    for (size_t i = 0; i < K; i++)
+      get(i, ray[i]);
+  }
+
+  /* Extracts a single ray out of a ray packet*/
+  template<int K>
+  __forceinline void RayK<K>::get(size_t i, RayK<1>& ray) const
+  {
+    ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; 
+    ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.time()  = time()[i];  
+    ray.tfar  = tfar[i]; ray.mask = mask[i];  ray.id = id[i]; ray.flags = flags[i];
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::get(size_t i, RayHitK<1>& ray) const
+  {
+    ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i];
+    ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.tfar  = tfar[i]; ray.time()  = time()[i]; 
+    ray.mask = mask[i];  ray.id = id[i]; ray.flags = flags[i];
+    ray.Ng.x = Ng.x[i]; ray.Ng.y = Ng.y[i]; ray.Ng.z = Ng.z[i];
+    ray.u = u[i]; ray.v = v[i];
+    ray.primID = primID[i]; ray.geomID = geomID[i]; 
+
+    instance_id_stack::copy_VU<K>(instID, ray.instID, i);
+  }
+
+  /* Converts single rays to ray packet */
+  template<int K>
+  __forceinline void RayK<K>::set(const RayK<1>* ray)
+  {
+    // FIXME: use SIMD transpose
+    for (size_t i = 0; i < K; i++)
+      set(i, ray[i]);
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::set(const RayHitK<1>* ray)
+  {
+    // FIXME: use SIMD transpose
+    for (size_t i = 0; i < K; i++)
+      set(i, ray[i]);
+  }
+
+  /* inserts a single ray into a ray packet element */
+  template<int K>
+  __forceinline void RayK<K>::set(size_t i, const RayK<1>& ray)
+  {
+    org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear();
+    dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time();
+    tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags;
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::set(size_t i, const RayHitK<1>& ray)
+  {
+    org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear();
+    dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time();
+    tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags;
+    Ng.x[i] = ray.Ng.x; Ng.y[i] = ray.Ng.y; Ng.z[i] = ray.Ng.z;
+    u[i] = ray.u; v[i] = ray.v;
+    primID[i] = ray.primID; geomID[i] = ray.geomID;
+
+    instance_id_stack::copy_UV<K>(ray.instID, instID, i);
+  }
+
+  /* copies a ray packet element into another element*/
+  template<int K>
+  __forceinline void RayK<K>::copy(size_t dest, size_t source)
+  {
+    org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source];
+    dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; 
+    tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; 
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::copy(size_t dest, size_t source)
+  {
+    org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source];
+    dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; 
+    tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source];
+    Ng.x[dest] = Ng.x[source]; Ng.y[dest] = Ng.y[source]; Ng.z[dest] = Ng.z[source];
+    u[dest] = u[source]; v[dest] = v[source];
+    primID[dest] = primID[source]; geomID[dest] = geomID[source];  
+
+    instance_id_stack::copy_VV<K>(instID, instID, source, dest);
+  }
+
+  /* Shortcuts */
+  typedef RayK<1>  Ray;
+  typedef RayK<4>  Ray4;
+  typedef RayK<8>  Ray8;
+  typedef RayK<16> Ray16;
+  struct RayN;
+
+  typedef RayHitK<1>  RayHit;
+  typedef RayHitK<4>  RayHit4;
+  typedef RayHitK<8>  RayHit8;
+  typedef RayHitK<16> RayHit16;
+  struct RayHitN;
+
+  template<int K, bool intersect>
+  struct RayTypeHelper;
+
+  template<int K>
+  struct RayTypeHelper<K, true>
+  {
+    typedef RayHitK<K> Ty;
+  };
+
+  template<int K>
+  struct RayTypeHelper<K, false>
+  {
+    typedef RayK<K> Ty;
+  };
+
+  template<bool intersect>
+  using RayType = typename RayTypeHelper<1, intersect>::Ty;
+
+  template<int K, bool intersect>
+  using RayTypeK = typename RayTypeHelper<K, intersect>::Ty;
+
+  /* Outputs ray to stream */
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const RayK<K>& ray)
+  {
+    return cout << "{ " << embree_endl
+                << "  org = " << ray.org << embree_endl
+                << "  dir = " << ray.dir << embree_endl
+                << "  near = " << ray.tnear() << embree_endl
+                << "  far = " << ray.tfar << embree_endl
+                << "  time = " << ray.time() << embree_endl
+                << "  mask = " << ray.mask << embree_endl
+                << "  id = " << ray.id << embree_endl
+                << "  flags = " << ray.flags << embree_endl
+                << "}";
+  }
+
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK<K>& ray)
+  {
+    cout << "{ " << embree_endl
+         << "  org = " << ray.org << embree_endl
+         << "  dir = " << ray.dir << embree_endl
+         << "  near = " << ray.tnear() << embree_endl
+         << "  far = " << ray.tfar << embree_endl
+         << "  time = " << ray.time() << embree_endl
+         << "  mask = " << ray.mask << embree_endl
+         << "  id = " << ray.id << embree_endl
+         << "  flags = " << ray.flags << embree_endl
+         << "  Ng = " << ray.Ng
+         << "  u = " << ray.u <<  embree_endl
+         << "  v = " << ray.v << embree_endl
+         << "  primID = " << ray.primID <<  embree_endl
+         << "  geomID = " << ray.geomID << embree_endl
+         << "  instID =";
+    for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    {
+      cout << " " << ray.instID[l];
+    }
+    cout << embree_endl;
+    return cout << "}";
+  }
+
+  struct RayStreamSOA
+  {
+    __forceinline RayStreamSOA(void* rays, size_t N)
+      : ptr((char*)rays), N(N) {}
+
+    /* ray data access functions */
+    __forceinline float* org_x(size_t offset = 0) { return (float*)&ptr[0*4*N+offset]; }  // x coordinate of ray origin
+    __forceinline float* org_y(size_t offset = 0) { return (float*)&ptr[1*4*N+offset]; }  // y coordinate of ray origin
+    __forceinline float* org_z(size_t offset = 0) { return (float*)&ptr[2*4*N+offset]; }; // z coordinate of ray origin
+    __forceinline float* tnear(size_t offset = 0) { return (float*)&ptr[3*4*N+offset]; }; // start of ray segment
+
+    __forceinline float* dir_x(size_t offset = 0) { return (float*)&ptr[4*4*N+offset]; }; // x coordinate of ray direction
+    __forceinline float* dir_y(size_t offset = 0) { return (float*)&ptr[5*4*N+offset]; }; // y coordinate of ray direction
+    __forceinline float* dir_z(size_t offset = 0) { return (float*)&ptr[6*4*N+offset]; }; // z coordinate of ray direction
+    __forceinline float* time (size_t offset = 0) { return (float*)&ptr[7*4*N+offset]; }; // time of this ray for motion blur
+
+    __forceinline float* tfar (size_t offset = 0) { return (float*)&ptr[8*4*N+offset]; }; // end of ray segment (set to hit distance)
+    __forceinline int*   mask (size_t offset = 0) { return (int*)&ptr[9*4*N+offset];   }; // used to mask out objects during traversal (optional)
+    __forceinline int*   id   (size_t offset = 0) { return (int*)&ptr[10*4*N+offset];  }; // id
+    __forceinline int*   flags(size_t offset = 0) { return (int*)&ptr[11*4*N+offset];  }; // flags
+
+    /* hit data access functions */
+    __forceinline float* Ng_x(size_t offset = 0) { return (float*)&ptr[12*4*N+offset]; }; // x coordinate of geometry normal
+    __forceinline float* Ng_y(size_t offset = 0) { return (float*)&ptr[13*4*N+offset]; }; // y coordinate of geometry normal
+    __forceinline float* Ng_z(size_t offset = 0) { return (float*)&ptr[14*4*N+offset]; }; // z coordinate of geometry normal
+
+    __forceinline float* u(size_t offset = 0) { return (float*)&ptr[15*4*N+offset]; };    // barycentric u coordinate of hit
+    __forceinline float* v(size_t offset = 0) { return (float*)&ptr[16*4*N+offset]; };    // barycentric v coordinate of hit
+
+    __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)&ptr[17*4*N+offset]; };   // primitive ID
+    __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; };   // geometry ID
+    __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; };   // instance ID
+
+    __forceinline Ray getRayByOffset(size_t offset)
+    {
+      Ray ray;
+      ray.org.x   = org_x(offset)[0];
+      ray.org.y   = org_y(offset)[0];
+      ray.org.z   = org_z(offset)[0];
+      ray.tnear() = tnear(offset)[0];
+      ray.dir.x   = dir_x(offset)[0];
+      ray.dir.y   = dir_y(offset)[0];
+      ray.dir.z   = dir_z(offset)[0];
+      ray.time()  = time(offset)[0];
+      ray.tfar    = tfar(offset)[0];
+      ray.mask    = mask(offset)[0];
+      ray.id      = id(offset)[0];
+      ray.flags   = flags(offset)[0];
+      return ray;
+    }
+
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(size_t offset)
+    {
+      RayK<K> ray;
+      ray.org.x  = vfloat<K>::loadu(org_x(offset));
+      ray.org.y  = vfloat<K>::loadu(org_y(offset));
+      ray.org.z  = vfloat<K>::loadu(org_z(offset));
+      ray.tnear  = vfloat<K>::loadu(tnear(offset));
+      ray.dir.x  = vfloat<K>::loadu(dir_x(offset));
+      ray.dir.y  = vfloat<K>::loadu(dir_y(offset));
+      ray.dir.z  = vfloat<K>::loadu(dir_z(offset));
+      ray.time   = vfloat<K>::loadu(time(offset));
+      ray.tfar   = vfloat<K>::loadu(tfar(offset));
+      ray.mask   = vint<K>::loadu(mask(offset));
+      ray.id     = vint<K>::loadu(id(offset));
+      ray.flags  = vint<K>::loadu(flags(offset));
+      return ray;
+    }
+
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+    {
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(valid, org_x(offset));
+      ray.org.y   = vfloat<K>::loadu(valid, org_y(offset));
+      ray.org.z   = vfloat<K>::loadu(valid, org_z(offset));
+      ray.tnear() = vfloat<K>::loadu(valid, tnear(offset));
+      ray.dir.x   = vfloat<K>::loadu(valid, dir_x(offset));
+      ray.dir.y   = vfloat<K>::loadu(valid, dir_y(offset));
+      ray.dir.z   = vfloat<K>::loadu(valid, dir_z(offset));
+      ray.time()  = vfloat<K>::loadu(valid, time(offset));
+      ray.tfar  = vfloat<K>::loadu(valid, tfar(offset));
+
+#if !defined(__AVX__)
+      /* SSE: some ray members must be loaded with scalar instructions to ensure that we don't cause memory faults,
+         because the SSE masked loads always access the entire vector */
+      if (unlikely(!all(valid)))
+      {
+        ray.mask  = zero;
+        ray.id    = zero;
+        ray.flags = zero;
+
+        for (size_t k = 0; k < K; k++)
+        {
+          if (likely(valid[k]))
+          {
+            ray.mask[k]  = mask(offset)[k];
+            ray.id[k]    = id(offset)[k];
+            ray.flags[k] = flags(offset)[k];
+          }
+        }
+      }
+      else
+#endif
+      {
+        ray.mask  = vint<K>::loadu(valid, mask(offset));
+        ray.id    = vint<K>::loadu(valid, id(offset));
+        ray.flags = vint<K>::loadu(valid, flags(offset));
+      }
+
+      return ray;
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+    {
+      /* 
+       * valid_i: stores which of the input rays exist (do not access nonexistent rays!)
+       * valid:   stores which of the rays actually hit something.
+       */
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+        vfloat<K>::storeu(valid, tfar(offset), ray.tfar);
+        vfloat<K>::storeu(valid, Ng_x(offset), ray.Ng.x);
+        vfloat<K>::storeu(valid, Ng_y(offset), ray.Ng.y);
+        vfloat<K>::storeu(valid, Ng_z(offset), ray.Ng.z);
+        vfloat<K>::storeu(valid, u(offset), ray.u);
+        vfloat<K>::storeu(valid, v(offset), ray.v);
+
+#if !defined(__AVX__)
+        /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults,
+           because the SSE masked stores always access the entire vector */
+        if (unlikely(!all(valid_i)))
+        {
+          for (size_t k = 0; k < K; k++)
+          {
+            if (likely(valid[k]))
+            {
+              primID(offset)[k] = ray.primID[k];
+              geomID(offset)[k] = ray.geomID[k];
+
+              instID(0, offset)[k] = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+              for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+                instID(l, offset)[k] = ray.instID[l][k];
+#endif
+            }
+          }
+        }
+        else
+#endif
+        {
+          vuint<K>::storeu(valid, primID(offset), ray.primID);
+          vuint<K>::storeu(valid, geomID(offset), ray.geomID);
+
+          vuint<K>::storeu(valid, instID(0, offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+            vuint<K>::storeu(valid, instID(l, offset), ray.instID[l]);
+#endif
+        }
+      }
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+        vfloat<K>::storeu(valid, tfar(offset), ray.tfar);
+    }
+
+    __forceinline size_t getOctantByOffset(size_t offset)
+    {
+      const float dx = dir_x(offset)[0];
+      const float dy = dir_y(offset)[0];
+      const float dz = dir_z(offset)[0];
+      const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0);
+      return octantID;
+    }
+
+    __forceinline bool isValidByOffset(size_t offset)
+    {
+      const float nnear = tnear(offset)[0];
+      const float ffar  = tfar(offset)[0];
+      return nnear <= ffar;
+    }
+
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      RayK<K> ray;
+
+#if defined(__AVX2__)
+      ray.org.x   = vfloat<K>::template gather<1>(valid, org_x(), offset);
+      ray.org.y   = vfloat<K>::template gather<1>(valid, org_y(), offset);
+      ray.org.z   = vfloat<K>::template gather<1>(valid, org_z(), offset);
+      ray.tnear() = vfloat<K>::template gather<1>(valid, tnear(), offset);
+      ray.dir.x   = vfloat<K>::template gather<1>(valid, dir_x(), offset);
+      ray.dir.y   = vfloat<K>::template gather<1>(valid, dir_y(), offset);
+      ray.dir.z   = vfloat<K>::template gather<1>(valid, dir_z(), offset);
+      ray.time()  = vfloat<K>::template gather<1>(valid, time(), offset);
+      ray.tfar    = vfloat<K>::template gather<1>(valid, tfar(), offset);
+      ray.mask    = vint<K>::template gather<1>(valid, mask(), offset);
+      ray.id      = vint<K>::template gather<1>(valid, id(), offset);
+      ray.flags   = vint<K>::template gather<1>(valid, flags(), offset);
+#else
+      ray.org     = zero;
+      ray.tnear() = zero;
+      ray.dir     = zero;
+      ray.time()  = zero;
+      ray.tfar    = zero;
+      ray.mask    = zero;
+      ray.id      = zero;
+      ray.flags   = zero;
+
+      for (size_t k = 0; k < K; k++)
+      {
+        if (likely(valid[k]))
+        {
+          const size_t ofs = offset[k];
+
+          ray.org.x[k]   = *org_x(ofs);
+          ray.org.y[k]   = *org_y(ofs);
+          ray.org.z[k]   = *org_z(ofs);
+          ray.tnear()[k] = *tnear(ofs);
+          ray.dir.x[k]   = *dir_x(ofs);
+          ray.dir.y[k]   = *dir_y(ofs);
+          ray.dir.z[k]   = *dir_z(ofs);
+          ray.time()[k]  = *time(ofs);
+          ray.tfar[k]    = *tfar(ofs);
+          ray.mask[k]    = *mask(ofs);
+          ray.id[k]      = *id(ofs);
+          ray.flags[k]   = *flags(ofs);
+        }
+      }
+#endif
+
+      return ray;
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar);
+        vfloat<K>::template scatter<1>(valid, Ng_x(), offset, ray.Ng.x);
+        vfloat<K>::template scatter<1>(valid, Ng_y(), offset, ray.Ng.y);
+        vfloat<K>::template scatter<1>(valid, Ng_z(), offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, u(), offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, v(), offset, ray.v);
+        vuint<K>::template scatter<1>(valid, primID(), offset, ray.primID);
+        vuint<K>::template scatter<1>(valid, geomID(), offset, ray.geomID);
+
+        vuint<K>::template scatter<1>(valid, instID(0), offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+          vuint<K>::template scatter<1>(valid, instID(l), offset, ray.instID[l]);
+#endif
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          const size_t ofs = offset[k];
+
+          *tfar(ofs) = ray.tfar[k];
+
+          *Ng_x(ofs)   = ray.Ng.x[k];
+          *Ng_y(ofs)   = ray.Ng.y[k];
+          *Ng_z(ofs)   = ray.Ng.z[k];
+          *u(ofs)      = ray.u[k];
+          *v(ofs)      = ray.v[k];
+          *primID(ofs) = ray.primID[k];
+          *geomID(ofs) = ray.geomID[k];
+
+          *instID(0, ofs) = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+            *instID(l, ofs) = ray.instID[l][k];
+#endif
+        }
+#endif
+      }
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          const size_t ofs = offset[k];
+
+          *tfar(ofs) = ray.tfar[k];
+        }
+#endif
+      }
+    }
+
+    char* __restrict__ ptr;
+    size_t N;
+  };
+
+  template<size_t MAX_K>
+  struct StackRayStreamSOA : public RayStreamSOA
+  {
+    __forceinline StackRayStreamSOA(size_t K)
+      : RayStreamSOA(data, K) { assert(K <= MAX_K); }
+
+    char data[MAX_K / 4 * sizeof(RayHit4)];
+  };
+
+
+  struct RayStreamSOP
+  {
+    template<class T>
+    __forceinline void init(T& t)
+    {
+      org_x  = (float*)&t.org.x;
+      org_y  = (float*)&t.org.y;
+      org_z  = (float*)&t.org.z;
+      tnear  = (float*)&t.tnear;
+      dir_x  = (float*)&t.dir.x;
+      dir_y  = (float*)&t.dir.y;
+      dir_z  = (float*)&t.dir.z;
+      time   = (float*)&t.time;
+      tfar   = (float*)&t.tfar;
+      mask   = (unsigned int*)&t.mask;
+      id     = (unsigned int*)&t.id;
+      flags  = (unsigned int*)&t.flags;
+
+      Ng_x   = (float*)&t.Ng.x;
+      Ng_y   = (float*)&t.Ng.y;
+      Ng_z   = (float*)&t.Ng.z;
+      u      = (float*)&t.u;
+      v      = (float*)&t.v;
+      primID = (unsigned int*)&t.primID;
+      geomID = (unsigned int*)&t.geomID;
+
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = (unsigned int*)&t.instID[l];
+    }
+
+    __forceinline Ray getRayByOffset(size_t offset)
+    {
+      Ray ray;
+      ray.org.x   = *(float* __restrict__)((char*)org_x + offset);
+      ray.org.y   = *(float* __restrict__)((char*)org_y + offset);
+      ray.org.z   = *(float* __restrict__)((char*)org_z + offset);
+      ray.dir.x   = *(float* __restrict__)((char*)dir_x + offset);
+      ray.dir.y   = *(float* __restrict__)((char*)dir_y + offset);
+      ray.dir.z   = *(float* __restrict__)((char*)dir_z + offset);
+      ray.tfar  = *(float* __restrict__)((char*)tfar + offset);
+      ray.tnear() = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f;
+      ray.time()  = time ? *(float* __restrict__)((char*)time + offset) : 0.0f;
+      ray.mask    = mask ? *(unsigned int* __restrict__)((char*)mask + offset) : -1;
+      ray.id      = id ? *(unsigned int* __restrict__)((char*)id + offset) : -1;
+      ray.flags   = flags ? *(unsigned int* __restrict__)((char*)flags + offset) : -1;
+      return ray;
+    }
+
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+    {
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_x + offset));
+      ray.org.y   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_y + offset));
+      ray.org.z   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_z + offset));
+      ray.dir.x   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset));
+      ray.dir.y   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset));
+      ray.dir.z   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset));
+      ray.tfar    = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+      ray.tnear() = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f;
+      ray.time()  = time ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)time + offset)) : 0.0f;
+      ray.mask    = mask ? vint<K>::loadu(valid, (const void* __restrict__)((char*)mask + offset)) : -1;
+      ray.id      = id ? vint<K>::loadu(valid, (const void* __restrict__)((char*)id + offset)) : -1;
+      ray.flags   = flags ? vint<K>::loadu(valid, (const void* __restrict__)((char*)flags + offset)) : -1;
+      return ray;
+    }
+
+    template<int K>
+    __forceinline Vec3vf<K> getDirByOffset(const vbool<K>& valid, size_t offset)
+    {
+      Vec3vf<K> dir;
+      dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset));
+      dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset));
+      dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset));
+      return dir;
+    }
+
+    __forceinline void setHitByOffset(size_t offset, const RayHit& ray)
+    {
+      if (ray.geomID != RTC_INVALID_GEOMETRY_ID)
+      {
+        *(float* __restrict__)((char*)tfar + offset) = ray.tfar;
+
+        if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x;
+        if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y;
+        if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z;
+        *(float* __restrict__)((char*)u + offset) = ray.u;
+        *(float* __restrict__)((char*)v + offset) = ray.v;
+        *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID;
+        *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID;
+
+        if (likely(instID[0])) {
+          *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l)
+            *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l];
+#endif
+        }
+      }
+    }
+
+    __forceinline void setHitByOffset(size_t offset, const Ray& ray)
+    {
+      *(float* __restrict__)((char*)tfar + offset) = ray.tfar;
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+
+        if (likely(Ng_x)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x);
+        if (likely(Ng_y)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y);
+        if (likely(Ng_z)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z);
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u);
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v);
+        vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID);
+        vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID);
+
+        if (likely(instID[0])) {
+          vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+            vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]);
+#endif
+        }
+      }
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+    }
+
+    __forceinline size_t getOctantByOffset(size_t offset)
+    {
+      const float dx = *(float* __restrict__)((char*)dir_x + offset);
+      const float dy = *(float* __restrict__)((char*)dir_y + offset);
+      const float dz = *(float* __restrict__)((char*)dir_z + offset);
+      const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0);
+      return octantID;
+    }
+
+    __forceinline bool isValidByOffset(size_t offset)
+    {
+      const float nnear = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f;
+      const float ffar  = *(float* __restrict__)((char*)tfar + offset);
+      return nnear <= ffar;
+    }
+
+    template<int K>
+    __forceinline vbool<K> isValidByOffset(const vbool<K>& valid, size_t offset)
+    {
+      const vfloat<K> nnear = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f;
+      const vfloat<K> ffar  = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+      return nnear <= ffar;
+    }
+
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      RayK<K> ray;
+
+#if defined(__AVX2__)
+      ray.org.x   = vfloat<K>::template gather<1>(valid, org_x, offset);
+      ray.org.y   = vfloat<K>::template gather<1>(valid, org_y, offset);
+      ray.org.z   = vfloat<K>::template gather<1>(valid, org_z, offset);
+      ray.dir.x   = vfloat<K>::template gather<1>(valid, dir_x, offset);
+      ray.dir.y   = vfloat<K>::template gather<1>(valid, dir_y, offset);
+      ray.dir.z   = vfloat<K>::template gather<1>(valid, dir_z, offset);
+      ray.tfar    = vfloat<K>::template gather<1>(valid, tfar, offset);
+      ray.tnear() = tnear ? vfloat<K>::template gather<1>(valid, tnear, offset) : vfloat<K>(zero);
+      ray.time()  = time ? vfloat<K>::template gather<1>(valid, time, offset) : vfloat<K>(zero);
+      ray.mask    = mask ? vint<K>::template gather<1>(valid, (int*)mask, offset) : vint<K>(-1);
+      ray.id      = id ? vint<K>::template gather<1>(valid, (int*)id, offset) : vint<K>(-1);
+      ray.flags   = flags ? vint<K>::template gather<1>(valid, (int*)flags, offset) : vint<K>(-1);
+#else
+      ray.org     = zero;
+      ray.tnear() = zero;
+      ray.dir     = zero;
+      ray.tfar    = zero;
+      ray.time()  = zero;
+      ray.mask    = zero;
+      ray.id      = zero;
+      ray.flags   = zero;
+
+      for (size_t k = 0; k < K; k++)
+      {
+        if (likely(valid[k]))
+        {
+          const size_t ofs = offset[k];
+
+          ray.org.x[k]   = *(float* __restrict__)((char*)org_x + ofs);
+          ray.org.y[k]   = *(float* __restrict__)((char*)org_y + ofs);
+          ray.org.z[k]   = *(float* __restrict__)((char*)org_z + ofs);
+          ray.dir.x[k]   = *(float* __restrict__)((char*)dir_x + ofs);
+          ray.dir.y[k]   = *(float* __restrict__)((char*)dir_y + ofs);
+          ray.dir.z[k]   = *(float* __restrict__)((char*)dir_z + ofs);
+          ray.tfar[k]  = *(float* __restrict__)((char*)tfar + ofs);
+          ray.tnear()[k] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f;
+          ray.time()[k]  = time ? *(float* __restrict__)((char*)time + ofs) : 0.0f;
+          ray.mask[k]    = mask ? *(int* __restrict__)((char*)mask + ofs) : -1;
+          ray.id[k]      = id ? *(int* __restrict__)((char*)id + ofs) : -1;
+          ray.flags[k]   = flags ? *(int* __restrict__)((char*)flags + ofs) : -1;
+        }
+      }
+#endif
+
+      return ray;
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar);
+
+        if (likely(Ng_x)) vfloat<K>::template scatter<1>(valid, Ng_x, offset, ray.Ng.x);
+        if (likely(Ng_y)) vfloat<K>::template scatter<1>(valid, Ng_y, offset, ray.Ng.y);
+        if (likely(Ng_z)) vfloat<K>::template scatter<1>(valid, Ng_z, offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, u, offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, v, offset, ray.v);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)geomID, offset, ray.geomID);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)primID, offset, ray.primID);
+
+        if (likely(instID[0])) {
+          vuint<K>::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+            vuint<K>::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]);
+#endif
+        }
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          const size_t ofs = offset[k];
+
+          *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k];
+
+          if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + ofs) = ray.Ng.x[k];
+          if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + ofs) = ray.Ng.y[k];
+          if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + ofs) = ray.Ng.z[k];
+          *(float* __restrict__)((char*)u + ofs) = ray.u[k];
+          *(float* __restrict__)((char*)v + ofs) = ray.v[k];
+          *(unsigned int* __restrict__)((char*)primID + ofs) = ray.primID[k];
+          *(unsigned int* __restrict__)((char*)geomID + ofs) = ray.geomID[k];
+
+          if (likely(instID[0])) {
+            *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+            for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+              *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k];
+#endif
+          }
+        }
+#endif
+      }
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          const size_t ofs = offset[k];
+
+          *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k];
+        }
+#endif
+      }
+    }
+
+    /* ray data */
+    float* __restrict__ org_x; // x coordinate of ray origin
+    float* __restrict__ org_y; // y coordinate of ray origin
+    float* __restrict__ org_z; // z coordinate of ray origin
+    float* __restrict__ tnear; // start of ray segment (optional)
+
+    float* __restrict__ dir_x; // x coordinate of ray direction
+    float* __restrict__ dir_y; // y coordinate of ray direction
+    float* __restrict__ dir_z; // z coordinate of ray direction
+    float* __restrict__ time;         // time of this ray for motion blur (optional)
+
+    float* __restrict__ tfar;  // end of ray segment (set to hit distance)
+    unsigned int* __restrict__ mask;  // used to mask out objects during traversal (optional)
+    unsigned int* __restrict__ id;    // ray ID
+    unsigned int* __restrict__ flags; // ray flags
+
+    /* hit data */
+    float* __restrict__ Ng_x; // x coordinate of geometry normal (optional)
+    float* __restrict__ Ng_y; // y coordinate of geometry normal (optional)
+    float* __restrict__ Ng_z; // z coordinate of geometry normal (optional)
+
+    float* __restrict__ u;    // barycentric u coordinate of hit
+    float* __restrict__ v;    // barycentric v coordinate of hit
+
+    unsigned int* __restrict__ primID; // primitive ID
+    unsigned int* __restrict__ geomID; // geometry ID
+    unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional)
+  };
+
+
+  struct RayStreamAOS
+  {
+    __forceinline RayStreamAOS(void* rays)
+      : ptr((Ray*)rays) {}
+
+    __forceinline Ray& getRayByOffset(size_t offset)
+    {
+      return *(Ray*)((char*)ptr + offset);
+    }
+
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vint<K>& offset);
+
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      const vint<K> valid_offset = select(valid, offset, vintx(zero));
+      return getRayByOffset<K>(valid_offset);
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID);
+
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+          vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]);
+#endif
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]);
+          ray_k->tfar   = ray.tfar[k];
+          ray_k->Ng.x   = ray.Ng.x[k];
+          ray_k->Ng.y   = ray.Ng.y[k];
+          ray_k->Ng.z   = ray.Ng.z[k];
+          ray_k->u      = ray.u[k];
+          ray_k->v      = ray.v[k];
+          ray_k->primID = ray.primID[k];
+          ray_k->geomID = ray.geomID[k];
+
+          instance_id_stack::copy_VU<K>(ray.instID, ray_k->instID, k);
+        }
+#endif
+      }
+    }
+
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]);
+          ray_k->tfar = ray.tfar[k];
+        }
+#endif
+      }
+    }
+
+    Ray* __restrict__ ptr;
+  };
+
+  template<>
+  __forceinline Ray4 RayStreamAOS::getRayByOffset<4>(const vint4& offset)
+  {
+    Ray4 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear */
+    const vfloat4 a0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->org);
+    const vfloat4 a1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->org);
+    const vfloat4 a2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->org);
+    const vfloat4 a3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->org);
+
+    transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear());
+
+    /* load and transpose: dir.x, dir.y, dir.z, time */
+    const vfloat4 b0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->dir);
+    const vfloat4 b1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->dir);
+    const vfloat4 b2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->dir);
+    const vfloat4 b3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->dir);
+
+    transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar);
+    const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar);
+    const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar);
+    const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar);
+
+    vfloat4 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+
+#if defined(__AVX__)
+  template<>
+  __forceinline Ray8 RayStreamAOS::getRayByOffset<8>(const vint8& offset)
+  {
+    Ray8 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[0]))->org);
+    const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[1]))->org);
+    const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[2]))->org);
+    const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[3]))->org);
+    const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[4]))->org);
+    const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[5]))->org);
+    const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[6]))->org);
+    const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[7]))->org);
+
+    transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar);
+    const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar);
+    const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar);
+    const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar);
+    const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[4]))->tfar);
+    const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[5]))->tfar);
+    const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[6]))->tfar);
+    const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[7]))->tfar);
+
+    vfloat8 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+#endif
+
+#if defined(__AVX512F__)
+  template<>
+  __forceinline Ray16 RayStreamAOS::getRayByOffset<16>(const vint16& offset)
+  {
+    Ray16 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 ab0  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 0]))->org);
+    const vfloat8 ab1  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 1]))->org);
+    const vfloat8 ab2  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 2]))->org);
+    const vfloat8 ab3  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 3]))->org);
+    const vfloat8 ab4  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 4]))->org);
+    const vfloat8 ab5  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 5]))->org);
+    const vfloat8 ab6  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 6]))->org);
+    const vfloat8 ab7  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 7]))->org);
+    const vfloat8 ab8  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 8]))->org);
+    const vfloat8 ab9  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 9]))->org);
+    const vfloat8 ab10 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[10]))->org);
+    const vfloat8 ab11 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[11]))->org);
+    const vfloat8 ab12 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[12]))->org);
+    const vfloat8 ab13 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[13]))->org);
+    const vfloat8 ab14 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[14]))->org);
+    const vfloat8 ab15 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[15]))->org);
+
+    transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15,
+              ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 0]))->tfar);
+    const vfloat4 c1  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 1]))->tfar);
+    const vfloat4 c2  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 2]))->tfar);
+    const vfloat4 c3  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 3]))->tfar);
+    const vfloat4 c4  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 4]))->tfar);
+    const vfloat4 c5  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 5]))->tfar);
+    const vfloat4 c6  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 6]))->tfar);
+    const vfloat4 c7  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 7]))->tfar);
+    const vfloat4 c8  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 8]))->tfar);
+    const vfloat4 c9  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 9]))->tfar);
+    const vfloat4 c10 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[10]))->tfar);
+    const vfloat4 c11 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[11]))->tfar);
+    const vfloat4 c12 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[12]))->tfar);
+    const vfloat4 c13 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[13]))->tfar);
+    const vfloat4 c14 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[14]))->tfar);
+    const vfloat4 c15 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[15]))->tfar);
+
+    vfloat16 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,
+              ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+#endif
+
+
+  struct RayStreamAOP
+  {
+    __forceinline RayStreamAOP(void* rays)
+      : ptr((Ray**)rays) {}
+
+    __forceinline Ray& getRayByIndex(size_t index)
+    {
+      return *ptr[index];
+    }
+
+    template<int K>
+    __forceinline RayK<K> getRayByIndex(const vint<K>& index);
+
+    template<int K>
+    __forceinline RayK<K> getRayByIndex(const vbool<K>& valid, const vint<K>& index)
+    {
+      const vint<K> valid_index = select(valid, index, vintx(zero));
+      return getRayByIndex<K>(valid_index);
+    }
+
+    template<int K>
+    __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]];
+
+          ray_k->tfar = ray.tfar[k];
+          ray_k->Ng.x   = ray.Ng.x[k];
+          ray_k->Ng.y   = ray.Ng.y[k];
+          ray_k->Ng.z   = ray.Ng.z[k];
+          ray_k->u      = ray.u[k];
+          ray_k->v      = ray.v[k];
+          ray_k->primID = ray.primID[k];
+          ray_k->geomID = ray.geomID[k];
+          instance_id_stack::copy_VU<K>(ray.instID, ray_k->instID, k);
+        }
+      }
+    }
+
+    template<int K>
+    __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          Ray* __restrict__ ray_k = ptr[index[k]];
+
+          ray_k->tfar = ray.tfar[k];
+        }
+      }
+    }
+
+    Ray** __restrict__ ptr;
+  };
+
+  template<>
+  __forceinline Ray4 RayStreamAOP::getRayByIndex<4>(const vint4& index)
+  {
+    Ray4 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear */
+    const vfloat4 a0 = vfloat4::loadu(&ptr[index[0]]->org);
+    const vfloat4 a1 = vfloat4::loadu(&ptr[index[1]]->org);
+    const vfloat4 a2 = vfloat4::loadu(&ptr[index[2]]->org);
+    const vfloat4 a3 = vfloat4::loadu(&ptr[index[3]]->org);
+
+    transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear());
+
+    /* load and transpose: dir.x, dir.y, dir.z, time */
+    const vfloat4 b0 = vfloat4::loadu(&ptr[index[0]]->dir);
+    const vfloat4 b1 = vfloat4::loadu(&ptr[index[1]]->dir);
+    const vfloat4 b2 = vfloat4::loadu(&ptr[index[2]]->dir);
+    const vfloat4 b3 = vfloat4::loadu(&ptr[index[3]]->dir);
+
+    transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar);
+
+    vfloat4 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+
+#if defined(__AVX__)
+  template<>
+  __forceinline Ray8 RayStreamAOP::getRayByIndex<8>(const vint8& index)
+  {
+    Ray8 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org);
+    const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org);
+    const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org);
+    const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org);
+    const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org);
+    const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org);
+    const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org);
+    const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org);
+
+    transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar);
+    const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar);
+    const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar);
+    const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar);
+    const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar);
+
+    vfloat8 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf);
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+#endif
+
+#if defined(__AVX512F__)
+  template<>
+  __forceinline Ray16 RayStreamAOP::getRayByIndex<16>(const vint16& index)
+  {
+    Ray16 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 ab0  = vfloat8::loadu(&ptr[index[0]]->org);
+    const vfloat8 ab1  = vfloat8::loadu(&ptr[index[1]]->org);
+    const vfloat8 ab2  = vfloat8::loadu(&ptr[index[2]]->org);
+    const vfloat8 ab3  = vfloat8::loadu(&ptr[index[3]]->org);
+    const vfloat8 ab4  = vfloat8::loadu(&ptr[index[4]]->org);
+    const vfloat8 ab5  = vfloat8::loadu(&ptr[index[5]]->org);
+    const vfloat8 ab6  = vfloat8::loadu(&ptr[index[6]]->org);
+    const vfloat8 ab7  = vfloat8::loadu(&ptr[index[7]]->org);
+    const vfloat8 ab8  = vfloat8::loadu(&ptr[index[8]]->org);
+    const vfloat8 ab9  = vfloat8::loadu(&ptr[index[9]]->org);
+    const vfloat8 ab10 = vfloat8::loadu(&ptr[index[10]]->org);
+    const vfloat8 ab11 = vfloat8::loadu(&ptr[index[11]]->org);
+    const vfloat8 ab12 = vfloat8::loadu(&ptr[index[12]]->org);
+    const vfloat8 ab13 = vfloat8::loadu(&ptr[index[13]]->org);
+    const vfloat8 ab14 = vfloat8::loadu(&ptr[index[14]]->org);
+    const vfloat8 ab15 = vfloat8::loadu(&ptr[index[15]]->org);
+
+    transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15,
+              ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    const vfloat4 c0  = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 c1  = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 c2  = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 c3  = vfloat4::loadu(&ptr[index[3]]->tfar);
+    const vfloat4 c4  = vfloat4::loadu(&ptr[index[4]]->tfar);
+    const vfloat4 c5  = vfloat4::loadu(&ptr[index[5]]->tfar);
+    const vfloat4 c6  = vfloat4::loadu(&ptr[index[6]]->tfar);
+    const vfloat4 c7  = vfloat4::loadu(&ptr[index[7]]->tfar);
+    const vfloat4 c8  = vfloat4::loadu(&ptr[index[8]]->tfar);
+    const vfloat4 c9  = vfloat4::loadu(&ptr[index[9]]->tfar);
+    const vfloat4 c10 = vfloat4::loadu(&ptr[index[10]]->tfar);
+    const vfloat4 c11 = vfloat4::loadu(&ptr[index[11]]->tfar);
+    const vfloat4 c12 = vfloat4::loadu(&ptr[index[12]]->tfar);
+    const vfloat4 c13 = vfloat4::loadu(&ptr[index[13]]->tfar);
+    const vfloat4 c14 = vfloat4::loadu(&ptr[index[14]]->tfar);
+    const vfloat4 c15 = vfloat4::loadu(&ptr[index[15]]->tfar);
+
+    vfloat16 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,
+              ray.tfar, maskf, idf, flagsf);
+
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+#endif
+}
diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp
new file mode 100644
index 0000000000..94b3819e42
--- /dev/null
+++ b/thirdparty/embree/kernels/common/rtcore.cpp
@@ -0,0 +1,1766 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "../../include/embree3/rtcore_ray.h"
+using namespace embree;
+
+RTC_NAMESPACE_BEGIN;
+
+  /* mutex to make API thread safe */
+  static MutexSys g_mutex;
+
+  RTC_API RTCDevice rtcNewDevice(const char* config)
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewDevice);
+    Lock<MutexSys> lock(g_mutex);
+    Device* device = new Device(config);
+    return (RTCDevice) device->refInc();
+    RTC_CATCH_END(nullptr);
+    return (RTCDevice) nullptr;
+  }
+
+  RTC_API void rtcRetainDevice(RTCDevice hdevice) 
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainDevice);
+    RTC_VERIFY_HANDLE(hdevice);
+    Lock<MutexSys> lock(g_mutex);
+    device->refInc();
+    RTC_CATCH_END(nullptr);
+  }
+  
+  RTC_API void rtcReleaseDevice(RTCDevice hdevice) 
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseDevice);
+    RTC_VERIFY_HANDLE(hdevice);
+    Lock<MutexSys> lock(g_mutex);
+    device->refDec();
+    RTC_CATCH_END(nullptr);
+  }
+  
+  RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetDeviceProperty);
+    RTC_VERIFY_HANDLE(hdevice);
+    Lock<MutexSys> lock(g_mutex);
+    return device->getProperty(prop);
+    RTC_CATCH_END(device);
+    return 0;
+  }
+
+  RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceProperty);
+    const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004;
+    if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings
+    Lock<MutexSys> lock(g_mutex);
+    device->setProperty(prop,val);
+    RTC_CATCH_END(device);
+  }
+
+  RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetDeviceError);
+    if (device == nullptr) return Device::getThreadErrorCode();
+    else                   return device->getDeviceErrorCode();
+    RTC_CATCH_END(device);
+    return RTC_ERROR_UNKNOWN;
+  }
+
+  RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceErrorFunction);
+    RTC_VERIFY_HANDLE(hdevice);
+    device->setErrorFunction(error, userPtr);
+    RTC_CATCH_END(device);
+  }
+
+  RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceMemoryMonitorFunction);
+    device->setMemoryMonitorFunction(memoryMonitor, userPtr);
+    RTC_CATCH_END(device);
+  }
+
+  RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize)
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewBuffer);
+    RTC_VERIFY_HANDLE(hdevice);
+    Buffer* buffer = new Buffer((Device*)hdevice, byteSize);
+    return (RTCBuffer)buffer->refInc();
+    RTC_CATCH_END((Device*)hdevice);
+    return nullptr;
+  }
+
+  RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize)
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewSharedBuffer);
+    RTC_VERIFY_HANDLE(hdevice);
+    Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr);
+    return (RTCBuffer)buffer->refInc();
+    RTC_CATCH_END((Device*)hdevice);
+    return nullptr;
+  }
+
+  RTC_API void* rtcGetBufferData(RTCBuffer hbuffer)
+  {
+    Buffer* buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetBufferData);
+    RTC_VERIFY_HANDLE(hbuffer);
+    return buffer->data();
+    RTC_CATCH_END2(buffer);
+    return nullptr;
+  }
+
+  RTC_API void rtcRetainBuffer(RTCBuffer hbuffer)
+  {
+    Buffer* buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainBuffer);
+    RTC_VERIFY_HANDLE(hbuffer);
+    buffer->refInc();
+    RTC_CATCH_END2(buffer);
+  }
+  
+  RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer)
+  {
+    Buffer* buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseBuffer);
+    RTC_VERIFY_HANDLE(hbuffer);
+    buffer->refDec();
+    RTC_CATCH_END2(buffer);
+  }
+
+  RTC_API RTCScene rtcNewScene (RTCDevice hdevice) 
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewScene);
+    RTC_VERIFY_HANDLE(hdevice);
+    Scene* scene = new Scene((Device*)hdevice);
+    return (RTCScene) scene->refInc();
+    RTC_CATCH_END((Device*)hdevice);
+    return nullptr;
+  }
+
+  RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneDevice);
+    RTC_VERIFY_HANDLE(hscene);
+    return (RTCDevice)scene->device->refInc(); // user will own one additional device reference
+    RTC_CATCH_END2(scene);
+    return (RTCDevice)nullptr;
+  }
+
+  RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneProgressMonitorFunction);
+    RTC_VERIFY_HANDLE(hscene);
+    Lock<MutexSys> lock(g_mutex);
+    scene->setProgressMonitorFunction(progress,ptr);
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneBuildQuality);
+    RTC_VERIFY_HANDLE(hscene);
+    if (quality != RTC_BUILD_QUALITY_LOW &&
+        quality != RTC_BUILD_QUALITY_MEDIUM &&
+        quality != RTC_BUILD_QUALITY_HIGH)
+      // -- GODOT start --
+      // throw std::runtime_error("invalid build quality");
+      abort();
+      // -- GODOT end --
+    scene->setBuildQuality(quality);
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneFlags);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->setSceneFlags(flags);
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneFlags);
+    RTC_VERIFY_HANDLE(hscene);
+    return scene->getSceneFlags();
+    RTC_CATCH_END2(scene);
+    return RTC_SCENE_FLAG_NONE;
+  }
+  
+  RTC_API void rtcCommitScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCommitScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->commit(false);
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcJoinCommitScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcJoinCommitScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->commit(true);
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneBounds);
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    BBox3fa bounds = scene->bounds.bounds();
+    bounds_o->lower_x = bounds.lower.x;
+    bounds_o->lower_y = bounds.lower.y;
+    bounds_o->lower_z = bounds.lower.z;
+    bounds_o->align0  = 0;
+    bounds_o->upper_x = bounds.upper.x;
+    bounds_o->upper_y = bounds.upper.y;
+    bounds_o->upper_z = bounds.upper.z;
+    bounds_o->align1  = 0;
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneBounds);
+    RTC_VERIFY_HANDLE(hscene);
+    if (bounds_o == nullptr)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer");
+    if (scene->isModified())
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    
+    bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x;
+    bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y;
+    bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z;
+    bounds_o->bounds0.align0  = 0;
+    bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x;
+    bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y;
+    bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z;
+    bounds_o->bounds0.align1  = 0;
+    bounds_o->bounds1.lower_x = scene->bounds.bounds1.lower.x;
+    bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y;
+    bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z;
+    bounds_o->bounds1.align0  = 0;
+    bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x;
+    bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y;
+    bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z;
+    bounds_o->bounds1.align1  = 0;
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr)
+  {
+    Scene* scene0 = (Scene*) hscene0;
+    Scene* scene1 = (Scene*) hscene1;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCollide);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene0);
+    RTC_VERIFY_HANDLE(hscene1);
+    if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices");
+    auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+    auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+    if (scene0->numPrimitives() != nUserPrims0 && scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep");
+#endif
+    scene0->intersectors.collide(scene0,scene1,callback,userPtr);
+    RTC_CATCH_END(scene0->device);
+  }
+  
+  inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+  {
+    bool changed = false;
+    if (userContext->instStackSize > 0)
+    {
+      const AffineSpace3fa transform = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]);
+
+      float similarityScale = 0.f;
+      const bool similtude = similarityTransform(transform, &similarityScale);
+      assert((similtude && similarityScale > 0) || (!similtude && similarityScale == 0.f));
+
+      PointQuery query_inst;
+      query_inst.p = xfmPoint(transform, Vec3fa(query->x, query->y, query->z)); 
+      query_inst.radius = query->radius * similarityScale;
+      query_inst.time = query->time;
+      
+      PointQueryContext context_inst(scene, (PointQuery*)query,
+        similtude ? POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB,
+        queryFunc, userContext, similarityScale, userPtr);
+      changed = scene->intersectors.pointQuery((PointQuery*)&query_inst, &context_inst);
+    }
+    else
+    {
+      PointQueryContext context(scene, (PointQuery*)query, 
+        POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr);
+      changed = scene->intersectors.pointQuery((PointQuery*)query, &context);
+    }
+    return changed;
+  }
+
+  RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(userContext);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+    if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes");   
+#endif
+
+    return pointQuery(scene, query, userContext, queryFunc, userPtr);
+    RTC_CATCH_END2_FALSE(scene);
+  }
+  
+  RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery4* query4 = (PointQuery4*)query;
+    PointQuery query1; 
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue;
+      query4->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query4->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+  
+  RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery8);
+    
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery8* query8 = (PointQuery8*)query;
+    PointQuery query1; 
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      query8->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query8->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+
+  RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery16* query16 = (PointQuery16*)query;
+    PointQuery query1; 
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      PointQuery query1; query16->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query16->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+
+  RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    STAT3(normal.travs,1,1,1);
+    IntersectContext context(scene,user_context);
+    scene->intersectors.intersect(*rayhit,&context);
+#if defined(DEBUG)
+    ((RayHit*)rayhit)->verifyHit();
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)rayhit)   & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    Ray4* ray4 = (Ray4*) rayhit;
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; ray4->get(i,ray1);
+      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+      ray4->set(i,ray1);
+    }
+#else
+    scene->intersectors.intersect4(valid,*rayhit,&context);
+#endif
+    
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect8);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");   
+    if (((size_t)rayhit)   & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    Ray8* ray8 = (Ray8*) rayhit;
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; ray8->get(i,ray1);
+      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+      ray8->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector8))
+      scene->intersectors.intersect8(valid,*rayhit,&context);
+    else
+      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context);
+#endif
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");   
+    if (((size_t)rayhit)   & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    Ray16* ray16 = (Ray16*) rayhit;
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; ray16->get(i,ray1);
+      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+      ray16->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector16))
+      scene->intersectors.intersect16(valid,*rayhit,&context);
+    else
+      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context);
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1M);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    /* fast codepath for single rays */
+    if (likely(M == 1)) {
+      if (likely(rayhit->ray.tnear <= rayhit->ray.tfar)) 
+        scene->intersectors.intersect(*rayhit,&context);
+    } 
+
+    /* codepath for streams */
+    else {
+      scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context);   
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1Mp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    /* fast codepath for single rays */
+    if (likely(M == 1)) {
+      if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar)) 
+        scene->intersectors.intersect(*rn[0],&context);
+    } 
+
+    /* codepath for streams */
+    else {
+      scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersectNM);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,N*M,N*M,N*M);
+    IntersectContext context(scene,user_context);
+
+    /* code path for single ray streams */
+    if (likely(N == 1))
+    {
+      /* fast code path for streams of size 1 */
+      if (likely(M == 1)) {
+        if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar))
+          scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context);
+      } 
+      /* normal codepath for single ray streams */
+      else {
+        scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context);
+      }
+    }
+    /* code path for ray packet streams */
+    else {
+      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersectNp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.Ng_x  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.Ng_y  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.Ng_z  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.u     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.v     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,N,N,N);
+    IntersectContext context(scene,user_context);
+    scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context);
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1);
+    STAT3(shadow.travs,1,1,1);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    IntersectContext context(scene,user_context);
+    scene->intersectors.occluded(*ray,&context);
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)ray)   & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    RayHit4* ray4 = (RayHit4*) ray;
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; ray4->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray4->geomID[i] = ray1.geomID; 
+    }
+#else
+    scene->intersectors.occluded4(valid,*ray,&context);
+#endif
+    
+    RTC_CATCH_END2(scene);
+  }
+ 
+  RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded8);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");   
+    if (((size_t)ray)   & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    RayHit8* ray8 = (RayHit8*) ray;
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; ray8->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray8->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector8))
+      scene->intersectors.occluded8(valid,*ray,&context);
+    else
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context);
+#endif
+
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");   
+    if (((size_t)ray)   & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes");   
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    RayHit16* ray16 = (RayHit16*) ray;
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; ray16->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray16->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector16))
+      scene->intersectors.occluded16(valid,*ray,&context);
+    else
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context);
+#endif
+
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1M);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+    /* fast codepath for streams of size 1 */
+    if (likely(M == 1)) {
+      if (likely(ray->tnear <= ray->tfar)) 
+        scene->intersectors.occluded (*ray,&context);
+    } 
+    /* codepath for normal streams */
+    else {
+      scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1Mp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    /* fast codepath for streams of size 1 */
+    if (likely(M == 1)) {
+      if (likely(ray[0]->tnear <= ray[0]->tfar)) 
+        scene->intersectors.occluded (*ray[0],&context);
+    } 
+    /* codepath for normal streams */
+    else {
+      scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccludedNM);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small");
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,N*M,N*N,N*N);
+    IntersectContext context(scene,user_context);
+
+    /* codepath for single rays */
+    if (likely(N == 1))
+    {
+      /* fast path for streams of size 1 */
+      if (likely(M == 1)) {
+        if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar))
+          scene->intersectors.occluded (*(RTCRay*)ray,&context);
+      } 
+      /* codepath for normal ray streams */
+      else {
+        scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context);
+      }
+    }
+    /* code path for ray packet streams */
+    else {
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccludedNp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes");   
+    if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes");   
+    if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes");   
+    if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes");   
+    if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes");   
+    if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes");   
+    if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes");   
+    if (((size_t)ray->tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes");   
+    if (((size_t)ray->time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes");   
+    if (((size_t)ray->mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,N,N,N);
+    IntersectContext context(scene,user_context);
+    scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context);
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcRetainScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->refInc();
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcReleaseScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->refDec();
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    Ref<Scene> scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryInstancedScene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    geometry->setInstancedScene(scene);
+    RTC_CATCH_END2(geometry);
+  }
+
+  AffineSpace3fa loadTransform(RTCFormat format, const float* xfm)
+  {
+    AffineSpace3fa space = one;
+    switch (format)
+    {
+    case RTC_FORMAT_FLOAT3X4_ROW_MAJOR:
+      space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 4], xfm[ 8]),
+                             Vec3fa(xfm[ 1], xfm[ 5], xfm[ 9]),
+                             Vec3fa(xfm[ 2], xfm[ 6], xfm[10]),
+                             Vec3fa(xfm[ 3], xfm[ 7], xfm[11]));
+      break;
+
+    case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR:
+      space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]),
+                             Vec3fa(xfm[ 3], xfm[ 4], xfm[ 5]),
+                             Vec3fa(xfm[ 6], xfm[ 7], xfm[ 8]),
+                             Vec3fa(xfm[ 9], xfm[10], xfm[11]));
+      break;
+
+    case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR:
+      space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]),
+                             Vec3fa(xfm[ 4], xfm[ 5], xfm[ 6]),
+                             Vec3fa(xfm[ 8], xfm[ 9], xfm[10]),
+                             Vec3fa(xfm[12], xfm[13], xfm[14]));
+      break;
+
+    default: 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+      break;
+    }
+    return space;
+  }
+
+  void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm)
+  {
+    switch (format)
+    {
+    case RTC_FORMAT_FLOAT3X4_ROW_MAJOR:
+      xfm[ 0] = space.l.vx.x;  xfm[ 1] = space.l.vy.x;  xfm[ 2] = space.l.vz.x;  xfm[ 3] = space.p.x;
+      xfm[ 4] = space.l.vx.y;  xfm[ 5] = space.l.vy.y;  xfm[ 6] = space.l.vz.y;  xfm[ 7] = space.p.y;
+      xfm[ 8] = space.l.vx.z;  xfm[ 9] = space.l.vy.z;  xfm[10] = space.l.vz.z;  xfm[11] = space.p.z;
+      break;
+
+    case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR:
+      xfm[ 0] = space.l.vx.x;  xfm[ 1] = space.l.vx.y;  xfm[ 2] = space.l.vx.z;
+      xfm[ 3] = space.l.vy.x;  xfm[ 4] = space.l.vy.y;  xfm[ 5] = space.l.vy.z;
+      xfm[ 6] = space.l.vz.x;  xfm[ 7] = space.l.vz.y;  xfm[ 8] = space.l.vz.z;
+      xfm[ 9] = space.p.x;     xfm[10] = space.p.y;     xfm[11] = space.p.z;
+      break;
+
+    case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR:
+      xfm[ 0] = space.l.vx.x;  xfm[ 1] = space.l.vx.y;  xfm[ 2] = space.l.vx.z;  xfm[ 3] = 0.f;
+      xfm[ 4] = space.l.vy.x;  xfm[ 5] = space.l.vy.y;  xfm[ 6] = space.l.vy.z;  xfm[ 7] = 0.f;
+      xfm[ 8] = space.l.vz.x;  xfm[ 9] = space.l.vz.y;  xfm[10] = space.l.vz.z;  xfm[11] = 0.f;
+      xfm[12] = space.p.x;     xfm[13] = space.p.y;     xfm[14] = space.p.z;     xfm[15] = 1.f;
+      break;
+
+    default:
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+      break;
+    }
+  }
+
+  RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTransform);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(xfm);
+    const AffineSpace3fa transform = loadTransform(format, (const float*)xfm);
+    geometry->setTransform(transform, timeStep);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTransformQuaternion);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(qd);
+    
+    AffineSpace3fx transform;
+    transform.l.vx.x = qd->scale_x;
+    transform.l.vy.y = qd->scale_y;
+    transform.l.vz.z = qd->scale_z;
+    transform.l.vy.x = qd->skew_xy;
+    transform.l.vz.x = qd->skew_xz;
+    transform.l.vz.y = qd->skew_yz;
+    transform.l.vx.y = qd->translation_x;
+    transform.l.vx.z = qd->translation_y;
+    transform.l.vy.z = qd->translation_z;
+    transform.p.x    = qd->shift_x;
+    transform.p.y    = qd->shift_y;
+    transform.p.z    = qd->shift_z;
+
+    // normalize quaternion
+    Quaternion3f q(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k);
+    q = normalize(q);
+    transform.l.vx.w = q.i;
+    transform.l.vy.w = q.j;
+    transform.l.vz.w = q.k;
+    transform.p.w    = q.r;
+
+    geometry->setQuaternionDecomposition(transform, timeStep);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryTransform);
+    const AffineSpace3fa transform = geometry->getTransform(time);
+    storeTransform(transform, format, (float*)xfm);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  {
+    IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i;
+    args->report(args,filter_args);
+  }
+
+  RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  {
+    OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i;
+    args->report(args,filter_args);
+  }
+  
+  RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewGeometry);
+    RTC_VERIFY_HANDLE(hdevice);
+
+    switch (type)
+    {
+    case RTC_GEOMETRY_TYPE_TRIANGLE:
+    {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+      createTriangleMeshTy createTriangleMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createTriangleMesh);
+      Geometry* geom = createTriangleMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_QUAD:
+    {
+#if defined(EMBREE_GEOMETRY_QUAD)
+      createQuadMeshTy createQuadMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createQuadMesh);
+      Geometry* geom = createQuadMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+    case RTC_GEOMETRY_TYPE_DISC_POINT:
+    case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+    {
+#if defined(EMBREE_GEOMETRY_POINT)
+      createPointsTy createPoints = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_builder_cpu_features, createPoints);
+
+      Geometry *geom;
+      switch(type) {
+        case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+          geom = createPoints(device, Geometry::GTY_SPHERE_POINT);
+          break;
+        case RTC_GEOMETRY_TYPE_DISC_POINT:
+          geom = createPoints(device, Geometry::GTY_DISC_POINT);
+          break;
+        case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+          geom = createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT);
+          break;
+        default:
+          geom = nullptr;
+          break;
+      }
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE:
+    case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE:
+      
+    case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE:
+      
+    case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE:
+
+    case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE:
+
+    case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE:
+    {
+#if defined(EMBREE_GEOMETRY_CURVE)
+      createLineSegmentsTy createLineSegments = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createLineSegments);
+      createCurvesTy createCurves = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createCurves);
+      
+      Geometry* geom;
+      switch (type) {
+      case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE             : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break;
+      case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE            : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE             : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break;
+      //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE  : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE            : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE             : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE  : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break;
+
+      case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break;
+      default:                                    geom = nullptr; break;
+      }
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_SUBDIVISION:
+    {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+      createSubdivMeshTy createSubdivMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh);
+      //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason?
+      Geometry* geom = createSubdivMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_USER:
+    {
+#if defined(EMBREE_GEOMETRY_USER)
+      createUserGeometryTy createUserGeometry = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createUserGeometry);
+      Geometry* geom = createUserGeometry(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_INSTANCE:
+    {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+      createInstanceTy createInstance = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createInstance);
+      Geometry* geom = createInstance(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_GRID:
+    {
+#if defined(EMBREE_GEOMETRY_GRID)
+      createGridMeshTy createGridMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createGridMesh);
+      Geometry* geom = createGridMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported");
+#endif
+    }
+    
+    default:
+      throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type");
+    }
+    
+    RTC_CATCH_END(device);
+    return nullptr;
+  }
+
+  RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryUserPrimitiveCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+    
+    if (unlikely(geometry->getType() != Geometry::GTY_USER_GEOMETRY))
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries"); 
+
+    geometry->setNumPrimitives(userPrimitiveCount);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTimeStepCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    if (timeStepCount > RTC_MAX_TIME_STEP_COUNT)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range");
+    
+    geometry->setNumTimeSteps(timeStepCount);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime)
+  {
+    Ref<Geometry> geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTimeRange);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    if (startTime > endTime)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime");
+        
+    geometry->setTimeRange(BBox1f(startTime,endTime));
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryVertexAttributeCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setVertexAttributeCount(N);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTopologyCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setTopologyCount(N);
+    RTC_CATCH_END2(geometry);
+  }
+ 
+  RTC_API void rtcSetGeometryBuildQuality (RTCGeometry hgeometry, RTCBuildQuality quality) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBuildQuality);
+    RTC_VERIFY_HANDLE(hgeometry);
+    if (quality != RTC_BUILD_QUALITY_LOW &&
+        quality != RTC_BUILD_QUALITY_MEDIUM &&
+        quality != RTC_BUILD_QUALITY_HIGH &&
+        quality != RTC_BUILD_QUALITY_REFIT)
+      // -- GODOT start --
+      // throw std::runtime_error("invalid build quality");
+      abort();
+      // -- GODOT end --
+    geometry->setBuildQuality(quality);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryMaxRadiusScale);
+    RTC_VERIFY_HANDLE(hgeometry);
+#if RTC_MIN_WIDTH
+    if (maxRadiusScale < 1.0f) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale has to be larger or equal to 1");
+    geometry->setMaxRadiusScale(maxRadiusScale);
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled");
+#endif
+    RTC_CATCH_END2(geometry);
+  }
+  
+  RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryMask);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setMask(mask);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometrySubdivisionMode);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setSubdivisionMode(topologyID,mode);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryVertexAttributeTopology);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setVertexAttributeTopology(vertexAttributeID, topologyID);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    Ref<Buffer> buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(hbuffer);
+    
+    if (geometry->device != buffer->device)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    
+    if (itemCount > 0xFFFFFFFFu)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    
+    geometry->setBuffer(type, slot, format, buffer, byteOffset, byteStride, (unsigned int)itemCount);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSharedGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+    
+    if (itemCount > 0xFFFFFFFFu)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    
+    Ref<Buffer> buffer = new Buffer(geometry->device, itemCount*byteStride, (char*)ptr + byteOffset);
+    geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetNewGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    if (itemCount > 0xFFFFFFFFu)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    
+    /* vertex buffers need to get overallocated slightly as elements are accessed using SSE loads */
+    size_t bytes = itemCount*byteStride;
+    if (type == RTC_BUFFER_TYPE_VERTEX || type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+      bytes += (16 - (byteStride%16))%16;
+      
+    Ref<Buffer> buffer = new Buffer(geometry->device, bytes);
+    geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount);
+    return buffer->data();
+    RTC_CATCH_END2(geometry);
+    return nullptr;
+  }
+
+  RTC_API void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryBufferData);
+    RTC_VERIFY_HANDLE(hgeometry);
+    return geometry->getBuffer(type, slot);
+    RTC_CATCH_END2(geometry);
+    return nullptr;
+  }
+  
+  RTC_API void rtcEnableGeometry (RTCGeometry hgeometry) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcEnableGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->enable();
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcUpdateGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->updateBuffer(type, slot);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcDisableGeometry (RTCGeometry hgeometry) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcDisableGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->disable();
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTessellationRate);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setTessellationRate(tessellationRate);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryUserData);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setUserData(ptr);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry)
+  {
+    Geometry* geometry = (Geometry*) hgeometry; // no ref counting here!
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryUserData);
+    RTC_VERIFY_HANDLE(hgeometry);
+    return geometry->getUserData();
+    RTC_CATCH_END2(geometry);
+    return nullptr;
+  }
+
+  RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBoundsFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setBoundsFunction(bounds,userPtr);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryDisplacementFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setDisplacementFunction(displacement);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryIntersectFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setIntersectFunctionN(intersect);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryPointQueryFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setPointQueryFunction(pointQuery);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryFirstHalfEdge);
+    return geometry->getFirstHalfEdge(faceID);
+    RTC_CATCH_END2(geometry);
+    return -1;
+  }
+
+  RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryFace);
+    return geometry->getFace(edgeID);
+    RTC_CATCH_END2(geometry);
+    return -1;
+  }
+
+  RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryNextHalfEdge);
+    return geometry->getNextHalfEdge(edgeID);
+    RTC_CATCH_END2(geometry);
+    return -1;
+  }
+
+  RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryPreviousHalfEdge);
+    return geometry->getPreviousHalfEdge(edgeID);
+    RTC_CATCH_END2(geometry);
+    return -1;
+  }
+
+  RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryOppositeHalfEdge);
+    return geometry->getOppositeHalfEdge(topologyID,edgeID);
+    RTC_CATCH_END2(geometry);
+    return -1;
+  }
+
+  RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetOccludedFunctionN);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setOccludedFunctionN(occluded);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryIntersectFilterFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setIntersectionFilterFunctionN(filter);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryOccludedFilterFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setOcclusionFilterFunctionN(filter);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args)
+  {
+    Geometry* geometry = (Geometry*) args->geometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcInterpolate);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(args->geometry);
+#endif
+    geometry->interpolate(args);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args)
+  {
+    Geometry* geometry = (Geometry*) args->geometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcInterpolateN);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(args->geometry);
+#endif
+    geometry->interpolateN(args);
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcCommitGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCommitGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    return geometry->commit();
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry)
+  {
+    Scene* scene = (Scene*) hscene;
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcAttachGeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    if (scene->device != geometry->device)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    return scene->bind(RTC_INVALID_GEOMETRY_ID,geometry);
+    RTC_CATCH_END2(scene);
+    return -1;
+  }
+
+  RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID)
+  {
+    Scene* scene = (Scene*) hscene;
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcAttachGeometryByID);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_GEOMID(geomID);
+    if (scene->device != geometry->device)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    scene->bind(geomID,geometry);
+    RTC_CATCH_END2(scene);
+  }
+  
+  RTC_API void rtcDetachGeometry (RTCScene hscene, unsigned int geomID)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcDetachGeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID);
+    scene->detachGeometry(geomID);
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcRetainGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->refInc();
+    RTC_CATCH_END2(geometry);
+  }
+  
+  RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->refDec();
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometry);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID);
+#endif
+    return (RTCGeometry) scene->get(geomID);
+    RTC_CATCH_END2(scene);
+    return nullptr;
+  }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h
new file mode 100644
index 0000000000..373e49a689
--- /dev/null
+++ b/thirdparty/embree/kernels/common/rtcore.h
@@ -0,0 +1,142 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../include/embree3/rtcore.h"
+RTC_NAMESPACE_USE
+
+namespace embree
+{  
+  /*! decoding of intersection flags */
+  __forceinline bool isCoherent  (RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_COHERENT; }
+  __forceinline bool isIncoherent(RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; }
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8)
+#  define USE_TASK_ARENA 1
+#else
+#  define USE_TASK_ARENA 0
+#endif
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9
+#  define TASKING_TBB_USE_TASK_ISOLATION 1
+#else
+#  define TASKING_TBB_USE_TASK_ISOLATION 0
+#endif
+
+/*! Macros used in the rtcore API implementation */
+// -- GODOT start --
+// #define RTC_CATCH_BEGIN try {
+#define RTC_CATCH_BEGIN
+  
+// #define RTC_CATCH_END(device)                                                \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device::process_error(device,e.error,e.what());                             \
+//   } catch (std::exception& e) {                                                 \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//   } catch (...) {                                                               \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END(device)
+  
+// #define RTC_CATCH_END2(scene)                                                \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,e.error,e.what());                             \
+//   } catch (std::exception& e) {                                                 \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//   } catch (...) {                                                               \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END2(scene)
+
+// #define RTC_CATCH_END2_FALSE(scene)                                             \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//     return false;                                                               \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,e.error,e.what());                             \
+//     return false;                                                               \
+//   } catch (std::exception& e) {                                                 \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//     return false;                                                               \
+//   } catch (...) {                                                               \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//     return false;                                                               \
+//   }
+#define RTC_CATCH_END2_FALSE(scene) return false;
+// -- GODOT end --
+
+#define RTC_VERIFY_HANDLE(handle)                               \
+  if (handle == nullptr) {                                         \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+
+#define RTC_VERIFY_GEOMID(id)                                   \
+  if (id == RTC_INVALID_GEOMETRY_ID) {                             \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+
+#define RTC_VERIFY_UPPER(id,upper)                              \
+  if (id > upper) {                                                \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+
+#define RTC_VERIFY_RANGE(id,lower,upper)	\
+  if (id < lower || id > upper)						  \
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds");
+  
+#if 0 // enable to debug print all API calls
+#define RTC_TRACE(x) std::cout << #x << std::endl;
+#else
+#define RTC_TRACE(x) 
+#endif
+
+// -- GODOT begin --
+//   /*! used to throw embree API errors */
+//   struct rtcore_error : public std::exception
+//   {
+//     __forceinline rtcore_error(RTCError error, const std::string& str)
+//       : error(error), str(str) {}
+//     
+//     ~rtcore_error() throw() {}
+//     
+//     const char* what () const throw () {
+//       return str.c_str();
+//     }
+//     
+//     RTCError error;
+//     std::string str;
+//   };
+// -- GODOT end --
+
+#if defined(DEBUG) // only report file and line in debug mode
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+  #define throw_RTCError(error,str) \
+    printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+  // -- GODOT end --
+#else
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,str);
+  #define throw_RTCError(error,str) \
+    abort();
+  // -- GODOT end --
+#endif
+
+#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
+  (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) 
+}
diff --git a/thirdparty/embree/kernels/common/rtcore_builder.cpp b/thirdparty/embree/kernels/common/rtcore_builder.cpp
new file mode 100644
index 0000000000..1f1b6f6ddf
--- /dev/null
+++ b/thirdparty/embree/kernels/common/rtcore_builder.cpp
@@ -0,0 +1,442 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "alloc.h"
+
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/bvh_builder_morton.h"
+
+namespace embree
+{ 
+  namespace isa // FIXME: support more ISAs for builders
+  {
+    struct BVH : public RefCount
+    {
+      BVH (Device* device)
+        : device(device), allocator(device,true), morton_src(device,0), morton_tmp(device,0)
+      {
+        device->refInc();
+      }
+
+      ~BVH() {
+        device->refDec();
+      }
+
+    public:
+      Device* device;
+      FastAllocator allocator;
+      mvector<BVHBuilderMorton::BuildPrim> morton_src;
+      mvector<BVHBuilderMorton::BuildPrim> morton_tmp;
+    };
+
+    void* rtcBuildBVHMorton(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims_i =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+        
+      std::atomic<size_t> progress(0);
+      
+      /* initialize temporary arrays for morton builder */
+      PrimRef* prims = (PrimRef*) prims_i;
+      mvector<BVHBuilderMorton::BuildPrim>& morton_src = bvh->morton_src;
+      mvector<BVHBuilderMorton::BuildPrim>& morton_tmp = bvh->morton_tmp;
+      morton_src.resize(primitiveCount);
+      morton_tmp.resize(primitiveCount);
+
+      /* compute centroid bounds */
+      const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range<size_t>& r) -> BBox3fa {
+
+          BBox3fa bounds(empty);
+          for (size_t i=r.begin(); i<r.end(); i++) 
+            bounds.extend(prims[i].bounds().center2());
+          return bounds;
+        }, BBox3fa::merge);
+      
+      /* compute morton codes */
+      BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+      parallel_for ( size_t(0), primitiveCount, [&](const range<size_t>& r) {
+          BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]);
+          for (size_t i=r.begin(); i<r.end(); i++) {
+            generator(prims[i].bounds(),(unsigned) i);
+          }
+        });
+
+      /* start morton build */
+      std::pair<void*,BBox3fa> root = BVHBuilderMorton::build<std::pair<void*,BBox3fa>>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+        
+        /* lambda function that allocates BVH nodes */
+        [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* {
+          return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+        },
+        
+        /* lambda function that sets bounds */
+        [&] (void* node, const std::pair<void*,BBox3fa>* children, size_t N) -> std::pair<void*,BBox3fa>
+        {
+          BBox3fa bounds = empty;
+          void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+          const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) {
+            bounds.extend(children[i].second);
+            childptrs[i] = children[i].first;
+            cbounds[i] = (const RTCBounds*)&children[i].second;
+          }
+          setNodeBounds(node,cbounds,(unsigned int)N,userPtr);
+          setNodeChildren(node,childptrs, (unsigned int)N,userPtr);
+          return std::make_pair(node,bounds);
+        },
+        
+        /* lambda function that creates BVH leaves */
+        [&]( const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) -> std::pair<void*,BBox3fa>
+        {
+	  RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF];
+	  BBox3fa bounds = empty;
+	  for (size_t i=0;i<current.size();i++)
+	    {
+	      const size_t id = morton_src[current.begin()+i].index;
+	      bounds.extend(prims[id].bounds());
+	      localBuildPrims[i] = prims_i[id];
+	    }
+          void* node = createLeaf((RTCThreadLocalAllocator)&alloc,localBuildPrims,current.size(),userPtr);
+          return std::make_pair(node,bounds);
+        },
+        
+        /* lambda that calculates the bounds for some primitive */
+        [&] (const BVHBuilderMorton::BuildPrim& morton) -> BBox3fa {
+          return prims[morton.index].bounds();
+        },
+        
+        /* progress monitor function */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        morton_src.data(),morton_tmp.data(),primitiveCount,
+        *arguments);
+
+      bvh->allocator.cleanup();
+      return root.first;
+    }
+
+    void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+      
+      std::atomic<size_t> progress(0);
+  
+      /* calculate priminfo */
+      auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa
+        {
+          CentGeomBBox3fa bounds(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+            bounds.extend((BBox3fa&)prims[j]);
+          return bounds;
+        };
+      const CentGeomBBox3fa bounds = 
+        parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+      const PrimInfo pinfo(0,primitiveCount,bounds);
+      
+      /* build BVH */
+      void* root = BVHBuilderBinnedSAH::build<void*>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+
+        /* lambda function that creates BVH nodes */
+        [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+        {
+          void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+          const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+          setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+          return node;
+        },
+
+        /* lambda function that updates BVH nodes */
+        [&](const BVHBuilderBinnedSAH::BuildRecord& precord, const BVHBuilderBinnedSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+          setNodeChildren(node,children, (unsigned int)N,userPtr);
+          return node;
+        },
+        
+        /* lambda function that creates BVH leaves */
+        [&](const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+          return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+        },
+        
+        /* progress monitor function */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        (PrimRef*)prims,pinfo,*arguments);
+        
+      bvh->allocator.cleanup();
+      return root;
+    }
+
+    static __forceinline const std::pair<CentGeomBBox3fa,unsigned int> mergePair(const std::pair<CentGeomBBox3fa,unsigned int>& a, const std::pair<CentGeomBBox3fa,unsigned int>& b) {
+      CentGeomBBox3fa centBounds = CentGeomBBox3fa::merge2(a.first,b.first);
+      unsigned int maxGeomID = max(a.second,b.second); 
+      return std::pair<CentGeomBBox3fa,unsigned int>(centBounds,maxGeomID);
+    }
+
+    void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+      
+      std::atomic<size_t> progress(0);
+  
+      /* calculate priminfo */
+
+      auto computeBounds = [&](const range<size_t>& r) -> std::pair<CentGeomBBox3fa,unsigned int>
+        {
+          CentGeomBBox3fa bounds(empty);
+          unsigned maxGeomID = 0;
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            bounds.extend((BBox3fa&)prims[j]);
+            maxGeomID = max(maxGeomID,prims[j].geomID);
+          }
+          return std::pair<CentGeomBBox3fa,unsigned int>(bounds,maxGeomID);
+        };
+
+
+      const std::pair<CentGeomBBox3fa,unsigned int> pair = 
+        parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa(empty),0), computeBounds, mergePair);
+
+      CentGeomBBox3fa bounds = pair.first;
+      const unsigned int maxGeomID = pair.second;
+      
+      if (unlikely(maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))))
+        {
+          /* fallback code for max geomID larger than threshold */
+          return rtcBuildBVHBinnedSAH(arguments);
+        }
+
+      const PrimInfo pinfo(0,primitiveCount,bounds);
+
+      /* function that splits a build primitive */
+      struct Splitter
+      {
+        Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr)
+          : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {}
+        
+        __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const 
+        {
+          prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK;
+          splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+          left_o.geomIDref()  = geomID; left_o.primIDref()  = primID;
+          right_o.geomIDref() = geomID; right_o.primIDref() = primID;
+        }
+
+        __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const 
+        {
+          PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID);
+          splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+        }
+   
+        RTCSplitPrimitiveFunction splitPrimitive;
+        unsigned geomID;
+        unsigned primID;
+        void* userPtr;
+      };
+
+      /* build BVH */
+      void* root = BVHBuilderBinnedFastSpatialSAH::build<void*>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+
+        /* lambda function that creates BVH nodes */
+        [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+        {
+          void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+          const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+          setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+          return node;
+        },
+
+        /* lambda function that updates BVH nodes */
+        [&] (const BVHBuilderBinnedFastSpatialSAH::BuildRecord& precord, const BVHBuilderBinnedFastSpatialSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+          setNodeChildren(node,children, (unsigned int)N,userPtr);
+          return node;
+        },
+        
+        /* lambda function that creates BVH leaves */
+        [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+          return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+        },
+        
+        /* returns the splitter */
+        [&] ( const PrimRef& prim ) -> Splitter {
+          return Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr);
+        },
+
+        /* progress monitor function */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        (PrimRef*)prims,
+        arguments->primitiveArrayCapacity,
+        pinfo,*arguments);
+        
+      bvh->allocator.cleanup();
+      return root;
+    }
+  }
+}
+
+using namespace embree;
+using namespace embree::isa;
+
+RTC_NAMESPACE_BEGIN
+
+    RTC_API RTCBVH rtcNewBVH(RTCDevice device)
+    {
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcNewAllocator);
+      RTC_VERIFY_HANDLE(device);
+      BVH* bvh = new BVH((Device*)device);
+      return (RTCBVH) bvh->refInc();
+      RTC_CATCH_END((Device*)device);
+      return nullptr;
+    }
+
+    RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcBuildBVH);
+      RTC_VERIFY_HANDLE(bvh);
+      RTC_VERIFY_HANDLE(arguments);
+      RTC_VERIFY_HANDLE(arguments->createNode);
+      RTC_VERIFY_HANDLE(arguments->setNodeChildren);
+      RTC_VERIFY_HANDLE(arguments->setNodeBounds);
+      RTC_VERIFY_HANDLE(arguments->createLeaf);
+
+      if (arguments->primitiveArrayCapacity < arguments->primitiveCount)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount")
+
+      /* initialize the allocator */
+      bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa));
+      bvh->allocator.reset();
+
+      /* switch between differnet builders based on quality level */
+      if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW)
+        return rtcBuildBVHMorton(arguments);
+      else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM)
+        return rtcBuildBVHBinnedSAH(arguments);
+      else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) {
+        if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount)
+          return rtcBuildBVHBinnedSAH(arguments);
+        else
+          return rtcBuildBVHSpatialSAH(arguments);
+      }
+      else
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality");
+
+      /* if we are in dynamic mode, then do not clear temporary data */
+      if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC))
+      {
+        bvh->morton_src.clear();
+        bvh->morton_tmp.clear();
+      }
+
+      RTC_CATCH_END(bvh->device);
+      return nullptr;
+    }
+
+    RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align)
+    {
+      FastAllocator::CachedAllocator* alloc = (FastAllocator::CachedAllocator*) localAllocator;
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcThreadLocalAlloc);
+      return alloc->malloc0(bytes,align);
+      RTC_CATCH_END(alloc->alloc->getDevice());
+      return nullptr;
+    }
+
+    RTC_API void rtcMakeStaticBVH(RTCBVH hbvh)
+    {
+      BVH* bvh = (BVH*) hbvh;
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcStaticBVH);
+      RTC_VERIFY_HANDLE(hbvh);
+      bvh->morton_src.clear();
+      bvh->morton_tmp.clear();
+      RTC_CATCH_END(bvh->device);
+    }
+
+    RTC_API void rtcRetainBVH(RTCBVH hbvh)
+    {
+      BVH* bvh = (BVH*) hbvh;
+      Device* device = bvh ? bvh->device : nullptr;
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcRetainBVH);
+      RTC_VERIFY_HANDLE(hbvh);
+      bvh->refInc();
+      RTC_CATCH_END(device);
+    }
+    
+    RTC_API void rtcReleaseBVH(RTCBVH hbvh)
+    {
+      BVH* bvh = (BVH*) hbvh;
+      Device* device = bvh ? bvh->device : nullptr;
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcReleaseBVH);
+      RTC_VERIFY_HANDLE(hbvh);
+      bvh->refDec();
+      RTC_CATCH_END(device);
+    }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp
new file mode 100644
index 0000000000..408d7eae6f
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene.cpp
@@ -0,0 +1,955 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+#include "../../common/algorithms/parallel_reduce.h"
+ 
+namespace embree
+{
+  /* error raising rtcIntersect and rtcOccluded functions */
+  void missing_rtcCommit()      { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); }
+  void invalid_rtcIntersect1()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); }
+  void invalid_rtcIntersect4()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); }
+  void invalid_rtcIntersect8()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); }
+  void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); }
+  void invalid_rtcIntersectN()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); }
+
+  Scene::Scene (Device* device)
+    : device(device),
+      flags_modified(true), enabled_geometry_types(0),
+      scene_flags(RTC_SCENE_FLAG_NONE),
+      quality_flags(RTC_BUILD_QUALITY_MEDIUM),
+      is_build(false), modified(true),
+      progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0)
+  {
+    device->refInc();
+
+    intersectors = Accel::Intersectors(missing_rtcCommit);
+
+    /* one can overwrite flags through device for debugging */
+    if (device->quality_flags != -1)
+      quality_flags = (RTCBuildQuality) device->quality_flags;
+    if (device->scene_flags != -1)
+      scene_flags = (RTCSceneFlags) device->scene_flags;
+  }
+
+  Scene::~Scene() noexcept
+  {
+    device->refDec();
+  }
+  
+  void Scene::printStatistics()
+  {
+    /* calculate maximum number of time segments */
+    unsigned max_time_steps = 0;
+    for (size_t i=0; i<size(); i++) {
+      if (!get(i)) continue;
+      max_time_steps = max(max_time_steps,get(i)->numTimeSteps);
+    }
+
+    /* initialize vectors*/
+    std::vector<size_t> statistics[Geometry::GTY_END];
+    for (size_t i=0; i<Geometry::GTY_END; i++)
+      statistics[i].resize(max_time_steps);
+
+    /* gather statistics */
+    for (size_t i=0; i<size(); i++) 
+    {
+      if (!get(i)) continue;
+      int ty = get(i)->getType(); 
+      assert(ty<Geometry::GTY_END);
+      int timesegments = get(i)->numTimeSegments(); 
+      assert((unsigned int)timesegments < max_time_steps);
+      statistics[ty][timesegments] += get(i)->size();
+    }
+
+    /* print statistics */
+    std::cout << std::setw(23) << "segments" << ": ";
+    for (size_t t=0; t<max_time_steps; t++)
+      std::cout << std::setw(10) << t;
+    std::cout << std::endl;
+
+    std::cout << "-------------------------";
+    for (size_t t=0; t<max_time_steps; t++)
+      std::cout << "----------";
+    std::cout << std::endl;
+    
+    for (size_t p=0; p<Geometry::GTY_END; p++)
+    {
+      if (std::string(Geometry::gtype_names[p]) == "") continue;
+      std::cout << std::setw(23) << Geometry::gtype_names[p] << ": ";
+      for (size_t t=0; t<max_time_steps; t++)
+        std::cout << std::setw(10) << statistics[p][t];
+      std::cout << std::endl;
+    }
+  }
+
+  void Scene::createTriangleAccel()
+  {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    if (device->tri_accel == "default") 
+    {
+      if (quality_flags != RTC_BUILD_QUALITY_LOW)
+      {
+        int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); 
+        switch (mode) {
+        case /*0b00*/ 0: 
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH) 
+              accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          else 
+#endif
+          { 
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH) 
+              accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else 
+              accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          break;
+
+        case /*0b01*/ 1: 
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX()) 
+            accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          else
+#endif
+            accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+
+          break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else /* dynamic */
+      {
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+          else
+#endif
+          {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+      }
+    }
+    else if (device->tri_accel == "bvh4.triangle4")       accels_add(device->bvh4_factory->BVH4Triangle4 (this));
+    else if (device->tri_accel == "bvh4.triangle4v")      accels_add(device->bvh4_factory->BVH4Triangle4v(this));
+    else if (device->tri_accel == "bvh4.triangle4i")      accels_add(device->bvh4_factory->BVH4Triangle4i(this));
+    else if (device->tri_accel == "qbvh4.triangle4i")     accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->tri_accel == "bvh8.triangle4")       accels_add(device->bvh8_factory->BVH8Triangle4 (this));
+    else if (device->tri_accel == "bvh8.triangle4v")      accels_add(device->bvh8_factory->BVH8Triangle4v(this));
+    else if (device->tri_accel == "bvh8.triangle4i")      accels_add(device->bvh8_factory->BVH8Triangle4i(this));
+    else if (device->tri_accel == "qbvh8.triangle4i")     accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this));
+    else if (device->tri_accel == "qbvh8.triangle4")      accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel);
+#endif
+  }
+
+  void Scene::createTriangleMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    if (device->tri_accel_mb == "default")
+    {
+      int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); 
+      
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else
+#endif
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+    }
+    else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this));
+    else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this));
+    else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb);
+#endif
+  }
+
+  void Scene::createQuadAccel()
+  {
+#if defined(EMBREE_GEOMETRY_QUAD)
+    if (device->quad_accel == "default") 
+    {
+      if (quality_flags != RTC_BUILD_QUALITY_LOW)
+      {
+        /* static */
+        int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); 
+        switch (mode) {
+        case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH) 
+              accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          else
+#endif
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH) 
+              accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          break;
+
+        case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+            accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          else
+#endif
+            accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          break;
+
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else /* dynamic */
+      {
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+          else
+#endif
+          {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+      }
+    }
+    else if (device->quad_accel == "bvh4.quad4v")       accels_add(device->bvh4_factory->BVH4Quad4v(this));
+    else if (device->quad_accel == "bvh4.quad4i")       accels_add(device->bvh4_factory->BVH4Quad4i(this));
+    else if (device->quad_accel == "qbvh4.quad4i")      accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->quad_accel == "bvh8.quad4v")       accels_add(device->bvh8_factory->BVH8Quad4v(this));
+    else if (device->quad_accel == "bvh8.quad4i")       accels_add(device->bvh8_factory->BVH8Quad4i(this));
+    else if (device->quad_accel == "qbvh8.quad4i")      accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel);
+#endif
+  }
+
+  void Scene::createQuadMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_QUAD)
+    if (device->quad_accel_mb == "default") 
+    {
+      int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); 
+      switch (mode) {
+      case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+        if (device->canUseAVX())
+          accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+        else
+#endif
+          accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+        break;
+
+      case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+        if (device->canUseAVX())
+          accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+        else
+#endif
+          accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+        break;
+
+      case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+      case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+      }
+    }
+    else if (device->quad_accel_mb == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->quad_accel_mb == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+device->quad_accel_mb);
+#endif
+  }
+
+  void Scene::createHairAccel()
+  {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+    if (device->hair_accel == "default")
+    {
+      int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // only enable on HSW machines, for SNB this codepath is slower
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else
+#endif
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+    }
+    else if (device->hair_accel == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST));
+    else if (device->hair_accel == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->hair_accel == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST));
+    else if (device->hair_accel == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+device->hair_accel);
+#endif
+  }
+
+  void Scene::createHairMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+    if (device->hair_accel_mb == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // only enable on HSW machines, on SNB this codepath is slower
+      {
+        if (isRobustAccel()) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::ROBUST));
+        else                 accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+      }
+      else
+#endif
+      {
+        if (isRobustAccel()) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::ROBUST));
+        else                 accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST));
+      }
+    }
+    else if (device->hair_accel_mb == "bvh4.virtualcurve4imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->hair_accel_mb == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+    else if (device->hair_accel_mb == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+device->hair_accel_mb);
+#endif
+  }
+
+  void Scene::createSubdivAccel()
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    if (device->subdiv_accel == "default") {
+      accels_add(device->bvh4_factory->BVH4SubdivPatch1(this));
+    }
+    else if (device->subdiv_accel == "bvh4.grid.eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this));
+    else if (device->subdiv_accel == "bvh4.subdivpatch1eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this));
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+device->subdiv_accel);
+#endif
+  }
+
+  void Scene::createSubdivMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    if (device->subdiv_accel_mb == "default") {
+      accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this));
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+device->subdiv_accel_mb);
+#endif
+  }
+
+  void Scene::createUserGeometryAccel()
+  {
+#if defined(EMBREE_GEOMETRY_USER)
+    if (device->object_accel == "default") 
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+      {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC));
+        }
+      }
+      else
+#endif
+      {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC));
+        }
+      }
+    }
+    else if (device->object_accel == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->object_accel == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometry(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+device->object_accel);
+#endif
+  }
+
+  void Scene::createUserGeometryMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_USER)
+    if (device->object_accel_mb == "default"    ) {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+        accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+    }
+    else if (device->object_accel_mb == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->object_accel_mb == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+device->object_accel_mb);
+#endif
+  }
+
+  void Scene::createInstanceAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    // if (device->object_accel == "default") 
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      } 
+      else
+#endif
+      {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      }
+    }
+    // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel);
+#endif
+  }
+
+  void Scene::createInstanceMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    //if (device->instance_accel_mb == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+        accels_add(device->bvh8_factory->BVH8InstanceMB(this, false));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4InstanceMB(this, false));
+    }
+    //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb);
+#endif
+  }
+
+  void Scene::createInstanceExpensiveAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    // if (device->object_accel == "default") 
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      } 
+      else
+#endif
+      {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      }
+    }
+    // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel);
+#endif
+  }
+
+  void Scene::createInstanceExpensiveMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    //if (device->instance_accel_mb == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+        accels_add(device->bvh8_factory->BVH8InstanceMB(this, true));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4InstanceMB(this, true));
+    }
+    //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb);
+#endif
+  }
+
+  void Scene::createGridAccel()
+  {
+    BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
+#if defined(EMBREE_GEOMETRY_GRID)
+    if (device->grid_accel == "default") 
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+      {
+        accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+      }
+      else
+#endif
+      {
+        accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+      }
+    }
+    else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel);
+#endif
+
+  }
+
+  void Scene::createGridMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_GRID)
+    if (device->grid_accel_mb == "default") 
+    {
+      accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC));
+    }
+    else if (device->grid_accel_mb == "bvh4mb.grid") accels_add(device->bvh4_factory->BVH4GridMB(this));
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel);
+#endif
+
+  }
+  
+  void Scene::clear() {
+  }
+
+  unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry) 
+  {
+    Lock<SpinLock> lock(geometriesMutex);
+    if (geomID == RTC_INVALID_GEOMETRY_ID) {
+      geomID = id_pool.allocate();
+      if (geomID == RTC_INVALID_GEOMETRY_ID)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene");
+    }
+    else
+    {
+      if (!id_pool.add(geomID))
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided");
+    }
+    if (geomID >= geometries.size()) {
+      geometries.resize(geomID+1);
+      vertices.resize(geomID+1);
+      geometryModCounters_.resize(geomID+1);
+    }
+    geometries[geomID] = geometry;
+    geometryModCounters_[geomID] = 0;
+    if (geometry->isEnabled()) {
+      setModified ();
+    }
+    return geomID;
+  }
+
+  void Scene::detachGeometry(size_t geomID)
+  {
+    Lock<SpinLock> lock(geometriesMutex);
+    
+    if (geomID >= geometries.size())
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID");
+
+    Ref<Geometry>& geometry = geometries[geomID];
+    if (geometry == null)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry");
+    
+    if (geometry->isEnabled()) {
+      setModified ();
+    }
+    accels_deleteGeometry(unsigned(geomID));
+    id_pool.deallocate((unsigned)geomID);
+    geometries[geomID] = null;
+    vertices[geomID] = nullptr;
+    geometryModCounters_[geomID] = 0;
+  }
+
+  void Scene::updateInterface()
+  {
+    is_build = true;
+  }
+
+  void Scene::commit_task ()
+  {
+    checkIfModifiedAndSet ();
+    if (!isModified()) {
+      return;
+    }
+    
+    /* print scene statistics */
+    if (device->verbosity(2))
+      printStatistics();
+
+    progress_monitor_counter = 0;
+    
+    /* gather scene stats and call preCommit function of each geometry */
+    this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), 
+      [this](const range<size_t>& r)->GeometryCounts
+      {
+        GeometryCounts c;
+        for (auto i=r.begin(); i<r.end(); ++i) 
+        {
+          if (geometries[i] && geometries[i]->isEnabled()) 
+          {
+            geometries[i]->preCommit();
+            geometries[i]->addElementsToCount (c);
+            c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions();
+          }
+        }
+        return c;
+      },
+      std::plus<GeometryCounts>()
+    );
+    
+    /* select acceleration structures to build */
+    unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask();
+    if (flags_modified || new_enabled_geometry_types != enabled_geometry_types)
+    {
+      accels_init();
+
+      /* we need to make all geometries modified, otherwise two level builder will 
+        not rebuild currently not modified geometries */
+      parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) {
+          geometryModCounters_[i] = 0;
+        });
+      
+      if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel();
+      if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel();
+      if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel();
+      if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel();
+      if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel();
+      if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel();
+      if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel();
+      if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel();
+      if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel();
+      if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel();
+      if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel();
+      if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel();
+      
+      flags_modified = false;
+      enabled_geometry_types = new_enabled_geometry_types;
+    }
+    
+    /* select fast code path if no filter function is present */
+    accels_select(hasFilterFunction());
+  
+    /* build all hierarchies of this scene */
+    accels_build();
+
+    /* make static geometry immutable */
+    if (!isDynamicAccel()) {
+      accels_immutable();
+      flags_modified = true; // in non-dynamic mode we have to re-create accels
+    }
+
+    /* call postCommit function of each geometry */
+    parallel_for(geometries.size(), [&] ( const size_t i ) {
+        if (geometries[i] && geometries[i]->isEnabled()) {
+          geometries[i]->postCommit();
+          vertices[i] = geometries[i]->getCompactVertexArray();
+          geometryModCounters_[i] = geometries[i]->getModCounter();
+        }
+      });
+      
+    updateInterface();
+
+    if (device->verbosity(2)) {
+      std::cout << "created scene intersector" << std::endl;
+      accels_print(2);
+      std::cout << "selected scene intersector" << std::endl;
+      intersectors.print(2);
+    }
+    
+    setModified(false);
+  }
+
+  void Scene::setBuildQuality(RTCBuildQuality quality_flags_i)
+  {
+    if (quality_flags == quality_flags_i) return;
+    quality_flags = quality_flags_i;
+    flags_modified = true;
+  }
+
+  RTCBuildQuality Scene::getBuildQuality() const {
+    return quality_flags;
+  }
+
+  void Scene::setSceneFlags(RTCSceneFlags scene_flags_i)
+  {
+    if (scene_flags == scene_flags_i) return;
+    scene_flags = scene_flags_i;
+    flags_modified = true;
+  }
+
+  RTCSceneFlags Scene::getSceneFlags() const {
+    return scene_flags;
+  }
+                   
+#if defined(TASKING_INTERNAL)
+
+  void Scene::commit (bool join) 
+  {
+    Lock<MutexSys> buildLock(buildMutex,false);
+
+    /* allocates own taskscheduler for each build */
+    Ref<TaskScheduler> scheduler = nullptr;
+    { 
+      Lock<MutexSys> lock(schedulerMutex);
+      scheduler = this->scheduler;
+      if (scheduler == null) {
+        buildLock.lock();
+        this->scheduler = scheduler = new TaskScheduler;
+      }
+    }
+
+    /* worker threads join build */
+    if (!buildLock.isLocked())
+    {
+      if (!join) 
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation");
+      
+      scheduler->join();
+      return;
+    }
+
+    /* initiate build */
+    // -- GODOT start --
+    // try {
+      scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+    // }
+    // catch (...) {
+    //   accels_clear();
+    //   updateInterface();
+    //   Lock<MutexSys> lock(schedulerMutex);
+    //   this->scheduler = nullptr;
+    //   throw;
+    // }
+    // -- GODOT end --
+  }
+
+#endif
+
+#if defined(TASKING_TBB)
+
+  void Scene::commit (bool join) 
+  {
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+    if (join)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version");
+#endif
+
+    /* try to obtain build lock */
+    Lock<MutexSys> lock(buildMutex,buildMutex.try_lock());
+
+    /* join hierarchy build */
+    if (!lock.isLocked())
+    {
+#if !TASKING_TBB_USE_TASK_ISOLATION
+      if (!join) 
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version");
+#endif
+      
+      do {
+
+#if USE_TASK_ARENA
+        if (join) {
+          device->arena->execute([&]{ group.wait(); });
+        }
+        else
+#endif
+        {
+          group.wait();
+        }
+
+        pause_cpu();
+        yield();
+      } while (!buildMutex.try_lock());
+      
+      buildMutex.unlock();
+      return;
+    }   
+
+    /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+    const unsigned int mxcsr = _mm_getcsr();
+    _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
+    
+    try {
+#if TBB_INTERFACE_VERSION_MAJOR < 8    
+      tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits);
+#else
+      tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings );
+#endif
+      //ctx.set_priority(tbb::priority_high);
+
+#if USE_TASK_ARENA
+      if (join)
+      {
+        device->arena->execute([&]{
+            group.run([&]{
+                tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+              });
+            group.wait();
+          });
+      }
+      else
+#endif
+      {
+        group.run([&]{
+            tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+          });
+        group.wait();
+      }
+     
+      /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+    } 
+    catch (...)
+    {
+      /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+      
+      accels_clear();
+      updateInterface();
+      throw;
+    }
+  }
+#endif
+
+#if defined(TASKING_PPL)
+
+  void Scene::commit (bool join) 
+  {
+#if defined(TASKING_PPL)
+    if (join)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL");
+#endif
+
+    /* try to obtain build lock */
+    Lock<MutexSys> lock(buildMutex);
+
+    checkIfModifiedAndSet ();
+    if (!isModified()) {
+      return;
+    }
+
+    /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+    const unsigned int mxcsr = _mm_getcsr();
+    _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
+    
+    try {
+
+      group.run([&]{
+          concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); });
+        });
+      group.wait();
+
+       /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+    } 
+    catch (...)
+    {
+      /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+      
+      accels_clear();
+      updateInterface();
+      throw;
+    }
+  }
+#endif
+
+  void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) 
+  {
+    progress_monitor_function = func;
+    progress_monitor_ptr      = ptr;
+  }
+
+  void Scene::progressMonitor(double dn)
+  {
+    if (progress_monitor_function) {
+      size_t n = size_t(dn) + progress_monitor_counter.fetch_add(size_t(dn));
+      if (!progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())))) {
+        throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination");
+      }
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/common/scene.h b/thirdparty/embree/kernels/common/scene.h
new file mode 100644
index 0000000000..5ed80a63f6
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene.h
@@ -0,0 +1,390 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+ 
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "builder.h"
+#include "../../common/algorithms/parallel_any_of.h"
+#include "scene_triangle_mesh.h"
+#include "scene_quad_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_line_segments.h"
+#include "scene_subdiv_mesh.h"
+#include "scene_grid_mesh.h"
+#include "scene_points.h"
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+namespace embree
+{
+  /*! Base class all scenes are derived from */
+  class Scene : public AccelN
+  {
+    ALIGNED_CLASS_(std::alignment_of<Scene>::value);
+
+  public:
+    template<typename Ty, bool mblur = false>
+      class Iterator
+      {
+      public:
+      Iterator ()  {}
+      
+      Iterator (Scene* scene, bool all = false) 
+      : scene(scene), all(all) {}
+      
+      __forceinline Ty* at(const size_t i)
+      {
+        Geometry* geom = scene->geometries[i].ptr;
+        if (geom == nullptr) return nullptr;
+        if (!all && !geom->isEnabled()) return nullptr;
+        const size_t mask = geom->getTypeMask() & Ty::geom_type; 
+        if (!(mask)) return nullptr;
+        if ((geom->numTimeSteps != 1) != mblur) return nullptr;
+        return (Ty*) geom;
+      }
+
+      __forceinline Ty* operator[] (const size_t i) {
+        return at(i);
+      }
+
+      __forceinline size_t size() const {
+        return scene->size();
+      }
+      
+      __forceinline size_t numPrimitives() const {
+        return scene->getNumPrimitives(Ty::geom_type,mblur);
+      }
+
+      __forceinline size_t maxPrimitivesPerGeometry() 
+      {
+        size_t ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,mesh->size());
+        }
+        return ret;
+      }
+
+      __forceinline unsigned int maxGeomID() 
+      {
+        unsigned int ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,(unsigned int)i);
+        }
+        return ret;
+      }
+
+      __forceinline unsigned maxTimeStepsPerGeometry()
+      {
+        unsigned ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,mesh->numTimeSteps);
+        }
+        return ret;
+      }
+      
+    private:
+      Scene* scene;
+      bool all;
+      };
+
+      class Iterator2
+      {
+      public:
+      Iterator2 () {}
+      
+      Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur) 
+      : scene(scene), typemask(typemask), mblur(mblur) {}
+      
+      __forceinline Geometry* at(const size_t i)
+      {
+        Geometry* geom = scene->geometries[i].ptr;
+        if (geom == nullptr) return nullptr;
+        if (!geom->isEnabled()) return nullptr;
+        if (!(geom->getTypeMask() & typemask)) return nullptr;
+        if ((geom->numTimeSteps != 1) != mblur) return nullptr;
+        return geom;
+      }
+
+      __forceinline Geometry* operator[] (const size_t i) {
+        return at(i);
+      }
+
+      __forceinline size_t size() const {
+        return scene->size();
+      }
+      
+    private:
+      Scene* scene;
+      Geometry::GTypeMask typemask;
+      bool mblur;
+    };
+
+  public:
+    
+    /*! Scene construction */
+    Scene (Device* device);
+
+    /*! Scene destruction */
+    ~Scene () noexcept;
+
+  private:
+    /*! class is non-copyable */
+    Scene (const Scene& other) DELETED; // do not implement
+    Scene& operator= (const Scene& other) DELETED; // do not implement
+
+  public:
+    void createTriangleAccel();
+    void createTriangleMBAccel();
+    void createQuadAccel();
+    void createQuadMBAccel();
+    void createHairAccel();
+    void createHairMBAccel();
+    void createSubdivAccel();
+    void createSubdivMBAccel();
+    void createUserGeometryAccel();
+    void createUserGeometryMBAccel();
+    void createInstanceAccel();
+    void createInstanceMBAccel();
+    void createInstanceExpensiveAccel();
+    void createInstanceExpensiveMBAccel();
+    void createGridAccel();
+    void createGridMBAccel();
+
+    /*! prints statistics about the scene */
+    void printStatistics();
+
+    /*! clears the scene */
+    void clear();
+
+    /*! detaches some geometry */
+    void detachGeometry(size_t geomID);
+
+    void setBuildQuality(RTCBuildQuality quality_flags);
+    RTCBuildQuality getBuildQuality() const;
+    
+    void setSceneFlags(RTCSceneFlags scene_flags);
+    RTCSceneFlags getSceneFlags() const;
+    
+    void commit (bool join);
+    void commit_task ();
+    void build () {}
+
+    void updateInterface();
+
+    /* return number of geometries */
+    __forceinline size_t size() const { return geometries.size(); }
+    
+    /* bind geometry to the scene */
+    unsigned int bind (unsigned geomID, Ref<Geometry> geometry);
+    
+    /* determines if scene is modified */
+    __forceinline bool isModified() const { return modified; }
+
+    /* sets modified flag */
+    __forceinline void setModified(bool f = true) { 
+      modified = f; 
+    }
+
+    __forceinline bool isGeometryModified(size_t geomID)
+    {
+      Ref<Geometry>& g = geometries[geomID];
+      if (!g) return false;
+      return g->getModCounter() > geometryModCounters_[geomID];
+    }
+
+  protected:
+    
+    __forceinline void checkIfModifiedAndSet () 
+    {
+      if (isModified ()) return;
+      
+      auto geometryIsModified = [this](size_t geomID)->bool {
+        return isGeometryModified(geomID);
+      };
+
+      if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) {
+        setModified ();
+      }
+    }
+    
+  public:
+
+    /* get mesh by ID */
+    __forceinline       Geometry* get(size_t i)       { assert(i < geometries.size()); return geometries[i].ptr; }
+    __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; }
+
+    template<typename Mesh>
+      __forceinline       Mesh* get(size_t i)       { 
+      assert(i < geometries.size()); 
+      assert(geometries[i]->getTypeMask() & Mesh::geom_type);
+      return (Mesh*)geometries[i].ptr; 
+    }
+    template<typename Mesh>
+      __forceinline const Mesh* get(size_t i) const { 
+      assert(i < geometries.size()); 
+      assert(geometries[i]->getTypeMask() & Mesh::geom_type);
+      return (Mesh*)geometries[i].ptr; 
+    }
+
+    template<typename Mesh>
+    __forceinline Mesh* getSafe(size_t i) {
+      assert(i < geometries.size());
+      if (geometries[i] == null) return nullptr;
+      if (!(geometries[i]->getTypeMask() & Mesh::geom_type)) return nullptr;
+      else return (Mesh*) geometries[i].ptr;
+    }
+
+    __forceinline Ref<Geometry> get_locked(size_t i)  {
+      Lock<SpinLock> lock(geometriesMutex);
+      assert(i < geometries.size()); 
+      return geometries[i]; 
+    }
+
+    /* flag decoding */
+    __forceinline bool isFastAccel() const { return !isCompactAccel() && !isRobustAccel(); }
+    __forceinline bool isCompactAccel() const { return scene_flags & RTC_SCENE_FLAG_COMPACT; }
+    __forceinline bool isRobustAccel()  const { return scene_flags & RTC_SCENE_FLAG_ROBUST; }
+    __forceinline bool isStaticAccel()  const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); }
+    __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; }
+    
+    __forceinline bool hasContextFilterFunction() const {
+      return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION;
+    }
+    
+    __forceinline bool hasGeometryFilterFunction() {
+      return world.numFilterFunctions != 0;
+    }
+      
+    __forceinline bool hasFilterFunction() {
+      return hasContextFilterFunction() || hasGeometryFilterFunction();
+    }
+    
+    /* test if scene got already build */
+    __forceinline bool isBuild() const { return is_build; }
+
+  public:
+    IDPool<unsigned,0xFFFFFFFE> id_pool;
+    vector<Ref<Geometry>> geometries; //!< list of all user geometries
+    vector<unsigned int> geometryModCounters_;
+    vector<float*> vertices;
+    
+  public:
+    Device* device;
+
+    /* these are to detect if we need to recreate the acceleration structures */
+    bool flags_modified;
+    unsigned int enabled_geometry_types;
+    
+    RTCSceneFlags scene_flags;
+    RTCBuildQuality quality_flags;
+    MutexSys buildMutex;
+    SpinLock geometriesMutex;
+    bool is_build;
+  private:
+    bool modified;                   //!< true if scene got modified
+
+  public:
+    
+    /*! global lock step task scheduler */
+#if defined(TASKING_INTERNAL) 
+    MutexSys schedulerMutex;
+    Ref<TaskScheduler> scheduler;
+#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+    tbb::isolated_task_group group;
+#elif defined(TASKING_TBB)
+    tbb::task_group group;
+#elif defined(TASKING_PPL)
+    concurrency::task_group group;
+#endif
+    
+  public:
+    struct BuildProgressMonitorInterface : public BuildProgressMonitor {
+      BuildProgressMonitorInterface(Scene* scene) 
+      : scene(scene) {}
+      void operator() (size_t dn) const { scene->progressMonitor(double(dn)); }
+    private:
+      Scene* scene;
+    };
+    BuildProgressMonitorInterface progressInterface;
+    RTCProgressMonitorFunction progress_monitor_function;
+    void* progress_monitor_ptr;
+    std::atomic<size_t> progress_monitor_counter;
+    void progressMonitor(double nprims);
+    void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr);
+
+  private:
+    GeometryCounts world;               //!< counts for geometry
+
+  public:
+
+    __forceinline size_t numPrimitives() const {
+      return world.size();
+    }
+
+    __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const
+    {
+      size_t count = 0;
+      
+      if (mask & Geometry::MTY_TRIANGLE_MESH)
+        count += mblur ? world.numMBTriangles : world.numTriangles;
+      
+      if (mask & Geometry::MTY_QUAD_MESH)
+        count += mblur ? world.numMBQuads : world.numQuads;
+      
+      if (mask & Geometry::MTY_CURVE2)
+        count += mblur ? world.numMBLineSegments : world.numLineSegments;
+      
+      if (mask & Geometry::MTY_CURVE4)
+        count += mblur ? world.numMBBezierCurves : world.numBezierCurves;
+      
+      if (mask & Geometry::MTY_POINTS)
+        count += mblur ? world.numMBPoints : world.numPoints;
+      
+      if (mask & Geometry::MTY_SUBDIV_MESH)
+        count += mblur ? world.numMBSubdivPatches : world.numSubdivPatches;
+      
+      if (mask & Geometry::MTY_USER_GEOMETRY)
+        count += mblur ? world.numMBUserGeometries : world.numUserGeometries;
+      
+      if (mask & Geometry::MTY_INSTANCE_CHEAP)
+        count += mblur ? world.numMBInstancesCheap : world.numInstancesCheap;
+      
+      if (mask & Geometry::MTY_INSTANCE_EXPENSIVE)
+        count += mblur  ? world.numMBInstancesExpensive : world.numInstancesExpensive;
+      
+      if (mask & Geometry::MTY_GRID_MESH)
+        count += mblur  ? world.numMBGrids : world.numGrids;
+      
+      return count;
+    }
+    
+    template<typename Mesh, bool mblur>
+    __forceinline unsigned getNumTimeSteps()
+    {
+      if (!mblur)
+        return 1;
+
+      Scene::Iterator<Mesh,mblur> iter(this);
+      return iter.maxTimeStepsPerGeometry();
+    }
+
+    template<typename Mesh, bool mblur>
+    __forceinline unsigned int getMaxGeomID()
+    {
+      Scene::Iterator<Mesh,mblur> iter(this);
+      return iter.maxGeomID();
+    }
+  };
+}
diff --git a/thirdparty/embree/kernels/common/scene_curves.h b/thirdparty/embree/kernels/common/scene_curves.h
new file mode 100644
index 0000000000..a5a39e42d4
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_curves.h
@@ -0,0 +1,688 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+#include "../subdiv/bezier_curve.h"
+#include "../subdiv/hermite_curve.h"
+#include "../subdiv/bspline_curve.h"
+#include "../subdiv/catmullrom_curve.h"
+#include "../subdiv/linear_bezier_patch.h"
+
+namespace embree
+{
+  /*! represents an array of bicubic bezier curves */
+  struct CurveGeometry : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4;
+
+  public:
+    
+    /*! bezier curve construction */
+    CurveGeometry (Device* device, Geometry::GType gtype);
+    
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void setTessellationRate(float N);
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+    
+    /*! returns the number of vertices */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns the i'th curve */
+    __forceinline const unsigned int& curve(size_t i) const {
+      return curves[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns i'th normal of the first time step */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th tangent of the first time step */
+    __forceinline Vec3ff tangent(size_t i) const {
+      return tangents0[i];
+    }
+
+    /*! returns i'th normal derivative of the first time step */
+    __forceinline Vec3fa dnormal(size_t i) const {
+      return dnormals0[i];
+    }
+
+    /*! returns i'th radius of the first time step */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns i'th normal of itime'th timestep */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th tangent of itime'th timestep */
+    __forceinline Vec3ff tangent(size_t i, size_t itime) const {
+      return tangents[itime][i];
+    }
+
+    /*! returns i'th normal derivative of itime'th timestep */
+    __forceinline Vec3fa dnormal(size_t i, size_t itime) const {
+      return dnormals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! gathers the curve starting with i'th vertex */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const
+    {
+      p0 = vertex(i+0);
+      p1 = vertex(i+1);
+      p2 = vertex(i+2);
+      p3 = vertex(i+3);
+    }
+
+    /*! gathers the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const
+    {
+      p0 = vertex(i+0,itime);
+      p1 = vertex(i+1,itime);
+      p2 = vertex(i+2,itime);
+      p3 = vertex(i+3,itime);
+    }
+
+    /*! gathers the curve starting with i'th vertex */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const
+    {
+      p0 = vertex(i+0);
+      p1 = vertex(i+1);
+      p2 = vertex(i+2);
+      p3 = vertex(i+3);
+      n0 = normal(i+0);
+      n1 = normal(i+1);
+      n2 = normal(i+2);
+      n3 = normal(i+3);
+    }
+
+    /*! gathers the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const
+    {
+      p0 = vertex(i+0,itime);
+      p1 = vertex(i+1,itime);
+      p2 = vertex(i+2,itime);
+      p3 = vertex(i+3,itime);
+      n0 = normal(i+0,itime);
+      n1 = normal(i+1,itime);
+      n2 = normal(i+2,itime);
+      n3 = normal(i+3,itime);
+    }
+
+    /*! prefetches the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void prefetchL1_vertices(size_t i) const
+    {
+      prefetchL1(vertices0.getPtr(i)+0);
+      prefetchL1(vertices0.getPtr(i)+64);
+    }
+
+    /*! prefetches the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void prefetchL2_vertices(size_t i) const
+    {
+      prefetchL2(vertices0.getPtr(i)+0);
+      prefetchL2(vertices0.getPtr(i)+64);
+    }  
+
+    /*! loads curve vertices for specified time */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+
+      const float t0 = 1.0f - ftime;
+      const float t1 = ftime;
+      Vec3ff a0,a1,a2,a3;
+      gather(a0,a1,a2,a3,i,itime);
+      Vec3ff b0,b1,b2,b3;
+      gather(b0,b1,b2,b3,i,itime+1);
+      p0 = madd(Vec3ff(t0),a0,t1*b0);
+      p1 = madd(Vec3ff(t0),a1,t1*b1);
+      p2 = madd(Vec3ff(t0),a2,t1*b2);
+      p3 = madd(Vec3ff(t0),a3,t1*b3);
+    }
+
+    /*! loads curve vertices for specified time */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+
+      const float t0 = 1.0f - ftime;
+      const float t1 = ftime;
+      Vec3ff a0,a1,a2,a3; Vec3fa an0,an1,an2,an3;
+      gather(a0,a1,a2,a3,an0,an1,an2,an3,i,itime);
+      Vec3ff b0,b1,b2,b3; Vec3fa bn0,bn1,bn2,bn3;
+      gather(b0,b1,b2,b3,bn0,bn1,bn2,bn3,i,itime+1);
+      p0 = madd(Vec3ff(t0),a0,t1*b0);
+      p1 = madd(Vec3ff(t0),a1,t1*b1);
+      p2 = madd(Vec3ff(t0),a2,t1*b2);
+      p3 = madd(Vec3ff(t0),a3,t1*b3);
+      n0 = madd(Vec3ff(t0),an0,t1*bn0);
+      n1 = madd(Vec3ff(t0),an1,t1*bn1);
+      n2 = madd(Vec3ff(t0),an2,t1*bn2);
+      n3 = madd(Vec3ff(t0),an3,t1*bn3);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+    {
+      Vec3ff v0,v1,v2,v3; Vec3fa n0,n1,n2,n3;
+      unsigned int vertexID = curve(primID);
+      gather(v0,v1,v2,v3,n0,n1,n2,n3,vertexID,itime);
+      SourceCurve3ff ccurve(v0,v1,v2,v3);
+      SourceCurve3fa ncurve(n0,n1,n2,n3);
+      ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve);
+      return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+      const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+0);
+      const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+1);
+      return clerp(curve0,curve1,ftime);
+    }
+
+    /*! gathers the hermite curve starting with i'th vertex */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const
+    {
+      p0 = vertex (i+0);
+      p1 = vertex (i+1);
+      t0 = tangent(i+0);
+      t1 = tangent(i+1);
+    }
+
+    /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const
+    {
+      p0 = vertex (i+0,itime);
+      p1 = vertex (i+1,itime);
+      t0 = tangent(i+0,itime);
+      t1 = tangent(i+1,itime);
+    }
+
+    /*! loads curve vertices for specified time */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+      const float f0 = 1.0f - ftime, f1 = ftime;
+      Vec3ff ap0,at0,ap1,at1;
+      gather_hermite(ap0,at0,ap1,at1,i,itime);
+      Vec3ff bp0,bt0,bp1,bt1;
+      gather_hermite(bp0,bt0,bp1,bt1,i,itime+1);
+      p0 = madd(Vec3ff(f0),ap0,f1*bp0);
+      t0 = madd(Vec3ff(f0),at0,f1*bt0);
+      p1 = madd(Vec3ff(f0),ap1,f1*bp1);
+      t1 = madd(Vec3ff(f0),at1,f1*bt1);
+    }
+
+    /*! gathers the hermite curve starting with i'th vertex */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const
+    {
+      p0 = vertex (i+0);
+      p1 = vertex (i+1);
+      t0 = tangent(i+0);
+      t1 = tangent(i+1);
+      n0 = normal(i+0);
+      n1 = normal(i+1);
+      dn0 = dnormal(i+0);
+      dn1 = dnormal(i+1);
+    }
+
+    /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const
+    {
+      p0 = vertex (i+0,itime);
+      p1 = vertex (i+1,itime);
+      t0 = tangent(i+0,itime);
+      t1 = tangent(i+1,itime);
+      n0 = normal(i+0,itime);
+      n1 = normal(i+1,itime);
+      dn0 = dnormal(i+0,itime);
+      dn1 = dnormal(i+1,itime);
+    }
+
+    /*! loads curve vertices for specified time */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+      const float f0 = 1.0f - ftime, f1 = ftime;
+      Vec3ff ap0,at0,ap1,at1; Vec3fa an0,adn0,an1,adn1;
+      gather_hermite(ap0,at0,an0,adn0,ap1,at1,an1,adn1,i,itime);
+      Vec3ff bp0,bt0,bp1,bt1; Vec3fa bn0,bdn0,bn1,bdn1;
+      gather_hermite(bp0,bt0,bn0,bdn0,bp1,bt1,bn1,bdn1,i,itime+1);
+      p0 = madd(Vec3ff(f0),ap0,f1*bp0);
+      t0 = madd(Vec3ff(f0),at0,f1*bt0);
+      n0 = madd(Vec3ff(f0),an0,f1*bn0);
+      dn0= madd(Vec3ff(f0),adn0,f1*bdn0);
+      p1 = madd(Vec3ff(f0),ap1,f1*bp1);
+      t1 = madd(Vec3ff(f0),at1,f1*bt1);
+      n1 = madd(Vec3ff(f0),an1,f1*bn1);
+      dn1= madd(Vec3ff(f0),adn1,f1*bdn1);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+      __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+    {
+      Vec3ff v0,t0,v1,t1; Vec3fa n0,dn0,n1,dn1;
+      unsigned int vertexID = curve(primID);
+      gather_hermite(v0,t0,n0,dn0,v1,t1,n1,dn1,vertexID,itime);
+
+      SourceCurve3ff ccurve(v0,t0,v1,t1);
+      SourceCurve3fa ncurve(n0,dn0,n1,dn1);
+      ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve);
+      return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+      const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+0);
+      const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+1);
+      return clerp(curve0,curve1,ftime);
+    }
+
+  private:
+    void resizeBuffers(unsigned int numSteps);
+
+  public:
+    BufferView<unsigned int> curves;        //!< array of curve indices
+    BufferView<Vec3ff> vertices0;           //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
+    BufferView<Vec3ff> tangents0;           //!< fast access to first tangent buffer
+    BufferView<Vec3fa> dnormals0;           //!< fast access to first normal derivative buffer
+    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
+    vector<BufferView<Vec3ff>> tangents;    //!< tangent array for each timestep
+    vector<BufferView<Vec3fa>> dnormals;    //!< normal derivative array for each timestep
+    BufferView<char> flags;                 //!< start, end flag per segment
+    vector<BufferView<char>> vertexAttribs; //!< user buffers
+    int tessellationRate;                   //!< tessellation rate for flat curve
+    float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii
+  };
+
+  namespace isa
+  {
+    
+  template<template<typename Ty> class Curve>
+  struct CurveGeometryInterface : public CurveGeometry
+  {
+    typedef Curve<Vec3ff> Curve3ff;
+    typedef Curve<Vec3fa> Curve3fa;
+    
+    CurveGeometryInterface (Device* device, Geometry::GType gtype)
+      : CurveGeometry(device,gtype) {}
+    
+    __forceinline const Curve3ff getCurveScaledRadius(size_t i, size_t itime = 0) const 
+    {
+      const unsigned int index = curve(i);
+      Vec3ff v0 = vertex(index+0,itime);
+      Vec3ff v1 = vertex(index+1,itime);
+      Vec3ff v2 = vertex(index+2,itime);
+      Vec3ff v3 = vertex(index+3,itime);
+      v0.w *= maxRadiusScale;
+      v1.w *= maxRadiusScale;
+      v2.w *= maxRadiusScale;
+      v3.w *= maxRadiusScale;
+      return Curve3ff (v0,v1,v2,v3);
+    }
+    
+    __forceinline const Curve3ff getCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const 
+    {
+      const unsigned int index = curve(i);
+      const Vec3ff v0 = vertex(index+0,itime);
+      const Vec3ff v1 = vertex(index+1,itime);
+      const Vec3ff v2 = vertex(index+2,itime);
+      const Vec3ff v3 = vertex(index+3,itime);
+      const Vec3ff w0(xfmPoint(space,(Vec3fa)v0), maxRadiusScale*v0.w);
+      const Vec3ff w1(xfmPoint(space,(Vec3fa)v1), maxRadiusScale*v1.w);
+      const Vec3ff w2(xfmPoint(space,(Vec3fa)v2), maxRadiusScale*v2.w);
+      const Vec3ff w3(xfmPoint(space,(Vec3fa)v3), maxRadiusScale*v3.w);
+      return Curve3ff(w0,w1,w2,w3);
+    }
+    
+    __forceinline const Curve3ff getCurveScaledRadius(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const 
+    {
+      const float r_scale = r_scale0*scale;
+      const unsigned int index = curve(i);
+      const Vec3ff v0 = vertex(index+0,itime);
+      const Vec3ff v1 = vertex(index+1,itime);
+      const Vec3ff v2 = vertex(index+2,itime);
+      const Vec3ff v3 = vertex(index+3,itime);
+      const Vec3ff w0(xfmPoint(space,((Vec3fa)v0-ofs)*Vec3fa(scale)), maxRadiusScale*v0.w*r_scale);
+      const Vec3ff w1(xfmPoint(space,((Vec3fa)v1-ofs)*Vec3fa(scale)), maxRadiusScale*v1.w*r_scale);
+      const Vec3ff w2(xfmPoint(space,((Vec3fa)v2-ofs)*Vec3fa(scale)), maxRadiusScale*v2.w*r_scale);
+      const Vec3ff w3(xfmPoint(space,((Vec3fa)v3-ofs)*Vec3fa(scale)), maxRadiusScale*v3.w*r_scale);
+      return Curve3ff(w0,w1,w2,w3);
+    }
+    
+    __forceinline const Curve3fa getNormalCurve(size_t i, size_t itime = 0) const 
+    {
+      const unsigned int index = curve(i);
+      const Vec3fa n0 = normal(index+0,itime);
+      const Vec3fa n1 = normal(index+1,itime);
+      const Vec3fa n2 = normal(index+2,itime);
+      const Vec3fa n3 = normal(index+3,itime);
+      return Curve3fa (n0,n1,n2,n3);
+    }
+    
+    __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(size_t i, size_t itime = 0) const 
+    {
+      const Curve3ff center = getCurveScaledRadius(i,itime);
+      const Curve3fa normal = getNormalCurve(i,itime);
+      const TensorLinearCubicBezierSurface3fa ocurve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(center,normal);
+      return ocurve;
+    }
+    
+    __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+      return getOrientedCurveScaledRadius(i,itime).xfm(space);
+    }
+    
+    __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const Vec3fa& ofs, const float scale, const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+      return getOrientedCurveScaledRadius(i,itime).xfm(space,ofs,scale);
+    }
+    
+    /*! check if the i'th primitive is valid at the itime'th time step */
+    __forceinline bool valid(Geometry::GType ctype, size_t i, const range<size_t>& itime_range) const
+    {
+      const unsigned int index = curve(i);
+      if (index+3 >= numVertices()) return false;
+      
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        const float r0 = radius(index+0,itime);
+        const float r1 = radius(index+1,itime);
+        const float r2 = radius(index+2,itime);
+        const float r3 = radius(index+3,itime);
+        if (!isvalid(r0) || !isvalid(r1) || !isvalid(r2) || !isvalid(r3))
+          return false;
+        
+        const Vec3fa v0 = vertex(index+0,itime);
+        const Vec3fa v1 = vertex(index+1,itime);
+        const Vec3fa v2 = vertex(index+2,itime);
+        const Vec3fa v3 = vertex(index+3,itime);
+        if (!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3))
+          return false;
+        
+        if (ctype == Geometry::GTY_SUBTYPE_ORIENTED_CURVE)
+        {
+          const Vec3fa n0 = normal(index+0,itime);
+          const Vec3fa n1 = normal(index+1,itime);
+          if (!isvalid(n0) || !isvalid(n1))
+            return false;
+        }
+      }
+      
+      return true;
+    }
+
+    template<int N>
+    void interpolate_impl(const RTCInterpolateArguments* const args)
+    {
+      unsigned int primID = args->primID;
+      float u = args->u;
+      RTCBufferType bufferType = args->bufferType;
+      unsigned int bufferSlot = args->bufferSlot;
+      float* P = args->P;
+      float* dPdu = args->dPdu;
+      float* ddPdudu = args->ddPdudu;
+      unsigned int valueCount = args->valueCount;
+      
+      /* calculate base pointer and stride */
+      assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
+             (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size()));
+      const char* src = nullptr; 
+      size_t stride = 0;
+      if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
+        src    = vertexAttribs[bufferSlot].getPtr();
+        stride = vertexAttribs[bufferSlot].getStride();
+      } else {
+        src    = vertices[bufferSlot].getPtr();
+        stride = vertices[bufferSlot].getStride();
+      }
+      
+      for (unsigned int i=0; i<valueCount; i+=N)
+      {
+        size_t ofs = i*sizeof(float);
+        const size_t index = curves[primID];
+        const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>((int)valueCount);
+        const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+0)*stride+ofs]);
+        const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+1)*stride+ofs]);
+        const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+2)*stride+ofs]);
+        const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+3)*stride+ofs]);
+        
+        const Curve<vfloat<N>> curve(p0,p1,p2,p3);
+        if (P      ) mem<vfloat<N>>::storeu(valid,P+i,      curve.eval(u));
+        if (dPdu   ) mem<vfloat<N>>::storeu(valid,dPdu+i,   curve.eval_du(u));
+        if (ddPdudu) mem<vfloat<N>>::storeu(valid,ddPdudu+i,curve.eval_dudu(u));
+      }
+    }
+
+    void interpolate(const RTCInterpolateArguments* const args) {
+      interpolate_impl<4>(args);
+    }
+  };
+  
+  template<template<typename Ty> class Curve>
+  struct HermiteCurveGeometryInterface : public CurveGeometry
+  {
+    typedef Curve<Vec3ff> HermiteCurve3ff;
+    typedef Curve<Vec3fa> HermiteCurve3fa;
+    
+    HermiteCurveGeometryInterface (Device* device, Geometry::GType gtype)
+      : CurveGeometry(device,gtype) {}
+    
+    __forceinline const HermiteCurve3ff getCurveScaledRadius(size_t i, size_t itime = 0) const 
+    {
+      const unsigned int index = curve(i);
+      Vec3ff v0 = vertex(index+0,itime);
+      Vec3ff v1 = vertex(index+1,itime);
+      Vec3ff t0 = tangent(index+0,itime);
+      Vec3ff t1 = tangent(index+1,itime);
+      v0.w *= maxRadiusScale;
+      v1.w *= maxRadiusScale;
+      t0.w *= maxRadiusScale;
+      t1.w *= maxRadiusScale;
+      return HermiteCurve3ff (v0,t0,v1,t1);
+    }
+    
+    __forceinline const HermiteCurve3ff getCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const 
+    {
+      const unsigned int index = curve(i);
+      const Vec3ff v0 = vertex(index+0,itime);
+      const Vec3ff v1 = vertex(index+1,itime);
+      const Vec3ff t0 = tangent(index+0,itime);
+      const Vec3ff t1 = tangent(index+1,itime);
+      const Vec3ff V0(xfmPoint(space,(Vec3fa)v0),maxRadiusScale*v0.w);
+      const Vec3ff V1(xfmPoint(space,(Vec3fa)v1),maxRadiusScale*v1.w);
+      const Vec3ff T0(xfmVector(space,(Vec3fa)t0),maxRadiusScale*t0.w);
+      const Vec3ff T1(xfmVector(space,(Vec3fa)t1),maxRadiusScale*t1.w);
+      return HermiteCurve3ff(V0,T0,V1,T1);
+    }
+    
+    __forceinline const HermiteCurve3ff getCurveScaledRadius(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const 
+    {
+      const float r_scale = r_scale0*scale;
+      const unsigned int index = curve(i);
+      const Vec3ff v0 = vertex(index+0,itime);
+      const Vec3ff v1 = vertex(index+1,itime);
+      const Vec3ff t0 = tangent(index+0,itime);
+      const Vec3ff t1 = tangent(index+1,itime);
+      const Vec3ff V0(xfmPoint(space,(v0-ofs)*Vec3fa(scale)), maxRadiusScale*v0.w*r_scale);
+      const Vec3ff V1(xfmPoint(space,(v1-ofs)*Vec3fa(scale)), maxRadiusScale*v1.w*r_scale);
+      const Vec3ff T0(xfmVector(space,t0*Vec3fa(scale)), maxRadiusScale*t0.w*r_scale);
+      const Vec3ff T1(xfmVector(space,t1*Vec3fa(scale)), maxRadiusScale*t1.w*r_scale);
+      return HermiteCurve3ff(V0,T0,V1,T1);
+    }
+    
+    __forceinline const HermiteCurve3fa getNormalCurve(size_t i, size_t itime = 0) const 
+    {
+      const unsigned int index = curve(i);
+      const Vec3fa n0 = normal(index+0,itime);
+      const Vec3fa n1 = normal(index+1,itime);
+      const Vec3fa dn0 = dnormal(index+0,itime);
+      const Vec3fa dn1 = dnormal(index+1,itime);
+      return HermiteCurve3fa (n0,dn0,n1,dn1);
+    }
+    
+    __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(size_t i, size_t itime = 0) const 
+    {
+      const HermiteCurve3ff center = getCurveScaledRadius(i,itime);
+      const HermiteCurve3fa normal = getNormalCurve(i,itime);
+      const TensorLinearCubicBezierSurface3fa ocurve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(center,normal);
+      return ocurve;
+    }
+    
+    __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+      return getOrientedCurveScaledRadius(i,itime).xfm(space);
+    }
+    
+    __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const Vec3fa& ofs, const float scale, const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+      return getOrientedCurveScaledRadius(i,itime).xfm(space,ofs,scale);
+    }
+    
+    /*! check if the i'th primitive is valid at the itime'th time step */
+    __forceinline bool valid(Geometry::GType ctype, size_t i, const range<size_t>& itime_range) const
+    {
+      const unsigned int index = curve(i);
+      if (index+1 >= numVertices()) return false;
+      
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        const Vec3ff v0 = vertex(index+0,itime);
+        const Vec3ff v1 = vertex(index+1,itime);
+        if (!isvalid4(v0) || !isvalid4(v1))
+          return false;
+        
+        const Vec3ff t0 = tangent(index+0,itime);
+        const Vec3ff t1 = tangent(index+1,itime);
+        if (!isvalid4(t0) || !isvalid4(t1))
+          return false;
+        
+        if (ctype == Geometry::GTY_SUBTYPE_ORIENTED_CURVE)
+        {
+          const Vec3fa n0 = normal(index+0,itime);
+          const Vec3fa n1 = normal(index+1,itime);
+          if (!isvalid(n0) || !isvalid(n1))
+            return false;
+          
+          const Vec3fa dn0 = dnormal(index+0,itime);
+          const Vec3fa dn1 = dnormal(index+1,itime);
+          if (!isvalid(dn0) || !isvalid(dn1))
+            return false;
+        }
+      }
+      
+      return true;
+    }
+
+    template<int N>
+    void interpolate_impl(const RTCInterpolateArguments* const args)
+    {
+      unsigned int primID = args->primID;
+      float u = args->u;
+      RTCBufferType bufferType = args->bufferType;
+      unsigned int bufferSlot = args->bufferSlot;
+      float* P = args->P;
+      float* dPdu = args->dPdu;
+      float* ddPdudu = args->ddPdudu;
+      unsigned int valueCount = args->valueCount;
+      
+      /* we interpolate vertex attributes linearly for hermite basis */
+      if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+      {
+        assert(bufferSlot <= vertexAttribs.size());
+        const char* vsrc = vertexAttribs[bufferSlot].getPtr();
+        const size_t vstride = vertexAttribs[bufferSlot].getStride();
+        
+        for (unsigned int i=0; i<valueCount; i+=N)
+        {
+          const size_t ofs = i*sizeof(float);
+          const size_t index = curves[primID];
+          const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>((int)valueCount);
+          const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+0)*vstride+ofs]);
+          const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+1)*vstride+ofs]);
+          
+          if (P      ) mem<vfloat<N>>::storeu(valid,P+i,      madd(1.0f-u,p0,u*p1));
+          if (dPdu   ) mem<vfloat<N>>::storeu(valid,dPdu+i,   p1-p0);
+          if (ddPdudu) mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero));
+        }
+      }
+      
+      /* interpolation for vertex buffers */
+      else
+      {
+        assert(bufferSlot < numTimeSteps);
+        const char* vsrc = vertices[bufferSlot].getPtr();
+        const char* tsrc = tangents[bufferSlot].getPtr();
+        const size_t vstride = vertices[bufferSlot].getStride();
+        const size_t tstride = vertices[bufferSlot].getStride();
+        
+        for (unsigned int i=0; i<valueCount; i+=N)
+        {
+          const size_t ofs = i*sizeof(float);
+          const size_t index = curves[primID];
+          const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>((int)valueCount);
+          const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+0)*vstride+ofs]);
+          const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+1)*vstride+ofs]);
+          const vfloat<N> t0 = mem<vfloat<N>>::loadu(valid,(float*)&tsrc[(index+0)*tstride+ofs]);
+          const vfloat<N> t1 = mem<vfloat<N>>::loadu(valid,(float*)&tsrc[(index+1)*tstride+ofs]);
+          
+          const HermiteCurveT<vfloat<N>> curve(p0,t0,p1,t1);
+          if (P      ) mem<vfloat<N>>::storeu(valid,P+i,      curve.eval(u));
+          if (dPdu   ) mem<vfloat<N>>::storeu(valid,dPdu+i,   curve.eval_du(u));
+          if (ddPdudu) mem<vfloat<N>>::storeu(valid,ddPdudu+i,curve.eval_dudu(u));
+        }
+      }
+    }
+
+    void interpolate(const RTCInterpolateArguments* const args) {
+      interpolate_impl<4>(args);
+    }
+  };
+  }
+  
+  DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree/kernels/common/scene_grid_mesh.h b/thirdparty/embree/kernels/common/scene_grid_mesh.h
new file mode 100644
index 0000000000..fb6fed445b
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_grid_mesh.h
@@ -0,0 +1,294 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Grid Mesh */
+  struct GridMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH;
+
+    /*! grid */
+    struct Grid 
+    {
+      unsigned int startVtxID;
+      unsigned int lineVtxOffset;
+      unsigned short resX,resY;
+
+      /* border flags due to 3x3 vertex pattern */
+      __forceinline unsigned int get3x3FlagsX(const unsigned int x) const
+      {
+        return (x + 2 >= (unsigned int)resX) ? (1<<15) : 0;
+      }
+
+      /* border flags due to 3x3 vertex pattern */
+      __forceinline unsigned int get3x3FlagsY(const unsigned int y) const
+      {
+        return (y + 2 >= (unsigned int)resY) ? (1<<15) : 0;
+      }
+
+      /*! outputs grid structure */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) {
+        return cout << "Grid { startVtxID " << t.startVtxID << ", lineVtxOffset " << t.lineVtxOffset << ", resX " << t.resX << ", resY " << t.resY << " }";
+      }
+    };
+
+  public:
+
+    /*! grid mesh construction */
+    GridMesh (Device* device); 
+
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+
+    template<int N>
+    void interpolate_impl(const RTCInterpolateArguments* const args)
+    {
+      unsigned int primID = args->primID;
+      float U = args->u;
+      float V = args->v;
+      
+      /* clamp input u,v to [0;1] range */
+      U = max(min(U,1.0f),0.0f);
+      V = max(min(V,1.0f),0.0f);
+      
+      RTCBufferType bufferType = args->bufferType;
+      unsigned int bufferSlot = args->bufferSlot;
+      float* P = args->P;
+      float* dPdu = args->dPdu;
+      float* dPdv = args->dPdv;
+      float* ddPdudu = args->ddPdudu;
+      float* ddPdvdv = args->ddPdvdv;
+      float* ddPdudv = args->ddPdudv;
+      unsigned int valueCount = args->valueCount;
+      
+      /* calculate base pointer and stride */
+      assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
+             (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size()));
+      const char* src = nullptr; 
+      size_t stride = 0;
+      if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
+        src    = vertexAttribs[bufferSlot].getPtr();
+        stride = vertexAttribs[bufferSlot].getStride();
+      } else {
+        src    = vertices[bufferSlot].getPtr();
+        stride = vertices[bufferSlot].getStride();
+      }
+      
+      const Grid& grid = grids[primID];
+      const int grid_width  = grid.resX-1;
+      const int grid_height = grid.resY-1;
+      const float rcp_grid_width = rcp(float(grid_width));
+      const float rcp_grid_height = rcp(float(grid_height));
+      const int iu = min((int)floor(U*grid_width ),grid_width);
+      const int iv = min((int)floor(V*grid_height),grid_height);
+      const float u = U*grid_width-float(iu);
+      const float v = V*grid_height-float(iv);
+      
+      for (unsigned int i=0; i<valueCount; i+=N)
+      {
+        const size_t ofs = i*sizeof(float);
+        const unsigned int idx0 = grid.startVtxID + (iv+0)*grid.lineVtxOffset + iu;
+        const unsigned int idx1 = grid.startVtxID + (iv+1)*grid.lineVtxOffset + iu;
+        
+        const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount));
+        const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx0+0)*stride+ofs]);
+        const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx0+1)*stride+ofs]);
+        const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx1+1)*stride+ofs]);
+        const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx1+0)*stride+ofs]);
+        const vbool<N> left = u+v <= 1.0f;
+        const vfloat<N> Q0 = select(left,p0,p2);
+        const vfloat<N> Q1 = select(left,p1,p3);
+        const vfloat<N> Q2 = select(left,p3,p1);
+        const vfloat<N> U  = select(left,u,vfloat<N>(1.0f)-u);
+        const vfloat<N> V  = select(left,v,vfloat<N>(1.0f)-v);
+        const vfloat<N> W  = 1.0f-U-V;
+        
+        if (P) {
+          mem<vfloat<N>>::storeu(valid,P+i,madd(W,Q0,madd(U,Q1,V*Q2)));
+        }
+        if (dPdu) { 
+          assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,select(left,Q1-Q0,Q0-Q1)*rcp_grid_width);
+          assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,select(left,Q2-Q0,Q0-Q2)*rcp_grid_height);
+        }
+        if (ddPdudu) { 
+          assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero));
+          assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero));
+          assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero));
+        }
+      }
+    }
+    
+    void addElementsToCount (GeometryCounts & counts) const;
+    
+    __forceinline unsigned int getNumSubGrids(const size_t gridID)
+    {
+      const Grid &g = grid(gridID);
+      return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1));
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+  public:
+
+    /*! returns number of vertices */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns i'th grid*/
+    __forceinline const Grid& grid(size_t i) const {
+      return grids[i];
+    }
+
+    /*! returns i'th vertex of the first time step  */
+    __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does a unaligned load
+      return vertices0[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns i'th vertex of the first timestep */
+    __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const {
+      assert(x < (size_t)g.resX);
+      assert(y < (size_t)g.resY);
+      return g.startVtxID + x + y * g.lineVtxOffset;
+    }
+    
+    /*! returns i'th vertex of the first timestep */
+    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const {
+      const size_t index = grid_vertex_index(g,x,y);
+      return vertex(index);
+    }
+
+    /*! returns i'th vertex of the itime'th timestep */
+    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const {
+      const size_t index = grid_vertex_index(g,x,y);
+      return vertex(index,itime);
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid */
+    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const
+    {
+      BBox3fa b(empty);
+      for (size_t t=0; t<numTimeSteps; t++)
+      {
+        for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++)
+          for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++)
+          {
+            const Vec3fa v = grid_vertex(g,x,y,t);
+            if (unlikely(!isvalid(v))) return false;
+            b.extend(v);
+          }
+      }
+
+      bbox = b;
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, size_t itime, BBox3fa& bbox) const
+    {
+      assert(itime < numTimeSteps);
+      BBox3fa b0(empty);
+      for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++)
+        for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++)
+        {
+          const Vec3fa v = grid_vertex(g,x,y,itime);
+          if (unlikely(!isvalid(v))) return false;
+          b0.extend(v);
+        }
+
+      /* use bounds of first time step in builder */
+      bbox = b0;
+      return true;
+    }
+
+    __forceinline bool valid(size_t gridID, size_t itime=0) const {
+      return valid(gridID, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range */
+    __forceinline bool valid(size_t gridID, const range<size_t>& itime_range) const
+    {
+      if (unlikely(gridID >= grids.size())) return false;
+      const Grid &g = grid(gridID);
+      if (unlikely(g.startVtxID + 0                                     >= vertices0.size())) return false;
+      if (unlikely(g.startVtxID + (g.resY-1)*g.lineVtxOffset + g.resX-1 >= vertices0.size())) return false;
+
+      for (size_t y=0;y<g.resY;y++)
+        for (size_t x=0;x<g.resX;x++)
+          for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+            if (!isvalid(grid_vertex(g,x,y,itime))) return false;
+      return true;
+    }
+
+
+    __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const
+    {
+      BBox3fa box(empty);
+      buildBounds(g,sx,sy,itime,box);
+      return box;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, size_t itime) const {
+      BBox3fa bounds0, bounds1;
+      buildBounds(g,sx,sy,itime+0,bounds0);
+      buildBounds(g,sx,sy,itime+1,bounds1);
+      return LBBox3fa(bounds0,bounds1);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(g,sx,sy,itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+  public:
+    BufferView<Grid> grids;      //!< array of triangles
+    BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+  };
+
+  namespace isa
+  {
+    struct GridMeshISA : public GridMesh
+    {
+      GridMeshISA (Device* device)
+        : GridMesh(device) {}
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*);
+}
diff --git a/thirdparty/embree/kernels/common/scene_instance.h b/thirdparty/embree/kernels/common/scene_instance.h
new file mode 100644
index 0000000000..773f2b6fec
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_instance.h
@@ -0,0 +1,272 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "accel.h"
+
+namespace embree
+{
+  struct MotionDerivativeCoefficients;
+
+  /*! Instanced acceleration structure */
+  struct Instance : public Geometry
+  {
+    ALIGNED_STRUCT_(16);
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE;
+
+  public:
+    Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1);
+    ~Instance();
+
+  private:
+    Instance (const Instance& other) DELETED; // do not implement
+    Instance& operator= (const Instance& other) DELETED; // do not implement
+
+  private:
+    LBBox3fa nonlinearBounds(const BBox1f& time_range_in,
+                             const BBox1f& geom_time_range,
+                             float geom_time_segments) const;
+
+    BBox3fa boundSegment(size_t itime,
+      BBox3fa const& obbox0, BBox3fa const& obbox1,
+      BBox3fa const& bbox0, BBox3fa const& bbox1,
+      float t_min, float t_max) const;
+
+    /* calculates the (correct) interpolated bounds */
+    __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return xfmBounds(slerp(local2world[itime0], local2world[itime1], f),
+                         lerp(getObjectBounds(itime0), getObjectBounds(itime1), f));
+      return xfmBounds(lerp(local2world[itime0], local2world[itime1], f),
+                        lerp(getObjectBounds(itime0), getObjectBounds(itime1), f));
+    }
+
+  public:
+    virtual void setNumTimeSteps (unsigned int numTimeSteps) override;
+    virtual void setInstancedScene(const Ref<Scene>& scene) override;
+    virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override;
+    virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override;
+    virtual AffineSpace3fa getTransform(float time) override;
+    virtual void setMask (unsigned mask) override;
+    virtual void build() {}
+    virtual void addElementsToCount (GeometryCounts & counts) const override;
+    virtual void commit() override;
+
+  public:
+
+     /*! calculates the bounds of instance */
+    __forceinline BBox3fa bounds(size_t i) const {
+      assert(i == 0);
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds());
+      return xfmBounds(local2world[0],object->bounds.bounds());
+    }
+
+    /*! gets the bounds of the instanced scene */
+    __forceinline BBox3fa getObjectBounds(size_t itime) const {
+      return object->getBounds(timeStep(itime));
+    }
+
+     /*! calculates the bounds of instance */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const {
+      assert(i == 0);
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),getObjectBounds(itime));
+      return xfmBounds(local2world[itime],getObjectBounds(itime));
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const {
+      assert(i == 0);
+      LBBox3fa lbbox = nonlinearBounds(dt, time_range, fnumTimeSegments);
+      return lbbox;
+    }
+
+    /*! calculates the build bounds of the i'th item, if it's valid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      assert(i==0);
+      const BBox3fa b = bounds(i);
+      if (bbox) *bbox = b;
+      return isvalid(b);
+    }
+
+     /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      assert(i==0);
+      const LBBox3fa bounds = linearBounds(i,itime);
+      bbox = bounds.bounds ();
+      return isvalid(bounds);
+    }
+
+    /* gets version info of topology */
+    unsigned int getTopologyVersion() const {
+      return numPrimitives;
+    }
+  
+    /* returns true if topology changed */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return numPrimitives != otherVersion;
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      assert(i == 0);
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+        if (!isvalid(bounds(i,itime))) return false;
+
+      return true;
+    }
+
+    __forceinline AffineSpace3fa getLocal2World() const
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return quaternionDecompositionToAffineSpace(local2world[0]);
+      return local2world[0];
+    }
+
+    __forceinline AffineSpace3fa getLocal2World(float t) const
+    {
+      float ftime; const unsigned int itime = timeSegment(t, ftime);
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return slerp(local2world[itime+0],local2world[itime+1],ftime);
+      return lerp(local2world[itime+0],local2world[itime+1],ftime);
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local() const {
+      return world2local0;
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local(float t) const {
+      return rcp(getLocal2World(t));
+    }
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2Local(const vbool<K>& valid, const vfloat<K>& t) const
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return getWorld2LocalSlerp<K>(valid, t);
+      return getWorld2LocalLerp<K>(valid, t);
+    }
+
+    private:
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2LocalSlerp(const vbool<K>& valid, const vfloat<K>& t) const
+    {
+      vfloat<K> ftime;
+      const vint<K> itime_k = timeSegment<K>(t, ftime);
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid));
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) {
+        return rcp(slerp(AffineSpace3vff<K>(local2world[itime+0]),
+                         AffineSpace3vff<K>(local2world[itime+1]),
+                         ftime));
+      }
+      else {
+        AffineSpace3vff<K> space0,space1;
+        vbool<K> valid1 = valid;
+        while (any(valid1)) {
+          vbool<K> valid2;
+          const int itime = next_unique(valid1, itime_k, valid2);
+          space0 = select(valid2, AffineSpace3vff<K>(local2world[itime+0]), space0);
+          space1 = select(valid2, AffineSpace3vff<K>(local2world[itime+1]), space1);
+        }
+        return rcp(slerp(space0, space1, ftime));
+      }
+    }
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2LocalLerp(const vbool<K>& valid, const vfloat<K>& t) const
+    {
+      vfloat<K> ftime;
+      const vint<K> itime_k = timeSegment<K>(t, ftime);
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid));
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) {
+        return rcp(lerp(AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]),
+                        AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]),
+                        ftime));
+      } else {
+        AffineSpace3vf<K> space0,space1;
+        vbool<K> valid1 = valid;
+        while (any(valid1)) {
+          vbool<K> valid2;
+          const int itime = next_unique(valid1, itime_k, valid2);
+          space0 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), space0);
+          space1 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), space1);
+        }
+        return rcp(lerp(space0, space1, ftime));
+      }
+    }
+
+  public:
+    Accel* object;                 //!< pointer to instanced acceleration structure
+    AffineSpace3ff* local2world;   //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition)
+    AffineSpace3fa world2local0;   //!< transformation from world space to local space for timestep 0
+  };
+
+  namespace isa
+  {
+    struct InstanceISA : public Instance
+    {
+      InstanceISA (Device* device)
+        : Instance(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+
+        PrimInfo pinfo(empty);
+        BBox3fa b = empty;
+        if (!buildBounds(0,&b)) return pinfo;
+        // const BBox3fa b = bounds(0);
+        // if (!isvalid(b)) return pinfo;
+
+        const PrimRef prim(b,geomID,unsigned(0));
+        pinfo.add_center2(prim);
+        prims[k++] = prim;
+        return pinfo;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+
+        PrimInfo pinfo(empty);
+        BBox3fa b = empty;
+        if (!buildBounds(0,&b)) return pinfo;
+        // if (!valid(0,range<size_t>(itime))) return pinfo;
+        // const PrimRef prim(linearBounds(0,itime).bounds(),geomID,unsigned(0));
+        const PrimRef prim(b,geomID,unsigned(0));
+        pinfo.add_center2(prim);
+        prims[k++] = prim;
+        return pinfo;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+
+        PrimInfoMB pinfo(empty);
+        if (!valid(0, timeSegmentRange(t0t1))) return pinfo;
+        const PrimRefMB prim(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0));
+        pinfo.add_primref(prim);
+        prims[k++] = prim;
+        return pinfo;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*);
+}
diff --git a/thirdparty/embree/kernels/common/scene_line_segments.h b/thirdparty/embree/kernels/common/scene_line_segments.h
new file mode 100644
index 0000000000..3c9fdb39db
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_line_segments.h
@@ -0,0 +1,345 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! represents an array of line segments */
+  struct LineSegments : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2;
+
+  public:
+
+    /*! line segments construction */
+    LineSegments (Device* device, Geometry::GType gtype);
+
+  public:
+    void setMask (unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify ();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void setTessellationRate(float N);
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+    template<int N>
+    void interpolate_impl(const RTCInterpolateArguments* const args)
+    {
+      unsigned int primID = args->primID;
+      float u = args->u;
+      RTCBufferType bufferType = args->bufferType;
+      unsigned int bufferSlot = args->bufferSlot;
+      float* P = args->P;
+      float* dPdu = args->dPdu;
+      float* ddPdudu = args->ddPdudu;
+      unsigned int valueCount = args->valueCount;
+      
+      /* calculate base pointer and stride */
+      assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
+             (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size()));
+      const char* src = nullptr;
+      size_t stride = 0;
+      if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
+        src    = vertexAttribs[bufferSlot].getPtr();
+        stride = vertexAttribs[bufferSlot].getStride();
+      } else {
+        src    = vertices[bufferSlot].getPtr();
+        stride = vertices[bufferSlot].getStride();
+      }
+      
+      for (unsigned int i=0; i<valueCount; i+=N)
+      {
+        const size_t ofs = i*sizeof(float);
+        const size_t segment = segments[primID];
+        const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount));
+        const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(segment+0)*stride+ofs]);
+        const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(segment+1)*stride+ofs]);
+        if (P      ) mem<vfloat<N>>::storeu(valid,P+i,lerp(p0,p1,u));
+        if (dPdu   ) mem<vfloat<N>>::storeu(valid,dPdu+i,p1-p0);
+        if (ddPdudu) mem<vfloat<N>>::storeu(valid,dPdu+i,vfloat<N>(zero));
+      }
+    }
+    
+  public:
+
+    /*! returns the number of vertices */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns the i'th segment */
+    __forceinline const unsigned int& segment(size_t i) const {
+      return segments[i];
+    }
+
+    /*! returns the segment to the left of the i'th segment */
+    __forceinline bool segmentLeftExists(size_t i) const {
+      assert (flags);
+      return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0;
+    }
+
+    /*! returns the segment to the right of the i'th segment */
+    __forceinline bool segmentRightExists(size_t i) const {
+      assert (flags);
+      return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0;
+    }
+
+     /*! returns i'th vertex of the first time step */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th normal of the first time step */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th radius of the first time step */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns i'th normal of itime'th timestep */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! calculates bounding box of i'th line segment */
+    __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const
+    {
+      const BBox3ff b = merge(BBox3ff(v0),BBox3ff(v1));
+      return enlarge((BBox3fa)b,maxRadiusScale*Vec3fa(max(v0.w,v1.w)));
+    }
+
+    /*! calculates bounding box of i'th line segment */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      const unsigned int index = segment(i);
+      const Vec3ff v0 = vertex(index+0);
+      const Vec3ff v1 = vertex(index+1);
+      return bounds(v0,v1);
+    }
+
+    /*! calculates bounding box of i'th line segment for the itime'th time step */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const unsigned int index = segment(i);
+      const Vec3ff v0 = vertex(index+0,itime);
+      const Vec3ff v1 = vertex(index+1,itime);
+      return bounds(v0,v1);
+    }
+
+    /*! calculates bounding box of i'th line segment */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+    {
+      const unsigned int index = segment(i);
+      const Vec3ff v0 = vertex(index+0);
+      const Vec3ff v1 = vertex(index+1);
+      const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w);
+      const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w);
+      return bounds(w0,w1);
+    }
+
+    /*! calculates bounding box of i'th line segment for the itime'th time step */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+    {
+      const unsigned int index = segment(i);
+      const Vec3ff v0 = vertex(index+0,itime);
+      const Vec3ff v1 = vertex(index+1,itime);
+      const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w);
+      const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w);
+      return bounds(w0,w1);
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const unsigned int index = segment(i);
+      if (index+1 >= numVertices()) return false;
+      
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false;
+        const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false;
+        if (min(v0.w,v1.w) < 0.0f) return false;
+      }
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+    {
+      if (!valid(i,0)) return false;
+      *bbox = bounds(i); 
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      if (!valid(i,itime+0) || !valid(i,itime+1)) return false;
+      bbox = bounds(i,itime);  // use bounds of first time step in builder
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+    {
+      if (!valid(i, timeSegmentRange(time_range))) return false;
+      bbox = linearBounds(i, time_range);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+  public:
+    BufferView<unsigned int> segments;      //!< array of line segment indices
+    BufferView<Vec3ff> vertices0;           //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
+    BufferView<char> flags;                 //!< start, end flag per segment
+    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
+    vector<BufferView<char>> vertexAttribs; //!< user buffers
+    int tessellationRate;                   //!< tessellation rate for bezier curve
+    float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii
+  };
+
+  namespace isa
+  {
+    struct LineSegmentsISA : public LineSegments
+    {
+      LineSegmentsISA (Device* device, Geometry::GType gtype)
+        : LineSegments(device,gtype) {}
+
+      Vec3fa computeDirection(unsigned int primID) const
+      {
+        const unsigned vtxID = segment(primID);
+        const Vec3fa v0 = vertex(vtxID+0);
+        const Vec3fa v1 = vertex(vtxID+1);
+        return v1-v0;
+      }
+
+      Vec3fa computeDirection(unsigned int primID, size_t time) const
+      {
+        const unsigned vtxID = segment(primID);
+        const Vec3fa v0 = vertex(vtxID+0,time);
+        const Vec3fa v1 = vertex(vtxID+1,time);
+        return v1-v0;
+      }
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j,&bounds)) continue;
+          const PrimRef prim(bounds,geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j,itime,bounds)) continue;
+          const PrimRef prim(bounds,geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          if (!valid(j, timeSegmentRange(t0t1))) continue;
+          const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j));
+          pinfo.add_primref(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      BBox3fa vbounds(size_t i) const {
+        return bounds(i);
+      }
+      
+      BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const {
+        return bounds(space,i);
+      }
+
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+        return linearBounds(primID,time_range);
+      }
+      
+      LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+        return linearBounds(space,primID,time_range);
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree/kernels/common/scene_points.h b/thirdparty/embree/kernels/common/scene_points.h
new file mode 100644
index 0000000000..017e098a51
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_points.h
@@ -0,0 +1,282 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "buffer.h"
+#include "default.h"
+#include "geometry.h"
+
+namespace embree
+{
+  /*! represents an array of points */
+  struct Points : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS;
+
+   public:
+    /*! line segments construction */
+    Points(Device* device, Geometry::GType gtype);
+
+   public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps(unsigned int numTimeSteps);
+    void setVertexAttributeCount(unsigned int N);
+    void setBuffer(RTCBufferType type,
+                   unsigned int slot,
+                   RTCFormat format,
+                   const Ref<Buffer>& buffer,
+                   size_t offset,
+                   size_t stride,
+                   unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+   public:
+    /*! returns the number of vertices */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th normal of the first time step */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th radius of the first time step */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns i'th normal of itime'th timestep */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! calculates bounding box of i'th line segment */
+    __forceinline BBox3fa bounds(const Vec3ff& v0) const {
+      return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w));
+    }
+
+    /*! calculates bounding box of i'th line segment */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      const Vec3ff v0 = vertex(i);
+      return bounds(v0);
+    }
+
+    /*! calculates bounding box of i'th line segment for the itime'th time step */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Vec3ff v0 = vertex(i, itime);
+      return bounds(v0);
+    }
+
+    /*! calculates bounding box of i'th line segment */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+    {
+      const Vec3ff v0 = vertex(i);
+      const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w);
+      return bounds(w0);
+    }
+
+    /*! calculates bounding box of i'th line segment for the itime'th time step */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+    {
+      const Vec3ff v0 = vertex(i, itime);
+      const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w);
+      return bounds(w0);
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const unsigned int index = (unsigned int)i;
+      if (index >= numVertices())
+        return false;
+
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) {
+        const Vec3ff v0 = vertex(index + 0, itime);
+        if (unlikely(!isvalid4(v0)))
+          return false;
+        if (v0.w < 0.0f)
+          return false;
+      }
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+    {
+      if (!valid(i, 0))
+        return false;
+      *bbox = bounds(i);
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      if (!valid(i, itime + 0) || !valid(i, itime + 1))
+        return false;
+      bbox = bounds(i, itime);  // use bounds of first time step in builder
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+    {
+      if (!valid(i, timeSegmentRange(time_range))) return false;
+      bbox = linearBounds(i, time_range);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+   public:
+    BufferView<Vec3ff> vertices0;            //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;             //!< fast access to first normal buffer
+    vector<BufferView<Vec3ff>> vertices;     //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;      //!< normal array for each timestep
+    vector<BufferView<char>> vertexAttribs;  //!< user buffers
+    float maxRadiusScale = 1.0;              //!< maximal min-width scaling of curve radii
+  };
+
+  namespace isa
+  {
+    struct PointsISA : public Points
+    {
+      PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {}
+
+      Vec3fa computeDirection(unsigned int primID) const
+      {
+        return Vec3fa(1, 0, 0);
+      }
+
+      Vec3fa computeDirection(unsigned int primID, size_t time) const
+      {
+        return Vec3fa(1, 0, 0);
+      }
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j, &bounds))
+            continue;
+          const PrimRef prim(bounds, geomID, unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j, itime, bounds))
+            continue;
+          const PrimRef prim(bounds, geomID, unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims,
+                                      const BBox1f& t0t1,
+                                      const range<size_t>& r,
+                                      size_t k,
+                                      unsigned int geomID) const
+      {
+        PrimInfoMB pinfo(empty);
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          if (!valid(j, timeSegmentRange(t0t1)))
+            continue;
+          const PrimRefMB prim(linearBounds(j, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(j));
+          pinfo.add_primref(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      BBox3fa vbounds(size_t i) const
+      {
+        return bounds(i);
+      }
+
+      BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const
+      {
+        return bounds(space, i);
+      }
+
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const
+      {
+        return linearBounds(primID, time_range);
+      }
+
+      LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const
+      {
+        return linearBounds(space, primID, time_range);
+      }
+    };
+  }  // namespace isa
+
+  DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType);
+}  // namespace embree
diff --git a/thirdparty/embree/kernels/common/scene_quad_mesh.h b/thirdparty/embree/kernels/common/scene_quad_mesh.h
new file mode 100644
index 0000000000..bd8eeaaeb7
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_quad_mesh.h
@@ -0,0 +1,337 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Quad Mesh */
+  struct QuadMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH;
+    
+    /*! triangle indices */
+    struct Quad
+    {
+      uint32_t v[4];
+
+      /*! outputs triangle indices */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) {
+        return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }";
+      }
+    };
+
+  public:
+
+    /*! quad mesh construction */
+    QuadMesh (Device* device); 
+  
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+    template<int N>
+      void interpolate_impl(const RTCInterpolateArguments* const args)
+    {
+      unsigned int primID = args->primID;
+      float u = args->u;
+      float v = args->v;
+      RTCBufferType bufferType = args->bufferType;
+      unsigned int bufferSlot = args->bufferSlot;
+      float* P = args->P;
+      float* dPdu = args->dPdu;
+      float* dPdv = args->dPdv;
+      float* ddPdudu = args->ddPdudu;
+      float* ddPdvdv = args->ddPdvdv;
+      float* ddPdudv = args->ddPdudv;
+      unsigned int valueCount = args->valueCount;
+      
+      /* calculate base pointer and stride */
+      assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
+             (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size()));
+      const char* src = nullptr; 
+      size_t stride = 0;
+      if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
+        src    = vertexAttribs[bufferSlot].getPtr();
+        stride = vertexAttribs[bufferSlot].getStride();
+      } else {
+        src    = vertices[bufferSlot].getPtr();
+        stride = vertices[bufferSlot].getStride();
+      }
+      
+      for (unsigned int i=0; i<valueCount; i+=N)
+      {
+        const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount));
+        const size_t ofs = i*sizeof(float);
+        const Quad& tri = quad(primID);
+        const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]);
+        const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]);
+        const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]);
+        const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[3]*stride+ofs]);      
+        const vbool<N> left = u+v <= 1.0f;
+        const vfloat<N> Q0 = select(left,p0,p2);
+        const vfloat<N> Q1 = select(left,p1,p3);
+        const vfloat<N> Q2 = select(left,p3,p1);
+        const vfloat<N> U  = select(left,u,vfloat<N>(1.0f)-u);
+        const vfloat<N> V  = select(left,v,vfloat<N>(1.0f)-v);
+        const vfloat<N> W  = 1.0f-U-V;
+        if (P) {
+          mem<vfloat<N>>::storeu(valid,P+i,madd(W,Q0,madd(U,Q1,V*Q2)));
+        }
+        if (dPdu) { 
+          assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,select(left,Q1-Q0,Q0-Q1));
+          assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,select(left,Q2-Q0,Q0-Q2));
+        }
+        if (ddPdudu) { 
+          assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero));
+          assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero));
+          assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero));
+        }
+      }
+    }
+        
+  public:
+
+    /*! returns number of vertices */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns i'th quad */
+    __forceinline const Quad& quad(size_t i) const {
+      return quads[i];
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! calculates the bounds of the i'th quad */
+    __forceinline BBox3fa bounds(size_t i) const 
+    {
+      const Quad& q = quad(i);
+      const Vec3fa v0 = vertex(q.v[0]);
+      const Vec3fa v1 = vertex(q.v[1]);
+      const Vec3fa v2 = vertex(q.v[2]);
+      const Vec3fa v3 = vertex(q.v[3]);
+      return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3));
+    }
+
+    /*! calculates the bounds of the i'th quad at the itime'th timestep */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Quad& q = quad(i);
+      const Vec3fa v0 = vertex(q.v[0],itime);
+      const Vec3fa v1 = vertex(q.v[1],itime);
+      const Vec3fa v2 = vertex(q.v[2],itime);
+      const Vec3fa v3 = vertex(q.v[3],itime);
+      return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3));
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const Quad& q = quad(i);
+      if (unlikely(q.v[0] >= numVertices())) return false;
+      if (unlikely(q.v[1] >= numVertices())) return false;
+      if (unlikely(q.v[2] >= numVertices())) return false;
+      if (unlikely(q.v[3] >= numVertices())) return false;
+
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        if (!isvalid(vertex(q.v[0],itime))) return false;
+        if (!isvalid(vertex(q.v[1],itime))) return false;
+        if (!isvalid(vertex(q.v[2],itime))) return false;
+        if (!isvalid(vertex(q.v[3],itime))) return false;
+      }
+
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th quad at the itimeGlobal'th time segment */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      const Quad& q = quad(i);
+      if (q.v[0] >= numVertices()) return false;
+      if (q.v[1] >= numVertices()) return false;
+      if (q.v[2] >= numVertices()) return false;
+      if (q.v[3] >= numVertices()) return false;
+
+      for (unsigned int t=0; t<numTimeSteps; t++)
+      {
+        const Vec3fa v0 = vertex(q.v[0],t);
+        const Vec3fa v1 = vertex(q.v[1],t);
+        const Vec3fa v2 = vertex(q.v[2],t);
+        const Vec3fa v3 = vertex(q.v[3],t);
+
+        if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3)))
+          return false;
+      }
+
+      if (bbox) 
+        *bbox = bounds(i);
+
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const Quad& q = quad(i);
+      if (unlikely(q.v[0] >= numVertices())) return false;
+      if (unlikely(q.v[1] >= numVertices())) return false;
+      if (unlikely(q.v[2] >= numVertices())) return false;
+      if (unlikely(q.v[3] >= numVertices())) return false;
+
+      assert(itime+1 < numTimeSteps);
+      const Vec3fa a0 = vertex(q.v[0],itime+0); if (unlikely(!isvalid(a0))) return false;
+      const Vec3fa a1 = vertex(q.v[1],itime+0); if (unlikely(!isvalid(a1))) return false;
+      const Vec3fa a2 = vertex(q.v[2],itime+0); if (unlikely(!isvalid(a2))) return false;
+      const Vec3fa a3 = vertex(q.v[3],itime+0); if (unlikely(!isvalid(a3))) return false;
+      const Vec3fa b0 = vertex(q.v[0],itime+1); if (unlikely(!isvalid(b0))) return false;
+      const Vec3fa b1 = vertex(q.v[1],itime+1); if (unlikely(!isvalid(b1))) return false;
+      const Vec3fa b2 = vertex(q.v[2],itime+1); if (unlikely(!isvalid(b2))) return false;
+      const Vec3fa b3 = vertex(q.v[3],itime+1); if (unlikely(!isvalid(b3))) return false;
+      
+      /* use bounds of first time step in builder */
+      bbox = BBox3fa(min(a0,a1,a2,a3),max(a0,a1,a2,a3));
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const
+    {
+      if (!valid(i, timeSegmentRange(dt))) return false;
+      bbox = linearBounds(i, dt);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+    /* gets version info of topology */
+    unsigned int getTopologyVersion() const {
+      return quads.modCounter;
+    }
+    
+    /* returns true if topology changed */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return quads.isModified(otherVersion); // || numPrimitivesChanged;
+    }
+
+    /* returns the projected area */
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      const Quad& q = quad(i);
+      const Vec3fa v0 = vertex(q.v[0]);
+      const Vec3fa v1 = vertex(q.v[1]);
+      const Vec3fa v2 = vertex(q.v[2]);
+      const Vec3fa v3 = vertex(q.v[3]);
+      return areaProjectedTriangle(v0,v1,v3) +
+	areaProjectedTriangle(v1,v2,v3);
+    }
+
+  public:
+    BufferView<Quad> quads;                 //!< array of quads
+    BufferView<Vec3fa> vertices0;           //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<char>> vertexAttribs; //!< vertex attribute buffers
+  };
+
+  namespace isa
+  {
+    struct QuadMeshISA : public QuadMesh
+    {
+      QuadMeshISA (Device* device)
+        : QuadMesh(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j,&bounds)) continue;
+          const PrimRef prim(bounds,geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j,itime,bounds)) continue;
+          const PrimRef prim(bounds,geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          if (!valid(j, timeSegmentRange(t0t1))) continue;
+          const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j));
+          pinfo.add_primref(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*);
+}
diff --git a/thirdparty/embree/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h
new file mode 100644
index 0000000000..1db170196d
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h
@@ -0,0 +1,326 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+#include "../subdiv/half_edge.h"
+#include "../subdiv/tessellation_cache.h"
+#include "../subdiv/catmullclark_coefficients.h"
+#include "../subdiv/patch.h"
+#include "../../common/algorithms/parallel_map.h"
+#include "../../common/algorithms/parallel_set.h"
+
+namespace embree
+{
+  class SubdivMesh : public Geometry
+  {
+    ALIGNED_CLASS_(16);
+  public:
+
+    typedef HalfEdge::Edge Edge;
+    
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH;
+
+    /*! structure used to sort half edges using radix sort by their key */
+    struct KeyHalfEdge 
+    {
+      KeyHalfEdge() {}
+      
+      KeyHalfEdge (uint64_t key, HalfEdge* edge) 
+      : key(key), edge(edge) {}
+      
+      __forceinline operator uint64_t() const { 
+	return key; 
+      }
+
+      friend __forceinline bool operator<(const KeyHalfEdge& e0, const KeyHalfEdge& e1) {
+        return e0.key < e1.key;
+      }
+      
+    public:
+      uint64_t key;
+      HalfEdge* edge;
+    };
+
+  public:
+
+    /*! subdiv mesh construction */
+    SubdivMesh(Device* device);
+
+  public:
+    void setMask (unsigned mask);
+    void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode);
+    void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setTopologyCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void setTessellationRate(float N);
+    bool verify();
+    void commit();
+    void addElementsToCount (GeometryCounts & counts) const;
+    void setDisplacementFunction (RTCDisplacementFunctionN func);
+    unsigned int getFirstHalfEdge(unsigned int faceID);
+    unsigned int getFace(unsigned int edgeID);
+    unsigned int getNextHalfEdge(unsigned int edgeID);
+    unsigned int getPreviousHalfEdge(unsigned int edgeID);
+    unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID);
+
+  public:
+
+    /*! return the number of faces */
+    size_t numFaces() const { 
+      return faceVertices.size(); 
+    }
+
+    /*! return the number of edges */
+    size_t numEdges() const { 
+      return topology[0].vertexIndices.size(); 
+    }
+
+    /*! return the number of vertices */
+    size_t numVertices() const { 
+      return vertices[0].size(); 
+    }
+
+    /*! calculates the bounds of the i'th subdivision patch at the j'th timestep */
+    __forceinline BBox3fa bounds(size_t i, size_t j = 0) const {
+      return topology[0].getHalfEdge(i)->bounds(vertices[j]);
+    }
+
+    /*! check if the i'th primitive is valid */
+    __forceinline bool valid(size_t i) const {
+      return topology[0].valid(i) && !invalidFace(i);
+    }
+
+    /*! check if the i'th primitive is valid for the j'th time range */
+    __forceinline bool valid(size_t i, size_t j) const {
+      return topology[0].valid(i) && !invalidFace(i,j);
+    }
+
+    /*! prints some statistics */
+    void printStatistics();
+
+    /*! initializes the half edge data structure */
+    void initializeHalfEdgeStructures ();
+ 
+  public:
+
+    /*! returns the vertex buffer for some time step */
+    __forceinline const BufferView<Vec3fa>& getVertexBuffer( const size_t t = 0 ) const {
+      return vertices[t];
+    }
+
+    /* returns tessellation level of edge */
+    __forceinline float getEdgeLevel(const size_t i) const
+    {
+      if (levels) return clamp(levels[i],1.0f,4096.0f); // FIXME: do we want to limit edge level?
+      else return clamp(tessellationRate,1.0f,4096.0f); // FIXME: do we want to limit edge level?
+    }
+
+  public:
+    RTCDisplacementFunctionN displFunc;    //!< displacement function
+
+    /*! all buffers in this section are provided by the application */
+  public:
+    
+    /*! the topology contains all data that may differ when
+     *  interpolating different user data buffers */
+    struct Topology
+    {
+    public:
+
+      /*! Default topology construction */
+      Topology () : halfEdges(nullptr,0) {}
+
+      /*! Topology initialization */
+      Topology (SubdivMesh* mesh);
+
+      /*! make the class movable */
+    public: 
+      Topology (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows
+        : mesh(std::move(other.mesh)), 
+          vertexIndices(std::move(other.vertexIndices)),
+          subdiv_mode(std::move(other.subdiv_mode)),
+          halfEdges(std::move(other.halfEdges)),
+          halfEdges0(std::move(other.halfEdges0)),
+          halfEdges1(std::move(other.halfEdges1)) {}
+      
+      Topology& operator= (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows
+      {
+        mesh = std::move(other.mesh); 
+        vertexIndices = std::move(other.vertexIndices);
+        subdiv_mode = std::move(other.subdiv_mode);
+        halfEdges = std::move(other.halfEdges);
+        halfEdges0 = std::move(other.halfEdges0);
+        halfEdges1 = std::move(other.halfEdges1);
+        return *this;
+      }
+
+    public:
+      /*! check if the i'th primitive is valid in this topology */
+      __forceinline bool valid(size_t i) const 
+      {
+        if (unlikely(subdiv_mode == RTC_SUBDIVISION_MODE_NO_BOUNDARY)) {
+          if (getHalfEdge(i)->faceHasBorder()) return false;
+        }
+        return true;
+      }
+      
+      /*! updates the interpolation mode for the topology */
+      void setSubdivisionMode (RTCSubdivisionMode mode);
+
+      /*! marks all buffers as modified */
+      void update ();
+
+      /*! verifies index array */
+      bool verify (size_t numVertices);
+
+      /*! initializes the half edge data structure */
+      void initializeHalfEdgeStructures ();
+
+    private:
+      
+      /*! recalculates the half edges */
+      void calculateHalfEdges();
+      
+      /*! updates half edges when recalculation is not necessary */
+      void updateHalfEdges();
+      
+      /*! user input data */
+    public:
+
+      SubdivMesh* mesh;
+
+      /*! indices of the vertices composing each face */
+      BufferView<unsigned int> vertexIndices;
+      
+      /*! subdiv interpolation mode */
+      RTCSubdivisionMode subdiv_mode;
+
+      /*! generated data */
+    public:
+
+      /*! returns the start half edge for face f */
+      __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const { 
+        return &halfEdges[mesh->faceStartEdge[f]]; 
+      }
+
+      /*! Half edge structure, generated by initHalfEdgeStructures */
+      mvector<HalfEdge> halfEdges;
+
+      /*! the following data is only required during construction of the
+       *  half edge structure and can be cleared for static scenes */
+    private:
+      
+      /*! two arrays used to sort the half edges */
+      std::vector<KeyHalfEdge> halfEdges0;
+      std::vector<KeyHalfEdge> halfEdges1;
+    };
+
+    /*! returns the start half edge for topology t and face f */
+    __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const { 
+      return topology[t].getHalfEdge(f);
+    }
+
+    /*! buffer containing the number of vertices for each face */
+    BufferView<unsigned int> faceVertices;
+
+    /*! array of topologies */
+    vector<Topology> topology;
+
+    /*! vertex buffer (one buffer for each time step) */
+    vector<BufferView<Vec3fa>> vertices;
+
+    /*! user data buffers */
+    vector<RawBufferView> vertexAttribs;
+
+    /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */
+    BufferView<Edge> edge_creases;
+    
+    /*! edge crease weights for each edge of the edge_creases buffer */
+    BufferView<float> edge_crease_weights;
+    
+    /*! vertex crease buffer containing all vertices that carry vertex crease weights */
+    BufferView<unsigned int> vertex_creases;
+    
+    /*! vertex crease weights for each vertex of the vertex_creases buffer */
+    BufferView<float> vertex_crease_weights;
+
+    /*! subdivision level for each half edge of the vertexIndices buffer */
+    BufferView<float> levels;
+    float tessellationRate;  // constant rate that is used when levels is not set
+
+    /*! buffer that marks specific faces as holes */
+    BufferView<unsigned> holes;
+
+    /*! all data in this section is generated by initializeHalfEdgeStructures function */
+  private:
+
+    /*! number of half edges used by faces */
+    size_t numHalfEdges; 
+
+    /*! fast lookup table to find the first half edge for some face */
+    mvector<uint32_t> faceStartEdge;
+
+    /*! fast lookup table to find the face for some half edge */
+    mvector<uint32_t> halfEdgeFace;
+
+    /*! set with all holes */
+    parallel_set<uint32_t> holeSet;
+
+    /*! fast lookup table to detect invalid faces */
+    mvector<char> invalid_face;
+
+    /*! test if face i is invalid in timestep j */
+    __forceinline       char& invalidFace(size_t i, size_t j = 0)       { return invalid_face[i*numTimeSteps+j]; }
+    __forceinline const char& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; }
+
+    /*! interpolation cache */
+  public:
+    static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15)/16; }
+    static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31)/32; }
+    static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) {
+      const size_t slots = numInterpolationSlots4(stride); 
+      assert(slot < slots); 
+      return slots*prim+slot;
+    }
+    std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_buffer_tags;
+    std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_attrib_buffer_tags;
+    std::vector<Patch3fa::Ref> patch_eval_trees;
+    
+    /*! the following data is only required during construction of the
+     *  half edge structure and can be cleared for static scenes */
+  private:
+
+    /*! map with all vertex creases */
+    parallel_map<uint32_t,float> vertexCreaseMap;
+    
+    /*! map with all edge creases */
+    parallel_map<uint64_t,float> edgeCreaseMap;
+
+  protected:
+    
+    /*! counts number of geometry commits */
+    size_t commitCounter;
+  };
+
+  namespace isa
+  {
+    struct SubdivMeshISA : public SubdivMesh
+    {
+      SubdivMeshISA (Device* device)
+        : SubdivMesh(device) {}
+
+      void interpolate(const RTCInterpolateArguments* const args);
+      void interpolateN(const RTCInterpolateNArguments* const args);
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*);
+};
diff --git a/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp
new file mode 100644
index 0000000000..3bbd7e51ae
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp
@@ -0,0 +1,194 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene_triangle_mesh.h"
+#include "scene.h"
+
+namespace embree
+{
+#if defined(EMBREE_LOWEST_ISA)
+
+  TriangleMesh::TriangleMesh (Device* device)
+    : Geometry(device,GTY_TRIANGLE_MESH,0,1)
+  {
+    vertices.resize(numTimeSteps);
+  }
+
+  void TriangleMesh::setMask (unsigned mask) 
+  {
+    this->mask = mask; 
+    Geometry::update();
+  }
+
+  void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps)
+  {
+    vertices.resize(numTimeSteps);
+    Geometry::setNumTimeSteps(numTimeSteps);
+  }
+
+  void TriangleMesh::setVertexAttributeCount (unsigned int N)
+  {
+    vertexAttribs.resize(N);
+    Geometry::update();
+  }
+  
+  void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num)
+  {
+    /* verify that all accesses are 4 bytes aligned */
+    if (((size_t(buffer->getPtr()) + offset) & 0x3) || (stride & 0x3)) 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned");
+
+    if (type == RTC_BUFFER_TYPE_VERTEX)
+    {
+      if (format != RTC_FORMAT_FLOAT3)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format");
+
+      /* if buffer is larger than 16GB the premultiplied index optimization does not work */
+      if (stride*num > 16ll*1024ll*1024ll*1024ll)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large");
+
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot");
+
+      vertices[slot].set(buffer, offset, stride, num, format);
+      vertices[slot].checkPadding16();
+      vertices0 = vertices[0];
+    }
+    else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+    {
+      if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format");
+
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot");
+      
+      vertexAttribs[slot].set(buffer, offset, stride, num, format);
+      vertexAttribs[slot].checkPadding16();
+    }
+    else if (type == RTC_BUFFER_TYPE_INDEX)
+    {
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      if (format != RTC_FORMAT_UINT3)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format");
+
+      triangles.set(buffer, offset, stride, num, format);
+      setNumPrimitives(num);
+    }
+    else 
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+  }
+
+  void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot)
+  {
+    if (type == RTC_BUFFER_TYPE_INDEX)
+    {
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return triangles.getPtr();
+    }
+    else if (type == RTC_BUFFER_TYPE_VERTEX)
+    {
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return vertices[slot].getPtr();
+    }
+    else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+    {
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return vertexAttribs[slot].getPtr();
+    }
+    else
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+      return nullptr;
+    }
+  }
+
+  void TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot)
+  {
+    if (type == RTC_BUFFER_TYPE_INDEX)
+    {
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      triangles.setModified();
+    }
+    else if (type == RTC_BUFFER_TYPE_VERTEX)
+    {
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      vertices[slot].setModified();
+    }
+    else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+    {
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      vertexAttribs[slot].setModified();
+    }
+    else
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+    }
+
+    Geometry::update();
+  }
+
+  void TriangleMesh::commit() 
+  {
+    /* verify that stride of all time steps are identical */
+    for (unsigned int t=0; t<numTimeSteps; t++)
+      if (vertices[t].getStride() != vertices[0].getStride())
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"stride of vertex buffers have to be identical for each time step");
+
+    Geometry::commit();
+  }
+
+  void TriangleMesh::addElementsToCount (GeometryCounts & counts) const 
+  {
+    if (numTimeSteps == 1) counts.numTriangles += numPrimitives;
+    else                   counts.numMBTriangles += numPrimitives;
+  }
+
+  bool TriangleMesh::verify() 
+  {
+    /*! verify size of vertex arrays */
+    if (vertices.size() == 0) return false;
+    for (const auto& buffer : vertices)
+      if (buffer.size() != numVertices())
+        return false;
+
+    /*! verify size of user vertex arrays */
+    for (const auto& buffer : vertexAttribs)
+      if (buffer.size() != numVertices())
+        return false;
+
+    /*! verify triangle indices */
+    for (size_t i=0; i<size(); i++) {     
+      if (triangles[i].v[0] >= numVertices()) return false; 
+      if (triangles[i].v[1] >= numVertices()) return false; 
+      if (triangles[i].v[2] >= numVertices()) return false; 
+    }
+
+    /*! verify vertices */
+    for (const auto& buffer : vertices)
+      for (size_t i=0; i<buffer.size(); i++)
+	if (!isvalid(buffer[i])) 
+	  return false;
+
+    return true;
+  }
+
+  void TriangleMesh::interpolate(const RTCInterpolateArguments* const args) {
+    interpolate_impl<4>(args);
+  }
+ 
+#endif
+
+  namespace isa
+  {
+    TriangleMesh* createTriangleMesh(Device* device) {
+      return new TriangleMeshISA(device);
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/common/scene_triangle_mesh.h b/thirdparty/embree/kernels/common/scene_triangle_mesh.h
new file mode 100644
index 0000000000..ad3f602fde
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_triangle_mesh.h
@@ -0,0 +1,318 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Triangle Mesh */
+  struct TriangleMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_TRIANGLE_MESH;
+
+    /*! triangle indices */
+    struct Triangle 
+    {
+      uint32_t v[3];
+
+      /*! outputs triangle indices */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Triangle& t) {
+        return cout << "Triangle { " << t.v[0] << ", " << t.v[1] << ", " << t.v[2] << " }";
+      }
+    };
+
+  public:
+
+    /*! triangle mesh construction */
+    TriangleMesh (Device* device); 
+
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+    template<int N>
+    void interpolate_impl(const RTCInterpolateArguments* const args)
+    {
+      unsigned int primID = args->primID;
+      float u = args->u;
+      float v = args->v;
+      RTCBufferType bufferType = args->bufferType;
+      unsigned int bufferSlot = args->bufferSlot;
+      float* P = args->P;
+      float* dPdu = args->dPdu;
+      float* dPdv = args->dPdv;
+      float* ddPdudu = args->ddPdudu;
+      float* ddPdvdv = args->ddPdvdv;
+      float* ddPdudv = args->ddPdudv;
+      unsigned int valueCount = args->valueCount;
+      
+      /* calculate base pointer and stride */
+      assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
+             (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size()));
+      const char* src = nullptr; 
+      size_t stride = 0;
+      if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
+        src    = vertexAttribs[bufferSlot].getPtr();
+        stride = vertexAttribs[bufferSlot].getStride();
+      } else {
+        src    = vertices[bufferSlot].getPtr();
+        stride = vertices[bufferSlot].getStride();
+      }
+      
+      for (unsigned int i=0; i<valueCount; i+=N)
+      {
+        size_t ofs = i*sizeof(float);
+        const float w = 1.0f-u-v;
+        const Triangle& tri = triangle(primID);
+        const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount));
+        const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]);
+        const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]);
+        const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]);
+        
+        if (P) {
+          mem<vfloat<N>>::storeu(valid,P+i,madd(w,p0,madd(u,p1,v*p2)));
+        }
+        if (dPdu) {
+          assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,p1-p0);
+          assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,p2-p0);
+        }
+        if (ddPdudu) {
+          assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero));
+          assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero));
+          assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero));
+        }
+      }
+    }
+    
+  public:
+    
+    /*! returns number of vertices */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns i'th triangle*/
+    __forceinline const Triangle& triangle(size_t i) const {
+      return triangles[i];
+    }
+
+    /*! returns i'th vertex of the first time step  */
+    __forceinline const Vec3fa vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! calculates the bounds of the i'th triangle */
+    __forceinline BBox3fa bounds(size_t i) const 
+    {
+      const Triangle& tri = triangle(i);
+      const Vec3fa v0 = vertex(tri.v[0]);
+      const Vec3fa v1 = vertex(tri.v[1]);
+      const Vec3fa v2 = vertex(tri.v[2]);
+      return BBox3fa(min(v0,v1,v2),max(v0,v1,v2));
+    }
+
+    /*! calculates the bounds of the i'th triangle at the itime'th timestep */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Triangle& tri = triangle(i);
+      const Vec3fa v0 = vertex(tri.v[0],itime);
+      const Vec3fa v1 = vertex(tri.v[1],itime);
+      const Vec3fa v2 = vertex(tri.v[2],itime);
+      return BBox3fa(min(v0,v1,v2),max(v0,v1,v2));
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const Triangle& tri = triangle(i);
+      if (unlikely(tri.v[0] >= numVertices())) return false;
+      if (unlikely(tri.v[1] >= numVertices())) return false;
+      if (unlikely(tri.v[2] >= numVertices())) return false;
+
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        if (!isvalid(vertex(tri.v[0],itime))) return false;
+        if (!isvalid(vertex(tri.v[1],itime))) return false;
+        if (!isvalid(vertex(tri.v[2],itime))) return false;
+      }
+
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      const Triangle& tri = triangle(i);
+      if (unlikely(tri.v[0] >= numVertices())) return false;
+      if (unlikely(tri.v[1] >= numVertices())) return false;
+      if (unlikely(tri.v[2] >= numVertices())) return false;
+
+      for (size_t t=0; t<numTimeSteps; t++)
+      {
+        const Vec3fa v0 = vertex(tri.v[0],t);
+        const Vec3fa v1 = vertex(tri.v[1],t);
+        const Vec3fa v2 = vertex(tri.v[2],t);
+        if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2)))
+          return false;
+      }
+
+      if (likely(bbox)) 
+        *bbox = bounds(i);
+
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const Triangle& tri = triangle(i);
+      if (unlikely(tri.v[0] >= numVertices())) return false;
+      if (unlikely(tri.v[1] >= numVertices())) return false;
+      if (unlikely(tri.v[2] >= numVertices())) return false;
+
+      assert(itime+1 < numTimeSteps);
+      const Vec3fa a0 = vertex(tri.v[0],itime+0); if (unlikely(!isvalid(a0))) return false;
+      const Vec3fa a1 = vertex(tri.v[1],itime+0); if (unlikely(!isvalid(a1))) return false;
+      const Vec3fa a2 = vertex(tri.v[2],itime+0); if (unlikely(!isvalid(a2))) return false;
+      const Vec3fa b0 = vertex(tri.v[0],itime+1); if (unlikely(!isvalid(b0))) return false;
+      const Vec3fa b1 = vertex(tri.v[1],itime+1); if (unlikely(!isvalid(b1))) return false;
+      const Vec3fa b2 = vertex(tri.v[2],itime+1); if (unlikely(!isvalid(b2))) return false;
+      
+      /* use bounds of first time step in builder */
+      bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2));
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const  {
+      if (!valid(i, timeSegmentRange(dt))) return false;
+      bbox = linearBounds(i, dt);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+    /* gets version info of topology */
+    unsigned int getTopologyVersion() const {
+      return triangles.modCounter;
+    }
+    
+    /* returns true if topology changed */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return triangles.isModified(otherVersion); // || numPrimitivesChanged;
+    }
+
+    /* returns the projected area */
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      const Triangle& tri = triangle(i);
+      const Vec3fa v0 = vertex(tri.v[0]);
+      const Vec3fa v1 = vertex(tri.v[1]);
+      const Vec3fa v2 = vertex(tri.v[2]);      
+      return areaProjectedTriangle(v0,v1,v2);
+    }
+
+  public:
+    BufferView<Triangle> triangles;      //!< array of triangles
+    BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+  };
+
+  namespace isa
+  {
+    struct TriangleMeshISA : public TriangleMesh
+    {
+      TriangleMeshISA (Device* device)
+        : TriangleMesh(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j,&bounds)) continue;
+          const PrimRef prim(bounds,geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j,itime,bounds)) continue;
+          const PrimRef prim(bounds,geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          if (!valid(j, timeSegmentRange(t0t1))) continue;
+          const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j));
+          pinfo.add_primref(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*);
+}
diff --git a/thirdparty/embree/kernels/common/scene_user_geometry.h b/thirdparty/embree/kernels/common/scene_user_geometry.h
new file mode 100644
index 0000000000..2867b18b79
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_user_geometry.h
@@ -0,0 +1,77 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accelset.h"
+
+namespace embree
+{
+  /*! User geometry with user defined intersection functions */
+  struct UserGeometry : public AccelSet
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY;
+
+  public:
+    UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1);
+    virtual void setMask (unsigned mask);
+    virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr);
+    virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect);
+    virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded);
+    virtual void build() {}
+    virtual void addElementsToCount (GeometryCounts & counts) const;
+  };
+
+  namespace isa
+  {
+    struct UserGeometryISA : public UserGeometry
+    {
+      UserGeometryISA (Device* device)
+        : UserGeometry(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j,&bounds)) continue;
+          const PrimRef prim(bounds,geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j,itime,bounds)) continue;
+          const PrimRef prim(bounds,geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          if (!valid(j, timeSegmentRange(t0t1))) continue;
+          const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j));
+          pinfo.add_primref(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+    };
+  }
+  
+  DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*);
+}
diff --git a/thirdparty/embree/kernels/common/stack_item.h b/thirdparty/embree/kernels/common/stack_item.h
new file mode 100644
index 0000000000..c31c64e862
--- /dev/null
+++ b/thirdparty/embree/kernels/common/stack_item.h
@@ -0,0 +1,125 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! An item on the stack holds the node ID and distance of that node. */
+  template<typename T>
+  struct __aligned(16) StackItemT
+  {
+    /*! assert that the xchg function works */
+    static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed");
+
+    __forceinline StackItemT() {}
+
+    __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {}
+
+    /*! use SSE instructions to swap stack items */
+    __forceinline static void xchg(StackItemT& a, StackItemT& b) 
+    { 
+      const vfloat4 sse_a = vfloat4::load((float*)&a); 
+      const vfloat4 sse_b = vfloat4::load((float*)&b);
+      vfloat4::store(&a,sse_b);
+      vfloat4::store(&b,sse_a);
+    }
+
+    /*! Sort 2 stack items. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2) {
+      if (s2.dist < s1.dist) xchg(s2,s1);
+    }
+    
+    /*! Sort 3 stack items. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3)
+    {
+      if (s2.dist < s1.dist) xchg(s2,s1);
+      if (s3.dist < s2.dist) xchg(s3,s2);
+      if (s2.dist < s1.dist) xchg(s2,s1);
+    }
+    
+    /*! Sort 4 stack items. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4)
+    {
+      if (s2.dist < s1.dist) xchg(s2,s1);
+      if (s4.dist < s3.dist) xchg(s4,s3);
+      if (s3.dist < s1.dist) xchg(s3,s1);
+      if (s4.dist < s2.dist) xchg(s4,s2);
+      if (s3.dist < s2.dist) xchg(s3,s2);
+    }
+
+    /*! use SSE instructions to swap stack items */
+    __forceinline static void cmp_xchg(vint4& a, vint4& b) 
+    { 
+#if defined(__AVX512VL__)
+      const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a));
+#else
+      const vboolf4 mask0(b < a);
+      const vboolf4 mask(shuffle<2,2,2,2>(mask0));
+#endif
+      const vint4 c = select(mask,b,a);
+      const vint4 d = select(mask,a,b);
+      a = c;
+      b = d;
+    }
+    
+    /*! Sort 3 stack items. */
+    __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3)
+    {
+      cmp_xchg(s2,s1);
+      cmp_xchg(s3,s2);
+      cmp_xchg(s2,s1);
+    }
+    
+    /*! Sort 4 stack items. */
+    __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4)
+    {
+      cmp_xchg(s2,s1);
+      cmp_xchg(s4,s3);
+      cmp_xchg(s3,s1);
+      cmp_xchg(s4,s2);
+      cmp_xchg(s3,s2);
+    }
+
+
+    /*! Sort N stack items. */
+    __forceinline friend void sort(StackItemT* begin, StackItemT* end)
+    {
+      for (StackItemT* i = begin+1; i != end; ++i)
+      {
+        const vfloat4 item = vfloat4::load((float*)i);
+        const unsigned dist = i->dist;
+        StackItemT* j = i;
+
+        while ((j != begin) && ((j-1)->dist < dist))
+        {
+          vfloat4::store(j, vfloat4::load((float*)(j-1)));
+          --j;
+        }
+
+        vfloat4::store(j, item);
+      }
+    }
+    
+  public:
+    T ptr;
+    unsigned dist;
+  };
+
+  /*! An item on the stack holds the node ID and active ray mask. */
+  template<typename T>
+  struct __aligned(8) StackItemMaskT
+  {
+    T ptr;
+    size_t mask;
+  };
+
+  struct __aligned(8) StackItemMaskCoherent
+  {
+    size_t mask;
+    size_t parent;
+    size_t child;
+  };
+}
diff --git a/thirdparty/embree/kernels/common/stat.cpp b/thirdparty/embree/kernels/common/stat.cpp
new file mode 100644
index 0000000000..ebb77cd534
--- /dev/null
+++ b/thirdparty/embree/kernels/common/stat.cpp
@@ -0,0 +1,128 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stat.h"
+
+namespace embree
+{
+  Stat Stat::instance; 
+  
+  Stat::Stat () {
+  }
+
+  Stat::~Stat () 
+  {
+#ifdef EMBREE_STAT_COUNTERS
+    Stat::print(std::cout);
+#endif
+  }
+
+  void Stat::print(std::ostream& cout)
+  {
+    Counters& cntrs = instance.cntrs;
+    Counters::Data& data = instance.cntrs.code;
+    //Counters::Data& data = instance.cntrs.active;
+
+    /* print absolute numbers */
+    cout << "--------- ABSOLUTE ---------" << std::endl;
+    cout << "  #normal_travs   = " << float(data.normal.travs            )*1E-6 << "M" << std::endl;
+    cout << "    #nodes        = " << float(data.normal.trav_nodes       )*1E-6 << "M" << std::endl;
+    cout << "    #nodes_xfm    = " << float(data.normal.trav_xfm_nodes   )*1E-6 << "M" << std::endl;
+    cout << "    #leaves       = " << float(data.normal.trav_leaves      )*1E-6 << "M" << std::endl;
+    cout << "    #prims        = " << float(data.normal.trav_prims       )*1E-6 << "M" << std::endl;
+    cout << "    #prim_hits    = " << float(data.normal.trav_prim_hits   )*1E-6 << "M" << std::endl;
+
+    cout << "    #stack nodes  = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl;
+    cout << "    #stack pop    = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl;
+
+    size_t normal_box_hits = 0;
+    size_t weighted_box_hits = 0;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) { 
+      normal_box_hits += data.normal.trav_hit_boxes[i];
+      weighted_box_hits += data.normal.trav_hit_boxes[i]*i;
+    }
+    cout << "    #hit_boxes    = " << normal_box_hits << " (total) distribution: ";
+    float average = 0.0f;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) 
+    {
+      float value = 100.0f * data.normal.trav_hit_boxes[i] / normal_box_hits;
+      cout << "[" << i << "] " << value << " ";
+      average += (float)i*data.normal.trav_hit_boxes[i] / normal_box_hits;
+    }
+    cout << "    average = " << average << std::endl;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.normal.trav_hit_boxes[i]*i / weighted_box_hits << " ";
+    cout << std::endl;
+
+    if (data.shadow.travs) {
+      cout << "  #shadow_travs = " << float(data.shadow.travs         )*1E-6 << "M" << std::endl;
+      cout << "    #nodes      = " << float(data.shadow.trav_nodes    )*1E-6 << "M" << std::endl;
+      cout << "    #nodes_xfm  = " << float(data.shadow.trav_xfm_nodes)*1E-6 << "M" << std::endl;
+      cout << "    #leaves     = " << float(data.shadow.trav_leaves   )*1E-6 << "M" << std::endl;
+      cout << "    #prims      = " << float(data.shadow.trav_prims    )*1E-6 << "M" << std::endl;
+      cout << "    #prim_hits  = " << float(data.shadow.trav_prim_hits)*1E-6 << "M" << std::endl;
+
+      cout << "    #stack nodes = " << float(data.shadow.trav_stack_nodes )*1E-6 << "M" << std::endl;
+      cout << "    #stack pop   = " << float(data.shadow.trav_stack_pop )*1E-6 << "M" << std::endl;
+
+      size_t shadow_box_hits = 0;
+      size_t weighted_shadow_box_hits = 0;
+
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) {        
+        shadow_box_hits += data.shadow.trav_hit_boxes[i];
+        weighted_shadow_box_hits += data.shadow.trav_hit_boxes[i]*i;
+      }
+      cout << "    #hit_boxes    = ";
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i] / shadow_box_hits << " ";
+      cout << std::endl;
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i]*i / weighted_shadow_box_hits << " ";
+      cout << std::endl;
+    }
+    cout << std::endl;
+
+    /* print per traversal numbers */
+    cout << "--------- PER TRAVERSAL ---------" << std::endl;
+    float active_normal_travs       = float(cntrs.active.normal.travs      )/float(cntrs.all.normal.travs      );
+    float active_normal_trav_nodes  = float(cntrs.active.normal.trav_nodes )/float(cntrs.all.normal.trav_nodes );
+    float active_normal_trav_xfm_nodes  = float(cntrs.active.normal.trav_xfm_nodes )/float(cntrs.all.normal.trav_xfm_nodes );
+    float active_normal_trav_leaves = float(cntrs.active.normal.trav_leaves)/float(cntrs.all.normal.trav_leaves);
+    float active_normal_trav_prims   = float(cntrs.active.normal.trav_prims  )/float(cntrs.all.normal.trav_prims  );
+    float active_normal_trav_prim_hits = float(cntrs.active.normal.trav_prim_hits  )/float(cntrs.all.normal.trav_prim_hits  );
+    float active_normal_trav_stack_pop = float(cntrs.active.normal.trav_stack_pop  )/float(cntrs.all.normal.trav_stack_pop  );
+
+    cout << "  #normal_travs   = " << float(cntrs.code.normal.travs      )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_travs       << "% active" << std::endl;
+    cout << "    #nodes        = " << float(cntrs.code.normal.trav_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_nodes  << "% active" << std::endl;
+    cout << "    #node_xfm     = " << float(cntrs.code.normal.trav_xfm_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_xfm_nodes  << "% active" << std::endl;
+    cout << "    #leaves       = " << float(cntrs.code.normal.trav_leaves)/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_leaves << "% active" << std::endl;
+    cout << "    #prims        = " << float(cntrs.code.normal.trav_prims  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prims   << "% active" << std::endl;
+    cout << "    #prim_hits    = " << float(cntrs.code.normal.trav_prim_hits  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prim_hits   << "% active" << std::endl;
+    cout << "    #stack_pop    = " << float(cntrs.code.normal.trav_stack_pop  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_stack_pop   << "% active" << std::endl;
+
+    if (cntrs.all.shadow.travs) {
+      float active_shadow_travs       = float(cntrs.active.shadow.travs      )/float(cntrs.all.shadow.travs      );
+      float active_shadow_trav_nodes  = float(cntrs.active.shadow.trav_nodes )/float(cntrs.all.shadow.trav_nodes );
+      float active_shadow_trav_xfm_nodes  = float(cntrs.active.shadow.trav_xfm_nodes )/float(cntrs.all.shadow.trav_xfm_nodes );
+      float active_shadow_trav_leaves = float(cntrs.active.shadow.trav_leaves)/float(cntrs.all.shadow.trav_leaves);
+      float active_shadow_trav_prims   = float(cntrs.active.shadow.trav_prims  )/float(cntrs.all.shadow.trav_prims  );
+      float active_shadow_trav_prim_hits = float(cntrs.active.shadow.trav_prim_hits  )/float(cntrs.all.shadow.trav_prim_hits  );
+
+      cout << "  #shadow_travs = " << float(cntrs.code.shadow.travs      )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_travs       << "% active" << std::endl;
+      cout << "    #nodes      = " << float(cntrs.code.shadow.trav_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_nodes  << "% active" << std::endl;
+      cout << "    #nodes_xfm  = " << float(cntrs.code.shadow.trav_xfm_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_xfm_nodes  << "% active" << std::endl;
+      cout << "    #leaves     = " << float(cntrs.code.shadow.trav_leaves)/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_leaves << "% active" << std::endl;
+      cout << "    #prims      = " << float(cntrs.code.shadow.trav_prims  )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prims   << "% active" << std::endl;
+      cout << "    #prim_hits  = " << float(cntrs.code.shadow.trav_prim_hits  )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prim_hits   << "% active" << std::endl;
+
+    }
+    cout << std::endl;
+
+     /* print user counters for performance tuning */
+    cout << "--------- USER ---------" << std::endl;
+    for (size_t i=0; i<10; i++)
+      cout << "#user" << i << " = " << float(cntrs.user[i])/float(cntrs.all.normal.travs+cntrs.all.shadow.travs) << " per traversal" << std::endl;
+
+    cout << "#user5/user3 " << 100.0f*float(cntrs.user[5])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << "#user6/user3 " << 100.0f*float(cntrs.user[6])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << "#user7/user3 " << 100.0f*float(cntrs.user[7])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << std::endl;
+  }
+}
diff --git a/thirdparty/embree/kernels/common/stat.h b/thirdparty/embree/kernels/common/stat.h
new file mode 100644
index 0000000000..02fc07e67f
--- /dev/null
+++ b/thirdparty/embree/kernels/common/stat.h
@@ -0,0 +1,116 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+/* Macros to gather statistics */
+#ifdef EMBREE_STAT_COUNTERS
+#  define STAT(x) x
+#  define STAT3(s,x,y,z) \
+  STAT(Stat::get().code  .s+=x);               \
+  STAT(Stat::get().active.s+=y);               \
+  STAT(Stat::get().all   .s+=z);
+#  define STAT_USER(i,x) Stat::get().user[i]+=x;
+#else
+#  define STAT(x)
+#  define STAT3(s,x,y,z)
+#  define STAT_USER(i,x) 
+#endif
+
+namespace embree
+{
+  /*! Gathers ray tracing statistics. We count 1) how often a code
+   *  location is reached, 2) how many SIMD lanes are active, 3) how
+   *  many SIMD lanes reach the code location */
+  class Stat
+  { 
+  public:
+
+    static const size_t SIZE_HISTOGRAM = 64+1;
+
+    /*! constructs stat counter class */
+    Stat ();
+
+    /*! destructs stat counter class */
+    ~Stat ();
+
+    class Counters 
+    {
+    public:
+      Counters () { 
+        clear(); 
+      }
+      
+      void clear() 
+      { 
+        all.clear();
+        active.clear();
+        code.clear();
+        for (auto& u : user) u.store(0);
+      }
+
+    public:
+
+	/* per packet and per ray stastics */
+	struct Data
+        {
+          void clear () {
+            normal.clear();
+            shadow.clear();
+            point_query.clear();
+          }
+
+	  /* normal and shadow ray statistics */
+	  struct 
+          {
+            void clear() 
+            {
+              travs.store(0);
+              trav_nodes.store(0);
+              trav_leaves.store(0);
+              trav_prims.store(0);
+              trav_prim_hits.store(0);
+              for (auto& v : trav_hit_boxes) v.store(0);
+              trav_stack_pop.store(0);
+              trav_stack_nodes.store(0); 
+              trav_xfm_nodes.store(0); 
+            }
+
+          public:
+	    std::atomic<size_t> travs;
+	    std::atomic<size_t> trav_nodes;
+	    std::atomic<size_t> trav_leaves;
+	    std::atomic<size_t> trav_prims;
+	    std::atomic<size_t> trav_prim_hits;
+	    std::atomic<size_t> trav_hit_boxes[SIZE_HISTOGRAM+1];
+	    std::atomic<size_t> trav_stack_pop;
+	    std::atomic<size_t> trav_stack_nodes; 
+            std::atomic<size_t> trav_xfm_nodes; 
+            
+	  } normal, shadow, point_query;
+	} all, active, code; 
+
+        std::atomic<size_t> user[10];
+    };
+
+  public:
+
+    static __forceinline Counters& get() {
+      return instance.cntrs;
+    }
+    
+    static void clear() {
+      instance.cntrs.clear();
+    }
+    
+    static void print(embree_ostream cout);
+
+  private: 
+    Counters cntrs;
+
+  private:
+    static Stat instance;
+  };
+}
diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp
new file mode 100644
index 0000000000..01c862da0c
--- /dev/null
+++ b/thirdparty/embree/kernels/common/state.cpp
@@ -0,0 +1,519 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "state.h"
+#include "../../common/lexers/streamfilters.h"
+
+namespace embree
+{
+  MutexSys g_printMutex;
+
+  State::ErrorHandler State::g_errorHandler;
+
+  State::ErrorHandler::ErrorHandler()
+    : thread_error(createTls()) {}
+
+  State::ErrorHandler::~ErrorHandler()
+  {
+    Lock<MutexSys> lock(errors_mutex);
+    for (size_t i=0; i<thread_errors.size(); i++)
+      delete thread_errors[i];
+    destroyTls(thread_error);
+    thread_errors.clear();
+  }
+
+  RTCError* State::ErrorHandler::error() 
+  {
+    RTCError* stored_error = (RTCError*) getTls(thread_error);
+    if (stored_error) return stored_error;
+
+    Lock<MutexSys> lock(errors_mutex);
+    stored_error = new RTCError(RTC_ERROR_NONE);
+    thread_errors.push_back(stored_error);
+    setTls(thread_error,stored_error);
+    return stored_error;
+  }
+
+  State::State () 
+    : enabled_cpu_features(getCPUFeatures()),
+      enabled_builder_cpu_features(enabled_cpu_features),
+      frequency_level(FREQUENCY_SIMD256)
+  {
+    tri_accel = "default";
+    tri_builder = "default";
+    tri_traverser = "default";
+    
+    tri_accel_mb = "default";
+    tri_builder_mb = "default";
+    tri_traverser_mb = "default";
+
+    quad_accel = "default";
+    quad_builder = "default";
+    quad_traverser = "default";
+
+    quad_accel_mb = "default";
+    quad_builder_mb = "default";
+    quad_traverser_mb = "default";
+
+    line_accel = "default";
+    line_builder = "default";
+    line_traverser = "default";
+
+    line_accel_mb = "default";
+    line_builder_mb = "default";
+    line_traverser_mb = "default";
+    
+    hair_accel = "default";
+    hair_builder = "default";
+    hair_traverser = "default";
+
+    hair_accel_mb = "default";
+    hair_builder_mb = "default";
+    hair_traverser_mb = "default";
+
+    object_accel = "default";
+    object_builder = "default";
+    object_accel_min_leaf_size = 1;
+    object_accel_max_leaf_size = 1;
+
+    object_accel_mb = "default";
+    object_builder_mb = "default";
+    object_accel_mb_min_leaf_size = 1;
+    object_accel_mb_max_leaf_size = 1;
+
+    max_spatial_split_replications = 1.2f;
+    useSpatialPreSplits = false;
+
+    tessellation_cache_size = 128*1024*1024;
+
+    subdiv_accel = "default";
+    subdiv_accel_mb = "default";
+
+    grid_accel = "default";
+    grid_builder = "default";
+    grid_accel_mb = "default";
+    grid_builder_mb = "default";
+
+    instancing_open_min = 0;
+    instancing_block_size = 0;
+    instancing_open_factor = 8.0f; 
+    instancing_open_max_depth = 32;
+    instancing_open_max = 50000000;
+
+    float_exceptions = false;
+    quality_flags = -1;
+    scene_flags = -1;
+    verbose = 0;
+    benchmark = 0;
+
+    numThreads = 0;
+    numUserThreads = 0;
+
+#if TASKING_INTERNAL
+    set_affinity = true;
+#else
+    set_affinity = false;
+#endif
+
+    start_threads = false;
+    enable_selockmemoryprivilege = false;
+#if defined(__LINUX__)
+    hugepages = true;
+#else
+    hugepages = false;
+#endif
+    hugepages_success = true;
+
+    alloc_main_block_size = 0;
+    alloc_num_main_slots = 0;
+    alloc_thread_block_size = 0;
+    alloc_single_thread_alloc = -1;
+
+    error_function = nullptr;
+    error_function_userptr = nullptr;
+
+    memory_monitor_function = nullptr;
+    memory_monitor_userptr = nullptr;
+  }
+
+  State::~State() {
+  }
+
+  bool State::hasISA(const int isa) {
+    return (enabled_cpu_features & isa) == isa;
+  }
+
+  bool State::checkISASupport() {
+    return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features;
+  }
+  
+  void State::verify()
+  {
+    /* verify that calculations stay in range */
+    assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX);
+
+    /* here we verify that CPP files compiled for a specific ISA only
+     * call that same or lower ISA version of non-inlined class member
+     * functions */
+#if defined(DEBUG)
+#if defined(EMBREE_TARGET_SSE2)
+    assert(sse2::getISA() <= SSE2);
+#endif
+#if defined(EMBREE_TARGET_SSE42)
+    assert(sse42::getISA() <= SSE42);
+#endif
+#if defined(EMBREE_TARGET_AVX)
+    assert(avx::getISA() <= AVX);
+#endif
+#if defined(EMBREE_TARGET_AVX2)
+    assert(avx2::getISA() <= AVX2);
+#endif
+#if defined (EMBREE_TARGET_AVX512)
+    assert(avx512::getISA() <= AVX512);
+#endif
+#endif
+  }
+
+  const char* symbols[3] = { "=", ",", "|" };
+
+  bool State::parseFile(const FileName& fileName)
+  {
+    FILE* f = fopen(fileName.c_str(),"r");
+    if (!f) return false;
+    Ref<Stream<int> > file = new FileStream(f,fileName);
+    
+    std::vector<std::string> syms;
+    for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) 
+      syms.push_back(symbols[i]);
+    
+    Ref<TokenStream> cin = new TokenStream(new LineCommentFilter(file,"#"),
+                                           TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.",
+                                           TokenStream::separators,syms);
+    parse(cin);
+    return true;
+  }
+
+  void State::parseString(const char* cfg)
+  {
+    if (cfg == nullptr) return;
+
+    std::vector<std::string> syms;
+    for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) 
+      syms.push_back(symbols[i]);
+    
+    Ref<TokenStream> cin = new TokenStream(new StrStream(cfg),
+                                           TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.",
+                                           TokenStream::separators,syms);
+    parse(cin);
+  }
+  
+  int string_to_cpufeatures(const std::string& isa)
+  {
+    if      (isa == "sse" ) return SSE;
+    else if (isa == "sse2") return SSE2;
+    else if (isa == "sse3") return SSE3;
+    else if (isa == "ssse3") return SSSE3;
+    else if (isa == "sse41") return SSE41;
+    else if (isa == "sse4.1") return SSE41;
+    else if (isa == "sse42") return SSE42;
+    else if (isa == "sse4.2") return SSE42;
+    else if (isa == "avx") return AVX;
+    else if (isa == "avxi") return AVXI;
+    else if (isa == "avx2") return AVX2;
+    else if (isa == "avx512") return AVX512;
+    else return SSE2;
+  }
+
+  void State::parse(Ref<TokenStream> cin)
+  {
+    /* parse until end of stream */
+    while (cin->peek() != Token::Eof())
+    {
+      const Token tok = cin->get();
+
+      if (tok == Token::Id("threads") && cin->trySymbol("=")) 
+        numThreads = cin->get().Int();
+
+      else if (tok == Token::Id("user_threads")&& cin->trySymbol("=")) 
+        numUserThreads = cin->get().Int();
+
+      else if (tok == Token::Id("set_affinity")&& cin->trySymbol("=")) 
+        set_affinity = cin->get().Int();
+
+      else if (tok == Token::Id("affinity")&& cin->trySymbol("=")) 
+        set_affinity = cin->get().Int();
+      
+      else if (tok == Token::Id("start_threads")&& cin->trySymbol("=")) 
+        start_threads = cin->get().Int();
+      
+      else if (tok == Token::Id("isa") && cin->trySymbol("=")) {
+        std::string isa_str = toLowerCase(cin->get().Identifier());
+        enabled_cpu_features = string_to_cpufeatures(isa_str);
+        enabled_builder_cpu_features = enabled_cpu_features;
+      }
+
+      else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) {
+        std::string isa_str = toLowerCase(cin->get().Identifier());
+        enabled_cpu_features &= string_to_cpufeatures(isa_str);
+        enabled_builder_cpu_features &= enabled_cpu_features;
+      }
+
+      else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) {
+        std::string isa_str = toLowerCase(cin->get().Identifier());
+        enabled_builder_cpu_features &= string_to_cpufeatures(isa_str);
+      }
+
+      else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) {
+        std::string freq = cin->get().Identifier();
+        if      (freq == "simd128") frequency_level = FREQUENCY_SIMD128;
+        else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256;
+        else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512;
+      }
+
+      else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) {
+        enable_selockmemoryprivilege = cin->get().Int();
+      }
+      else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) {
+        hugepages = cin->get().Int();
+      }
+
+      else if (tok == Token::Id("float_exceptions") && cin->trySymbol("=")) 
+        float_exceptions = cin->get().Int();
+
+      else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("="))
+        tri_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("="))
+        tri_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("="))
+        tri_traverser = cin->get().Identifier();
+     
+      else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("="))
+        tri_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("="))
+        tri_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("="))
+        tri_traverser_mb = cin->get().Identifier();
+
+      else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("="))
+        quad_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("="))
+        quad_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("="))
+        quad_traverser = cin->get().Identifier();
+
+      else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("="))
+        quad_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_builder_mb")) && cin->trySymbol("="))
+        quad_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("="))
+        quad_traverser_mb = cin->get().Identifier();
+
+      else if ((tok == Token::Id("line_accel")) && cin->trySymbol("="))
+        line_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("line_builder")) && cin->trySymbol("="))
+        line_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("="))
+        line_traverser = cin->get().Identifier();
+
+      else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("="))
+        line_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("="))
+        line_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("="))
+        line_traverser_mb = cin->get().Identifier();
+      
+      else if (tok == Token::Id("hair_accel") && cin->trySymbol("="))
+        hair_accel = cin->get().Identifier();
+      else if (tok == Token::Id("hair_builder") && cin->trySymbol("="))
+        hair_builder = cin->get().Identifier();
+      else if (tok == Token::Id("hair_traverser") && cin->trySymbol("="))
+        hair_traverser = cin->get().Identifier();
+
+      else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("="))
+        hair_accel_mb = cin->get().Identifier();
+      else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("="))
+        hair_builder_mb = cin->get().Identifier();
+      else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("="))
+        hair_traverser_mb = cin->get().Identifier();
+
+      else if (tok == Token::Id("object_accel") && cin->trySymbol("="))
+        object_accel = cin->get().Identifier();
+      else if (tok == Token::Id("object_builder") && cin->trySymbol("="))
+        object_builder = cin->get().Identifier();
+      else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("="))
+        object_accel_min_leaf_size = cin->get().Int();
+      else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("="))
+        object_accel_max_leaf_size = cin->get().Int();
+
+      else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("="))
+        object_accel_mb = cin->get().Identifier();
+      else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("="))
+        object_builder_mb = cin->get().Identifier();
+      else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("="))
+        object_accel_mb_min_leaf_size = cin->get().Int();
+      else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("="))
+        object_accel_mb_max_leaf_size = cin->get().Int();
+
+      else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("="))
+        instancing_open_min = cin->get().Int();
+      else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) {
+        instancing_block_size = cin->get().Int();
+        instancing_open_factor = 0.0f;
+      }
+      else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("="))
+        instancing_open_max_depth = cin->get().Int();
+      else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) {
+        instancing_block_size = 0;
+        instancing_open_factor = cin->get().Float();
+      }
+      else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("="))
+        instancing_open_max = cin->get().Int();
+
+      else if (tok == Token::Id("subdiv_accel") && cin->trySymbol("="))
+        subdiv_accel = cin->get().Identifier();
+      else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("="))
+        subdiv_accel_mb = cin->get().Identifier();
+
+      else if (tok == Token::Id("grid_accel") && cin->trySymbol("="))
+        grid_accel = cin->get().Identifier();
+      else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("="))
+        grid_accel_mb = cin->get().Identifier();
+      
+      else if (tok == Token::Id("verbose") && cin->trySymbol("="))
+        verbose = cin->get().Int();
+      else if (tok == Token::Id("benchmark") && cin->trySymbol("="))
+        benchmark = cin->get().Int();
+      
+      else if (tok == Token::Id("quality")) {
+        if (cin->trySymbol("=")) {
+          Token flag = cin->get();
+          if      (flag == Token::Id("low"))    quality_flags = RTC_BUILD_QUALITY_LOW;
+          else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM;
+          else if (flag == Token::Id("high"))   quality_flags = RTC_BUILD_QUALITY_HIGH;
+        }
+      }
+
+      else if (tok == Token::Id("scene_flags")) {
+        scene_flags = 0;
+        if (cin->trySymbol("=")) {
+          do {
+            Token flag = cin->get();
+            if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC;
+            else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT;
+            else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST;
+          } while (cin->trySymbol("|"));
+        }
+      }
+      
+      else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("="))
+        max_spatial_split_replications = cin->get().Float();
+
+      else if (tok == Token::Id("presplits") && cin->trySymbol("="))
+        useSpatialPreSplits = cin->get().Int() != 0 ? true : false;
+
+      else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("="))
+        tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+      else if (tok == Token::Id("cache_size") && cin->trySymbol("="))
+        tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+
+      else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("="))
+        alloc_main_block_size = cin->get().Int();
+       else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("="))
+        alloc_num_main_slots = cin->get().Int();
+       else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("="))
+         alloc_thread_block_size = cin->get().Int();
+       else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("="))
+         alloc_single_thread_alloc = cin->get().Int();
+
+      cin->trySymbol(","); // optional , separator
+    }
+  }
+
+  bool State::verbosity(size_t N) {
+    return N <= verbose;
+  }
+
+  void State::print()
+  {
+    std::cout << "general:" << std::endl;
+    std::cout << "  build threads      = " << numThreads   << std::endl;
+    std::cout << "  build user threads = " << numUserThreads   << std::endl;
+    std::cout << "  start_threads      = " << start_threads << std::endl;
+    std::cout << "  affinity           = " << set_affinity << std::endl;
+    std::cout << "  frequency_level    = ";
+    switch (frequency_level) {
+    case FREQUENCY_SIMD128: std::cout << "simd128" << std::endl; break;
+    case FREQUENCY_SIMD256: std::cout << "simd256" << std::endl; break;
+    case FREQUENCY_SIMD512: std::cout << "simd512" << std::endl; break;
+    default: std::cout << "error" << std::endl; break;
+    }
+    
+    std::cout << "  hugepages          = ";
+    if (!hugepages) std::cout << "disabled" << std::endl;
+    else if (hugepages_success) std::cout << "enabled" << std::endl;
+    else std::cout << "failed" << std::endl;
+
+    std::cout << "  verbosity          = " << verbose << std::endl;
+    std::cout << "  cache_size         = " << float(tessellation_cache_size)*1E-6 << " MB" << std::endl;
+    std::cout << "  max_spatial_split_replications = " << max_spatial_split_replications << std::endl;
+    
+    std::cout << "triangles:" << std::endl;
+    std::cout << "  accel              = " << tri_accel << std::endl;
+    std::cout << "  builder            = " << tri_builder << std::endl;
+    std::cout << "  traverser          = " << tri_traverser << std::endl;
+        
+    std::cout << "motion blur triangles:" << std::endl;
+    std::cout << "  accel              = " << tri_accel_mb << std::endl;
+    std::cout << "  builder            = " << tri_builder_mb << std::endl;
+    std::cout << "  traverser          = " << tri_traverser_mb << std::endl;
+
+    std::cout << "quads:" << std::endl;
+    std::cout << "  accel              = " << quad_accel << std::endl;
+    std::cout << "  builder            = " << quad_builder << std::endl;
+    std::cout << "  traverser          = " << quad_traverser << std::endl;
+
+    std::cout << "motion blur quads:" << std::endl;
+    std::cout << "  accel              = " << quad_accel_mb << std::endl;
+    std::cout << "  builder            = " << quad_builder_mb << std::endl;
+    std::cout << "  traverser          = " << quad_traverser_mb << std::endl;
+
+    std::cout << "line segments:" << std::endl;
+    std::cout << "  accel              = " << line_accel << std::endl;
+    std::cout << "  builder            = " << line_builder << std::endl;
+    std::cout << "  traverser          = " << line_traverser << std::endl;
+
+    std::cout << "motion blur line segments:" << std::endl;
+    std::cout << "  accel              = " << line_accel_mb << std::endl;
+    std::cout << "  builder            = " << line_builder_mb << std::endl;
+    std::cout << "  traverser          = " << line_traverser_mb << std::endl;
+    
+    std::cout << "hair:" << std::endl;
+    std::cout << "  accel              = " << hair_accel << std::endl;
+    std::cout << "  builder            = " << hair_builder << std::endl;
+    std::cout << "  traverser          = " << hair_traverser << std::endl;
+
+    std::cout << "motion blur hair:" << std::endl;
+    std::cout << "  accel              = " << hair_accel_mb << std::endl;
+    std::cout << "  builder            = " << hair_builder_mb << std::endl;
+    std::cout << "  traverser          = " << hair_traverser_mb << std::endl;
+    
+    std::cout << "subdivision surfaces:" << std::endl;
+    std::cout << "  accel              = " << subdiv_accel << std::endl;
+
+    std::cout << "grids:" << std::endl;
+    std::cout << "  accel              = " << grid_accel << std::endl;
+    std::cout << "  builder            = " << grid_builder << std::endl;
+
+    std::cout << "motion blur grids:" << std::endl;
+    std::cout << "  accel              = " << grid_accel_mb << std::endl;
+    std::cout << "  builder            = " << grid_builder_mb << std::endl;
+
+    std::cout << "object_accel:" << std::endl;
+    std::cout << "  min_leaf_size      = " << object_accel_min_leaf_size << std::endl;
+    std::cout << "  max_leaf_size      = " << object_accel_max_leaf_size << std::endl;
+
+    std::cout << "object_accel_mb:" << std::endl;
+    std::cout << "  min_leaf_size      = " << object_accel_mb_min_leaf_size << std::endl;
+    std::cout << "  max_leaf_size      = " << object_accel_mb_max_leaf_size << std::endl;
+  }
+}
diff --git a/thirdparty/embree/kernels/common/state.h b/thirdparty/embree/kernels/common/state.h
new file mode 100644
index 0000000000..33bcc843b2
--- /dev/null
+++ b/thirdparty/embree/kernels/common/state.h
@@ -0,0 +1,196 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /* mutex to make printing to cout thread safe */
+  extern MutexSys g_printMutex;
+
+  struct State : public RefCount
+  {
+  public:
+    /*! state construction */
+    State ();
+
+    /*! state destruction */
+    ~State();
+
+    /*! verifies that state is correct */
+    void verify();
+
+    /*! parses state from a configuration file */
+    bool parseFile(const FileName& fileName);
+
+    /*! parses the state from a string */
+    void parseString(const char* cfg);
+
+    /*! parses the state from a stream */
+    void parse(Ref<TokenStream> cin);
+
+    /*! prints the state */
+    void print();
+
+    /*! checks if verbosity level is at least N */
+    bool verbosity(size_t N);
+
+    /*! checks if some particular ISA is enabled */
+    bool hasISA(const int isa);
+
+    /*! check whether selected ISA is supported by the HW */    
+    bool checkISASupport();
+    
+  public:
+    std::string tri_accel;                 //!< acceleration structure to use for triangles
+    std::string tri_builder;               //!< builder to use for triangles
+    std::string tri_traverser;             //!< traverser to use for triangles
+    
+  public:
+    std::string tri_accel_mb;              //!< acceleration structure to use for motion blur triangles
+    std::string tri_builder_mb;            //!< builder to use for motion blur triangles
+    std::string tri_traverser_mb;          //!< traverser to use for triangles
+
+  public:
+    std::string quad_accel;                 //!< acceleration structure to use for quads
+    std::string quad_builder;               //!< builder to use for quads
+    std::string quad_traverser;             //!< traverser to use for quads
+
+  public:
+    std::string quad_accel_mb;             //!< acceleration structure to use for motion blur quads
+    std::string quad_builder_mb;           //!< builder to use for motion blur quads
+    std::string quad_traverser_mb;         //!< traverser to use for motion blur quads
+
+  public:
+    std::string line_accel;                 //!< acceleration structure to use for line segments
+    std::string line_builder;               //!< builder to use for line segments
+    std::string line_traverser;             //!< traverser to use for line segments
+
+  public:
+    std::string line_accel_mb;             //!< acceleration structure to use for motion blur line segments
+    std::string line_builder_mb;           //!< builder to use for motion blur line segments
+    std::string line_traverser_mb;         //!< traverser to use for motion blur line segments
+
+  public:
+    std::string hair_accel;                //!< hair acceleration structure to use
+    std::string hair_builder;              //!< builder to use for hair
+    std::string hair_traverser;            //!< traverser to use for hair
+
+  public:
+    std::string hair_accel_mb;             //!< acceleration structure to use for motion blur hair
+    std::string hair_builder_mb;           //!< builder to use for motion blur hair
+    std::string hair_traverser_mb;         //!< traverser to use for motion blur hair
+
+  public:
+    std::string object_accel;               //!< acceleration structure for user geometries
+    std::string object_builder;             //!< builder for user geometries
+    int object_accel_min_leaf_size;         //!< minimum leaf size for object acceleration structure
+    int object_accel_max_leaf_size;         //!< maximum leaf size for object acceleration structure
+
+  public:
+    std::string object_accel_mb;            //!< acceleration structure for user geometries
+    std::string object_builder_mb;          //!< builder for user geometries
+    int object_accel_mb_min_leaf_size;      //!< minimum leaf size for mblur object acceleration structure
+    int object_accel_mb_max_leaf_size;      //!< maximum leaf size for mblur object acceleration structure
+
+  public:
+    std::string subdiv_accel;              //!< acceleration structure to use for subdivision surfaces
+    std::string subdiv_accel_mb;           //!< acceleration structure to use for subdivision surfaces
+
+  public:
+    std::string grid_accel;              //!< acceleration structure to use for grids
+    std::string grid_builder;            //!< builder for grids
+    std::string grid_accel_mb;           //!< acceleration structure to use for motion blur grids
+    std::string grid_builder_mb;         //!< builder for motion blur grids
+
+  public:
+    float max_spatial_split_replications;  //!< maximally replications*N many primitives in accel for spatial splits
+    bool useSpatialPreSplits;              //!< use spatial pre-splits instead of the full spatial split builder
+    size_t tessellation_cache_size;        //!< size of the shared tessellation cache 
+
+  public:
+    size_t instancing_open_min;            //!< instancing opens tree to minimally that number of subtrees
+    size_t instancing_block_size;          //!< instancing opens tree up to average block size of primitives
+    float  instancing_open_factor;         //!< instancing opens tree up to x times the number of instances
+    size_t instancing_open_max_depth;      //!< maximum open depth for geometries
+    size_t instancing_open_max;            //!< instancing opens tree to maximally that number of subtrees
+
+  public:
+    bool float_exceptions;                 //!< enable floating point exceptions
+    int quality_flags;
+    int scene_flags;
+    size_t verbose;                        //!< verbosity of output
+    size_t benchmark;                      //!< true
+    
+  public:
+    size_t numThreads;                     //!< number of threads to use in builders
+    size_t numUserThreads;                 //!< number of user provided threads to use in builders
+    bool set_affinity;                     //!< sets affinity for worker threads
+    bool start_threads;                    //!< true when threads should be started at device creation time
+    int enabled_cpu_features;              //!< CPU ISA features to use
+    int enabled_builder_cpu_features;      //!< CPU ISA features to use for builders only
+    enum FREQUENCY_LEVEL {
+      FREQUENCY_SIMD128,
+      FREQUENCY_SIMD256,
+      FREQUENCY_SIMD512
+    } frequency_level;                     //!< frequency level the app wants to run on (default is SIMD256)
+    bool enable_selockmemoryprivilege;     //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages
+    bool hugepages;                        //!< true if huge pages should get used
+    bool hugepages_success;                //!< status for enabling huge pages
+
+  public:
+    size_t alloc_main_block_size;          //!< main allocation block size (shared between threads)
+    int alloc_num_main_slots;              //!< number of such shared blocks to be used to allocate
+    size_t alloc_thread_block_size;        //!< size of thread local allocator block size
+    int alloc_single_thread_alloc;         //!< in single mode nodes and leaves use same thread local allocator
+
+  public:
+
+    /*! checks if we can use AVX */
+    bool canUseAVX() {
+      return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128;
+    }
+
+    /*! checks if we can use AVX2 */
+    bool canUseAVX2() {
+      return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128;
+    }
+    
+    struct ErrorHandler
+    {
+    public:
+      ErrorHandler();
+      ~ErrorHandler();
+      RTCError* error();
+
+    public:
+      tls_t thread_error;
+      std::vector<RTCError*> thread_errors;
+      MutexSys errors_mutex;
+    };
+    ErrorHandler errorHandler;
+    static ErrorHandler g_errorHandler;
+
+  public:
+    void setErrorFunction(RTCErrorFunction fptr, void* uptr) 
+    {
+      error_function = fptr;
+      error_function_userptr = uptr;
+    }
+
+    RTCErrorFunction error_function;
+    void* error_function_userptr;
+
+  public:
+    void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr) 
+    {
+      memory_monitor_function = fptr;
+      memory_monitor_userptr = uptr;
+    }
+      
+    RTCMemoryMonitorFunction memory_monitor_function;
+    void* memory_monitor_userptr;
+  };
+}
diff --git a/thirdparty/embree/kernels/common/vector.h b/thirdparty/embree/kernels/common/vector.h
new file mode 100644
index 0000000000..4b08275f3b
--- /dev/null
+++ b/thirdparty/embree/kernels/common/vector.h
@@ -0,0 +1,76 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "default.h"
+
+namespace embree
+{
+  /*! invokes the memory monitor callback */
+  struct MemoryMonitorInterface {
+    virtual void memoryMonitor(ssize_t bytes, bool post) = 0;
+  };
+
+  /*! allocator that performs aligned monitored allocations */
+  template<typename T, size_t alignment = 64>
+    struct aligned_monitored_allocator
+    {
+      typedef T value_type;
+      typedef T* pointer;
+      typedef const T* const_pointer;
+      typedef T& reference;
+      typedef const T& const_reference;
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+      
+      __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device) 
+        : device(device), hugepages(false) {}
+
+      __forceinline pointer allocate( size_type n ) 
+      {
+        if (n) {
+          assert(device);
+          device->memoryMonitor(n*sizeof(T),false);
+        }
+        if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M)
+        {
+          pointer p =  (pointer) os_malloc(n*sizeof(value_type),hugepages);
+          assert(p);
+          return p;
+        }
+        return (pointer) alignedMalloc(n*sizeof(value_type),alignment);
+      }
+
+      __forceinline void deallocate( pointer p, size_type n ) 
+      {
+        if (p)
+        {
+          if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M)
+            os_free(p,n*sizeof(value_type),hugepages); 
+          else
+            alignedFree(p);
+        }
+        else assert(n == 0);
+
+        if (n) {
+          assert(device);
+          device->memoryMonitor(-ssize_t(n)*sizeof(T),true);
+        }
+      }
+
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+
+    private:
+      MemoryMonitorInterface* device;
+      bool hugepages;
+    };
+
+  /*! monitored vector */
+  template<typename T>
+    using mvector = vector_t<T,aligned_monitored_allocator<T,std::alignment_of<T>::value> >;
+}
diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h
new file mode 100644
index 0000000000..80a8ab2a56
--- /dev/null
+++ b/thirdparty/embree/kernels/config.h
@@ -0,0 +1,76 @@
+
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+/* #undef EMBREE_RAY_MASK */
+/* #undef EMBREE_STAT_COUNTERS */
+/* #undef EMBREE_BACKFACE_CULLING */
+/* #undef EMBREE_BACKFACE_CULLING_CURVES */
+#define EMBREE_FILTER_FUNCTION
+/* #undef EMBREE_IGNORE_INVALID_RAYS */
+#define EMBREE_GEOMETRY_TRIANGLE
+/* #undef EMBREE_GEOMETRY_QUAD */
+/* #undef EMBREE_GEOMETRY_CURVE */
+/* #undef EMBREE_GEOMETRY_SUBDIVISION */
+/* #undef EMBREE_GEOMETRY_USER */
+/* #undef EMBREE_GEOMETRY_INSTANCE */
+/* #undef EMBREE_GEOMETRY_GRID */
+/* #undef EMBREE_GEOMETRY_POINT */
+/* #undef EMBREE_RAY_PACKETS */
+/* #undef EMBREE_COMPACT_POLYS */
+
+#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+  #define IF_ENABLED_TRIS(x) x
+#else
+  #define IF_ENABLED_TRIS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+  #define IF_ENABLED_QUADS(x) x
+#else
+  #define IF_ENABLED_QUADS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+  #define IF_ENABLED_CURVES_OR_POINTS(x) x
+#else
+  #define IF_ENABLED_CURVES_OR_POINTS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE)
+  #define IF_ENABLED_CURVES(x) x
+#else
+  #define IF_ENABLED_CURVES(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_POINT)
+  #define IF_ENABLED_POINTS(x) x
+#else
+  #define IF_ENABLED_POINTS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+  #define IF_ENABLED_SUBDIV(x) x
+#else
+  #define IF_ENABLED_SUBDIV(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+  #define IF_ENABLED_USER(x) x
+#else
+  #define IF_ENABLED_USER(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+  #define IF_ENABLED_INSTANCE(x) x
+#else
+  #define IF_ENABLED_INSTANCE(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_GRID)
+  #define IF_ENABLED_GRIDS(x) x
+#else
+  #define IF_ENABLED_GRIDS(x)
+#endif
diff --git a/thirdparty/embree/kernels/geometry/cone.h b/thirdparty/embree/kernels/geometry/cone.h
new file mode 100644
index 0000000000..17429bab32
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/cone.h
@@ -0,0 +1,321 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct Cone
+    {
+      const Vec3fa p0; //!< start position of cone
+      const Vec3fa p1; //!< end position of cone
+      const float r0;  //!< start radius of cone
+      const float r1;  //!< end radius of cone
+
+      __forceinline Cone(const Vec3fa& p0, const float r0, const Vec3fa& p1, const float r1) 
+        : p0(p0), p1(p1), r0(r0), r1(r1) {}
+
+      __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, 
+                                   BBox1f& t_o, 
+                                   float& u0_o, Vec3fa& Ng0_o, 
+                                   float& u1_o, Vec3fa& Ng1_o) const 
+      {
+        /* calculate quadratic equation to solve */
+        const Vec3fa v0 = p0-org;
+        const Vec3fa v1 = p1-org;
+        
+        const float rl = rcp_length(v1-v0);
+        const Vec3fa P0 = v0, dP = (v1-v0)*rl;
+        const float dr = (r1-r0)*rl;
+        const Vec3fa O = -P0, dO = dir;
+        
+        const float dOdO = dot(dO,dO);
+        const float OdO = dot(dO,O);
+        const float OO = dot(O,O);
+        const float dOz = dot(dP,dO);
+        const float Oz = dot(dP,O);
+
+        const float R = r0 + Oz*dr;          
+        const float A = dOdO - sqr(dOz) * (1.0f+sqr(dr));
+        const float B = 2.0f * (OdO - dOz*(Oz + R*dr));
+        const float C = OO - (sqr(Oz) + sqr(R));
+
+        /* we miss the cone if determinant is smaller than zero */
+        const float D = B*B - 4.0f*A*C;
+        if (D < 0.0f) return false;
+
+        /* special case for rays that are "parallel" to the cone */
+        const float eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        if (unlikely(abs(A) < eps))
+        {
+          /* cylinder case */
+          if (abs(dr) < 16.0f*float(ulp)) {
+            if (C <= 0.0f) { t_o = BBox1f(neg_inf,pos_inf); return true; } 
+            else           { t_o = BBox1f(pos_inf,neg_inf); return false; }
+          }
+
+          /* cone case */
+          else 
+          {
+            /* if we hit the negative cone there cannot be a hit */
+            const float t = -C/B;
+            const float z0 = Oz+t*dOz;
+            const float z0r = r0+z0*dr;
+            if (z0r < 0.0f) return false;
+
+            /* test if we start inside or outside the cone */
+            if (dOz*dr > 0.0f) t_o = BBox1f(t,pos_inf);
+            else               t_o = BBox1f(neg_inf,t);
+          }
+        }
+
+        /* standard case for "non-parallel" rays */
+        else
+        {
+          const float Q = sqrt(D);
+          const float rcp_2A = rcp(2.0f*A);
+          t_o.lower = (-B-Q)*rcp_2A;
+          t_o.upper = (-B+Q)*rcp_2A;
+          
+          /* standard case where both hits are on same cone */
+          if (likely(A > 0.0f)) {
+            const float z0 = Oz+t_o.lower*dOz;
+            const float z0r = r0+z0*dr;
+            if (z0r < 0.0f) return false;
+          } 
+
+          /* special case where the hits are on the positive and negative cone */
+          else 
+          {
+            /* depending on the ray direction and the open direction
+             * of the cone we have a hit from inside or outside the
+             * cone */
+            if (dOz*dr > 0) t_o.upper = pos_inf;
+            else            t_o.lower = neg_inf;
+          }
+        }
+
+        /* calculates u and Ng for near hit */
+        {
+          u0_o = (Oz+t_o.lower*dOz)*rl;
+          const Vec3fa Pr = t_o.lower*dir;
+          const Vec3fa Pl = v0 + u0_o*(v1-v0);
+          const Vec3fa R = normalize(Pr-Pl);
+          const Vec3fa U = (p1-p0)+(r1-r0)*R;
+          const Vec3fa V = cross(p1-p0,R);
+          Ng0_o = cross(V,U);
+        }
+
+        /* calculates u and Ng for far hit */
+        {
+          u1_o = (Oz+t_o.upper*dOz)*rl;
+          const Vec3fa Pr = t_o.upper*dir;
+          const Vec3fa Pl = v0 + u1_o*(v1-v0);
+          const Vec3fa R = normalize(Pr-Pl);
+          const Vec3fa U = (p1-p0)+(r1-r0)*R;
+          const Vec3fa V = cross(p1-p0,R);
+          Ng1_o = cross(V,U);
+        }
+        return true;
+      }
+
+      __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, BBox1f& t_o) const 
+      {
+        float u0_o; Vec3fa Ng0_o; float u1_o; Vec3fa Ng1_o;
+        return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+
+      static bool verify(const size_t id, const Cone& cone, const Ray& ray, bool shouldhit, const float t0, const float t1)
+      {
+        float eps = 0.001f;
+        BBox1f t; bool hit;
+        hit = cone.intersect(ray.org,ray.dir,t);
+
+        bool failed = hit != shouldhit;
+        if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : (t0 == -1E6) ? t.lower > -1E6f : abs(t0-t.lower) > eps;
+        if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : (t1 == +1E6) ? t.upper < +1E6f : abs(t1-t.upper) > eps;
+        if (!failed) return true;
+        embree_cout << "Cone test " << id << " failed: cone = " << cone << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; 
+        return false;
+      }
+
+      /* verify cone class */
+      static bool verify()
+      {
+        bool passed = true;
+        const Cone cone0(Vec3fa(0.0f,0.0f,0.0f),0.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f);
+        passed &= verify(0,cone0,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,3.0f,pos_inf);
+        passed &= verify(1,cone0,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f);
+        passed &= verify(2,cone0,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(3,cone0,Ray(Vec3fa(+1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,3.0f);
+        passed &= verify(4,cone0,Ray(Vec3fa(-1.0f,0.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,1.0f,pos_inf);
+        passed &= verify(5,cone0,Ray(Vec3fa(+1.0f,0.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f);
+        passed &= verify(6,cone0,Ray(Vec3fa(+0.0f,0.0f,1.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,1.0f);
+        passed &= verify(7,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(8,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(+1.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.5f,+1E6);
+        passed &= verify(9,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,+1.0f,+0.0f),0.0f,float(inf)),true,-1E6,-0.5f);
+        const Cone cone1(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),0.0f);
+        passed &= verify(10,cone1,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,2.0f);
+        passed &= verify(11,cone1,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,0.0f,4.0f);
+        const Cone cylinder(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f);
+        passed &= verify(12,cylinder,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(13,cylinder,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(14,cylinder,Ray(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(15,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(16,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(17,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        passed &= verify(18,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        return passed;
+      }
+
+      /*! output operator */
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cone& c) {
+        return cout << "Cone { p0 = " << c.p0 << ", r0 = " << c.r0 << ", p1 = " << c.p1 << ", r1 = " << c.r1 << "}";
+      }
+    };
+
+    template<int N>
+      struct ConeN
+    {
+      typedef Vec3<vfloat<N>> Vec3vfN;
+      
+      const Vec3vfN p0;     //!< start position of cone
+      const Vec3vfN p1;     //!< end position of cone
+      const vfloat<N> r0;   //!< start radius of cone
+      const vfloat<N> r1;   //!< end radius of cone
+
+      __forceinline ConeN(const Vec3vfN& p0, const vfloat<N>& r0, const Vec3vfN& p1, const vfloat<N>& r1) 
+        : p0(p0), p1(p1), r0(r0), r1(r1) {}
+
+      __forceinline Cone operator[] (const size_t i) const
+      {
+        assert(i<N);
+        return Cone(Vec3fa(p0.x[i],p0.y[i],p0.z[i]),r0[i],Vec3fa(p1.x[i],p1.y[i],p1.z[i]),r1[i]);
+      }
+
+      __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, 
+                                       BBox<vfloat<N>>& t_o, 
+                                       vfloat<N>& u0_o, Vec3vfN& Ng0_o, 
+                                       vfloat<N>& u1_o, Vec3vfN& Ng1_o) const
+      {
+        /* calculate quadratic equation to solve */
+        const Vec3vfN v0 = p0-Vec3vfN(org);
+        const Vec3vfN v1 = p1-Vec3vfN(org);
+
+        const vfloat<N> rl = rcp_length(v1-v0);
+        const Vec3vfN P0 = v0, dP = (v1-v0)*rl;
+        const vfloat<N> dr = (r1-r0)*rl;
+        const Vec3vfN O = -P0, dO = dir;
+       
+        const vfloat<N> dOdO = dot(dO,dO);
+        const vfloat<N> OdO = dot(dO,O);
+        const vfloat<N> OO = dot(O,O);
+        const vfloat<N> dOz = dot(dP,dO);
+        const vfloat<N> Oz = dot(dP,O);
+        
+        const vfloat<N> R = r0 + Oz*dr;          
+        const vfloat<N> A = dOdO - sqr(dOz) * (vfloat<N>(1.0f)+sqr(dr));
+        const vfloat<N> B = 2.0f * (OdO - dOz*(Oz + R*dr));
+        const vfloat<N> C = OO - (sqr(Oz) + sqr(R));
+
+        /* we miss the cone if determinant is smaller than zero */
+        const vfloat<N> D = B*B - 4.0f*A*C;
+        vbool<N> valid = D >= 0.0f;
+        if (none(valid)) return valid;
+
+        /* special case for rays that are "parallel" to the cone */
+        const vfloat<N> eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        const vbool<N> validt = valid &  (abs(A) < eps);
+        const vbool<N> validf = valid & !(abs(A) < eps);
+        if (unlikely(any(validt)))
+        {
+          const vboolx validtt = validt & (abs(dr) <  16.0f*float(ulp));
+          const vboolx validtf = validt & (abs(dr) >= 16.0f*float(ulp));
+          
+          /* cylinder case */
+          if (unlikely(any(validtt))) 
+          {
+            t_o.lower = select(validtt, select(C <= 0.0f, vfloat<N>(neg_inf), vfloat<N>(pos_inf)), t_o.lower);
+            t_o.upper = select(validtt, select(C <= 0.0f, vfloat<N>(pos_inf), vfloat<N>(neg_inf)), t_o.upper);
+            valid &= !validtt | C <= 0.0f;
+          }
+
+          /* cone case */
+          if (any(validtf)) 
+          {
+            /* if we hit the negative cone there cannot be a hit */
+            const vfloat<N> t = -C/B;
+            const vfloat<N> z0 = Oz+t*dOz;
+            const vfloat<N> z0r = r0+z0*dr;
+            valid &= !validtf | z0r >= 0.0f;
+
+            /* test if we start inside or outside the cone */
+            t_o.lower = select(validtf, select(dOz*dr > 0.0f, t, vfloat<N>(neg_inf)), t_o.lower);
+            t_o.upper = select(validtf, select(dOz*dr > 0.0f, vfloat<N>(pos_inf), t), t_o.upper);
+          }
+        }
+
+        /* standard case for "non-parallel" rays */
+        if (likely(any(validf)))
+        {
+          const vfloat<N> Q = sqrt(D);
+          const vfloat<N> rcp_2A = 0.5f*rcp(A);
+          t_o.lower = select(validf, (-B-Q)*rcp_2A, t_o.lower);
+          t_o.upper = select(validf, (-B+Q)*rcp_2A, t_o.upper);
+          
+          /* standard case where both hits are on same cone */
+          const vbool<N> validft = validf &   A>0.0f;
+          const vbool<N> validff = validf & !(A>0.0f);
+          if (any(validft)) {
+            const vfloat<N> z0 = Oz+t_o.lower*dOz;
+            const vfloat<N> z0r = r0+z0*dr;
+            valid &= !validft | z0r >= 0.0f;
+          } 
+
+          /* special case where the hits are on the positive and negative cone */
+          if (any(validff)) {
+            /* depending on the ray direction and the open direction
+             * of the cone we have a hit from inside or outside the
+             * cone */
+            t_o.lower = select(validff, select(dOz*dr > 0.0f, t_o.lower, float(neg_inf)), t_o.lower);
+            t_o.upper = select(validff, select(dOz*dr > 0.0f, float(pos_inf), t_o.upper), t_o.upper);
+          }
+        }
+
+        /* calculates u and Ng for near hit */
+        {
+          u0_o = (Oz+t_o.lower*dOz)*rl;
+          const Vec3vfN Pr = t_o.lower*Vec3vfN(dir);
+          const Vec3vfN Pl = v0 + u0_o*(v1-v0);
+          const Vec3vfN R = normalize(Pr-Pl);
+          const Vec3vfN U = (p1-p0)+(r1-r0)*R;
+          const Vec3vfN V = cross(p1-p0,R);
+          Ng0_o = cross(V,U);
+        }
+
+        /* calculates u and Ng for far hit */
+        {
+          u1_o = (Oz+t_o.upper*dOz)*rl;
+          const Vec3vfN Pr = t_o.lower*Vec3vfN(dir);
+          const Vec3vfN Pl = v0 + u1_o*(v1-v0);
+          const Vec3vfN R = normalize(Pr-Pl);
+          const Vec3vfN U = (p1-p0)+(r1-r0)*R;
+          const Vec3vfN V = cross(p1-p0,R);
+          Ng1_o = cross(V,U);
+        }
+        return valid;
+      }
+ 
+      __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const
+      {
+        vfloat<N> u0_o; Vec3vfN Ng0_o; vfloat<N> u1_o; Vec3vfN Ng1_o;
+        return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+    };
+  }
+}
+
diff --git a/thirdparty/embree/kernels/geometry/coneline_intersector.h b/thirdparty/embree/kernels/geometry/coneline_intersector.h
new file mode 100644
index 0000000000..90f3792eff
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/coneline_intersector.h
@@ -0,0 +1,209 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    namespace __coneline_internal 
+    {
+      template<int M, typename Epilog, typename ray_tfar_func>
+        static __forceinline bool intersectCone(const vbool<M>& valid_i,
+                                                const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir, 
+                                                const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar,
+                                                const Vec4vf<M>& v0, const Vec4vf<M>& v1,
+                                                const vbool<M>& cL, const vbool<M>& cR,
+                                                const Epilog& epilog)
+      {   
+        vbool<M> valid = valid_i;
+
+        /* move ray origin closer to make calculations numerically stable */
+        const vfloat<M> dOdO = sqr(ray_dir);
+        const vfloat<M> rcp_dOdO = rcp(dOdO);
+        const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz());
+        const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO;
+        const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir;
+
+        const Vec3vf<M> dP = v1.xyz() - v0.xyz();
+        const Vec3vf<M> p0 = ray_org - v0.xyz();
+        const Vec3vf<M> p1 = ray_org - v1.xyz();
+        
+        const vfloat<M> dPdP  = sqr(dP);
+        const vfloat<M> dP0   = dot(p0,dP);
+        const vfloat<M> dP1   = dot(p1,dP); 
+        const vfloat<M> dOdP  = dot(ray_dir,dP);
+
+        // intersect cone body
+        const vfloat<M> dr  = v0.w - v1.w;
+        const vfloat<M> hy  = dPdP + sqr(dr);
+        const vfloat<M> dO0 = dot(ray_dir,p0);
+        const vfloat<M> OO  = sqr(p0);
+        const vfloat<M> dPdP2 = sqr(dPdP);
+        const vfloat<M> dPdPr0 = dPdP*v0.w;
+        
+        const vfloat<M> A = dPdP2     - sqr(dOdP)*hy;
+        const vfloat<M> B = dPdP2*dO0 - dP0*dOdP*hy   + dPdPr0*(dr*dOdP);
+        const vfloat<M> C = dPdP2*OO  - sqr(dP0)*hy   + dPdPr0*(2.0f*dr*dP0 - dPdPr0);
+        
+        const vfloat<M> D = B*B - A*C;
+        valid &= D >= 0.0f;
+        if (unlikely(none(valid))) {
+          return false;
+        }
+
+        /* standard case for "non-parallel" rays */
+        const vfloat<M> Q = sqrt(D);
+        const vfloat<M> rcp_A = rcp(A);
+        /* special case for rays that are "parallel" to the cone - assume miss */
+        const vbool<M> isParallel = abs(A) <= min_rcp_input;
+
+        vfloat<M> t_cone_lower = select (isParallel, neg_inf, (-B-Q)*rcp_A);
+        vfloat<M> t_cone_upper = select (isParallel, pos_inf, (-B+Q)*rcp_A);
+        const vfloat<M> y_lower = dP0 + t_cone_lower*dOdP;
+        const vfloat<M> y_upper = dP0 + t_cone_upper*dOdP;
+        t_cone_lower = select(valid & y_lower > 0.0f & y_lower < dPdP, t_cone_lower, pos_inf);
+        t_cone_upper = select(valid & y_upper > 0.0f & y_upper < dPdP, t_cone_upper, neg_inf);
+
+        const vbool<M> hitDisk0 = valid & cL;
+        const vbool<M> hitDisk1 = valid & cR;
+        const vfloat<M> rcp_dOdP = rcp(dOdP);
+        const vfloat<M> t_disk0 = select (hitDisk0, select (sqr(p0*dOdP-ray_dir*dP0)<(sqr(v0.w)*sqr(dOdP)), -dP0*rcp_dOdP, pos_inf), pos_inf);
+        const vfloat<M> t_disk1 = select (hitDisk1, select (sqr(p1*dOdP-ray_dir*dP1)<(sqr(v1.w)*sqr(dOdP)), -dP1*rcp_dOdP, pos_inf), pos_inf);
+        const vfloat<M> t_disk_lower = min(t_disk0, t_disk1);
+        const vfloat<M> t_disk_upper = max(t_disk0, t_disk1);
+
+        const vfloat<M> t_lower = min(t_cone_lower, t_disk_lower);
+        const vfloat<M> t_upper = max(t_cone_upper, select(t_lower==t_disk_lower, 
+                                                      select(t_disk_upper==vfloat<M>(pos_inf),neg_inf,t_disk_upper), 
+                                                      select(t_disk_lower==vfloat<M>(pos_inf),neg_inf,t_disk_lower)));
+
+        const vbool<M> valid_lower = valid & ray_tnear <= dt+t_lower & dt+t_lower <= ray_tfar() & t_lower != vfloat<M>(pos_inf);
+        const vbool<M> valid_upper = valid & ray_tnear <= dt+t_upper & dt+t_upper <= ray_tfar() & t_upper != vfloat<M>(neg_inf);
+
+        const vbool<M> valid_first = valid_lower | valid_upper;
+        if (unlikely(none(valid_first)))
+          return false;
+
+        const vfloat<M> t_first = select(valid_lower, t_lower, t_upper);
+        const vfloat<M> y_first = select(valid_lower, y_lower, y_upper);
+
+        const vfloat<M> rcp_dPdP = rcp(dPdP);
+        const Vec3vf<M> dP2drr0dP = dPdP*dr*v0.w*dP;
+        const Vec3vf<M> dPhy = dP*hy;
+        const vbool<M> cone_hit_first = valid & (t_first == t_cone_lower | t_first == t_cone_upper);
+        const vbool<M> disk0_hit_first = valid & (t_first == t_disk0);
+        const Vec3vf<M> Ng_first = select(cone_hit_first, dPdP2*(p0+t_first*ray_dir)+dP2drr0dP-dPhy*y_first, select(disk0_hit_first, -dP, dP));
+        const vfloat<M> u_first = select(cone_hit_first, y_first*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+        /* invoke intersection filter for first hit */
+        RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first);
+        const bool is_hit_first = epilog(valid_first, hit);
+
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_upper;
+        const vfloat<M> y_second = y_upper;
+        const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_upper <= ray_tfar());
+        if (unlikely(none(valid_second)))
+          return is_hit_first;
+        
+        /* invoke intersection filter for second hit */
+        const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper;
+        const vbool<M> disk0_hit_second = t_second == t_disk0;
+        const Vec3vf<M> Ng_second = select(cone_hit_second, dPdP2*(p0+t_second*ray_dir)+dP2drr0dP-dPhy*y_second, select(disk0_hit_second, -dP, dP));
+        const vfloat<M> u_second = select(cone_hit_second, y_second*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+        hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+        
+        return is_hit_first | is_hit_second;
+      }
+    }
+
+    template<int M>
+      struct ConeLineIntersectorHitM
+      {
+        __forceinline ConeLineIntersectorHitM() {}
+        
+        __forceinline ConeLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+	
+        __forceinline void finalize() {}
+	
+        __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+        __forceinline float t  (const size_t i) const { return vt[i]; }
+        __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+	
+      public:
+        vfloat<M> vu;
+        vfloat<M> vv;
+        vfloat<M> vt;
+        Vec3vf<M> vNg;
+      };
+    
+    template<int M>
+      struct ConeCurveIntersector1
+      {
+        typedef CurvePrecalculations1 Precalculations;
+        
+        struct ray_tfar {
+          Ray& ray;
+          __forceinline ray_tfar(Ray& ray) : ray(ray) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar; };
+        };
+
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            Ray& ray,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const vbool<M>& cL, const vbool<M>& cR,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+          const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+          const vfloat<M> ray_tnear(ray.tnear());
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i);
+          return  __coneline_internal::intersectCone<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,cL,cR,epilog);
+        }
+      };
+    
+    template<int M, int K>
+      struct ConeCurveIntersectorK
+      {
+        typedef CurvePrecalculationsK<K> Precalculations;
+        
+        struct ray_tfar {
+          RayK<K>& ray;
+          size_t k;
+          __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar[k]; };
+        };
+        
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            RayK<K>& ray, size_t k,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const vbool<M>& cL, const vbool<M>& cR,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+          const vfloat<M> ray_tnear = ray.tnear()[k];
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i);
+          return __coneline_internal::intersectCone<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,cL,cR,epilog);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/conelinei_intersector.h b/thirdparty/embree/kernels/geometry/conelinei_intersector.h
new file mode 100644
index 0000000000..6a985ebcad
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/conelinei_intersector.h
@@ -0,0 +1,141 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "coneline_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, bool filter>
+    struct ConeCurveMiIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom);
+        const vbool<M> valid = line.valid();
+        ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom);
+        const vbool<M> valid = line.valid();
+        return ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+        return false;
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, bool filter>
+    struct ConeCurveMiMBIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom,ray.time());
+        const vbool<M> valid = line.valid();
+        ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom,ray.time());
+        const vbool<M> valid = line.valid();
+        return ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+        return false;
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct ConeCurveMiIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom);
+        const vbool<M> valid = line.valid();
+        ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom);
+        const vbool<M> valid = line.valid();
+        return ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct ConeCurveMiMBIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom,ray.time()[k]);
+        const vbool<M> valid = line.valid();
+        ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; 
+        vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom,ray.time()[k]);
+        const vbool<M> valid = line.valid();
+        return ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curveNi.h b/thirdparty/embree/kernels/geometry/curveNi.h
new file mode 100644
index 0000000000..6366a6fb9c
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curveNi.h
@@ -0,0 +1,222 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  template<int M>
+    struct CurveNi
+  {
+    struct Type : public PrimitiveType {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+
+    static __forceinline size_t bytes(size_t N)
+    {
+      const size_t f = N/M, r = N%M;
+      static_assert(sizeof(CurveNi) == 22+25*M, "internal data layout issue");
+      return f*sizeof(CurveNi) + (r!=0)*(22 + 25*r);
+    }
+
+  public:
+
+    /*! Default constructor. */
+    __forceinline CurveNi () {}
+
+    /*! fill curve from curve list */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene)
+    {  
+      size_t end = min(begin+M,_end);
+      N = (unsigned char)(end-begin);
+      const unsigned int geomID0 = prims[begin].geomID();
+      this->geomID(N) = geomID0;
+      ty = (unsigned char) scene->get(geomID0)->getType();
+
+      /* encode all primitives */
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<N; i++)
+      {
+        const PrimRef& prim = prims[begin+i];
+        const unsigned int geomID = prim.geomID(); assert(geomID == geomID0);
+        const unsigned int primID = prim.primID();
+        bounds.extend(scene->get(geomID)->vbounds(primID));
+      }
+
+      /* calculate offset and scale */
+      Vec3fa loffset = bounds.lower;
+      float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f)));
+      if (bounds.size() == Vec3fa(zero)) lscale = 0.0f;
+      *this->offset(N) = loffset;
+      *this->scale(N) = lscale;
+      
+      /* encode all primitives */
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRef& prim = prims[begin];
+        const unsigned int geomID = prim.geomID();
+        const unsigned int primID = prim.primID();
+        const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpace(primID);
+        
+        const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
+        const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID);
+        
+        bounds_vx_x(N)[i] = (char) space3.vx.x;
+        bounds_vx_y(N)[i] = (char) space3.vx.y;
+        bounds_vx_z(N)[i] = (char) space3.vx.z;
+        bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f);
+        bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f);
+
+        bounds_vy_x(N)[i] = (char) space3.vy.x;
+        bounds_vy_y(N)[i] = (char) space3.vy.y;
+        bounds_vy_z(N)[i] = (char) space3.vy.z;
+        bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f);
+        bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f);
+
+        bounds_vz_x(N)[i] = (char) space3.vz.x;
+        bounds_vz_y(N)[i] = (char) space3.vz.y;
+        bounds_vz_z(N)[i] = (char) space3.vz.z;
+        bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f);
+        bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.upper.z) && ceil (bounds.upper.z) <= 32767.0f);
+               
+        this->primID(N)[i] = primID;
+      }
+    }
+
+    template<typename BVH, typename Allocator>
+      __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+    {
+      size_t start = set.begin();
+      size_t items = CurveNi::blocks(set.size());
+      size_t numbytes = CurveNi::bytes(set.size());
+      CurveNi* accel = (CurveNi*) alloc.malloc1(numbytes,BVH::byteAlignment);
+      for (size_t i=0; i<items; i++) {
+        accel[i].fill(prims,start,set.end(),bvh->scene);
+      }
+      return bvh->encodeLeaf((char*)accel,items);
+    };
+    
+  public:
+    
+    // 27.6 - 46 bytes per primitive
+    unsigned char ty;
+    unsigned char N;
+    unsigned char data[4+25*M+16];
+
+    /*
+    struct Layout
+    {
+      unsigned int geomID;
+      unsigned int primID[N];
+      
+      char bounds_vx_x[N];
+      char bounds_vx_y[N];
+      char bounds_vx_z[N];
+      short bounds_vx_lower[N];
+      short bounds_vx_upper[N];
+      
+      char bounds_vy_x[N];
+      char bounds_vy_y[N];
+      char bounds_vy_z[N];
+      short bounds_vy_lower[N];
+      short bounds_vy_upper[N];
+      
+      char bounds_vz_x[N];
+      char bounds_vz_y[N];
+      char bounds_vz_z[N];
+      short bounds_vz_lower[N];
+      short bounds_vz_upper[N];
+      
+      Vec3f offset;
+      float scale;
+    };
+    */
+    
+    __forceinline       unsigned int& geomID(size_t N)       { return *(unsigned int*)((char*)this+2); }
+    __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); }
+    
+    __forceinline       unsigned int* primID(size_t N)       { return (unsigned int*)((char*)this+6); }
+    __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); }
+    
+    __forceinline       char* bounds_vx_x(size_t N)       { return (char*)((char*)this+6+4*N); }
+    __forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); }
+    
+    __forceinline       char* bounds_vx_y(size_t N)       { return (char*)((char*)this+6+5*N); }
+    __forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); }
+    
+    __forceinline       char* bounds_vx_z(size_t N)       { return (char*)((char*)this+6+6*N); }
+    __forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); }
+    
+    __forceinline       short* bounds_vx_lower(size_t N)       { return (short*)((char*)this+6+7*N); }
+    __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((char*)this+6+7*N); }
+    
+    __forceinline       short* bounds_vx_upper(size_t N)       { return (short*)((char*)this+6+9*N); }
+    __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((char*)this+6+9*N); }
+    
+    __forceinline       char* bounds_vy_x(size_t N)       { return (char*)((char*)this+6+11*N); }
+    __forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+11*N); }
+    
+    __forceinline       char* bounds_vy_y(size_t N)       { return (char*)((char*)this+6+12*N); }
+    __forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+12*N); }
+    
+    __forceinline       char* bounds_vy_z(size_t N)       { return (char*)((char*)this+6+13*N); }
+    __forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+13*N); }
+    
+    __forceinline       short* bounds_vy_lower(size_t N)       { return (short*)((char*)this+6+14*N); }
+    __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((char*)this+6+14*N); }
+    
+    __forceinline       short* bounds_vy_upper(size_t N)       { return (short*)((char*)this+6+16*N); }
+    __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((char*)this+6+16*N); }
+    
+    __forceinline       char* bounds_vz_x(size_t N)       { return (char*)((char*)this+6+18*N); }
+    __forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+18*N); }
+    
+    __forceinline       char* bounds_vz_y(size_t N)       { return (char*)((char*)this+6+19*N); }
+    __forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+19*N); }
+    
+    __forceinline       char* bounds_vz_z(size_t N)       { return (char*)((char*)this+6+20*N); }
+    __forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+20*N); }
+    
+    __forceinline       short* bounds_vz_lower(size_t N)       { return (short*)((char*)this+6+21*N); }
+    __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((char*)this+6+21*N); }
+    
+    __forceinline       short* bounds_vz_upper(size_t N)       { return (short*)((char*)this+6+23*N); }
+    __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((char*)this+6+23*N); }
+    
+    __forceinline       Vec3f* offset(size_t N)       { return (Vec3f*)((char*)this+6+25*N); }
+    __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+25*N); }
+    
+    __forceinline       float* scale(size_t N)       { return (float*)((char*)this+6+25*N+12); }
+    __forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+25*N+12); }
+
+    __forceinline       char* end(size_t N)       { return (char*)this+6+25*N+16; }
+    __forceinline const char* end(size_t N) const { return (char*)this+6+25*N+16; }
+  };
+
+  template<int M>
+    typename CurveNi<M>::Type CurveNi<M>::type;
+
+  typedef CurveNi<4> Curve4i;
+  typedef CurveNi<8> Curve8i;
+}
diff --git a/thirdparty/embree/kernels/geometry/curveNi_intersector.h b/thirdparty/embree/kernels/geometry/curveNi_intersector.h
new file mode 100644
index 0000000000..c0b66515c1
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curveNi_intersector.h
@@ -0,0 +1,569 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct CurveNiIntersector1
+    {
+      typedef CurveNi<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o)
+      {
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+        const Vec3fa org1 = (ray.org-offset)*scale;
+        const Vec3fa dir1 = ray.dir*scale;
+        
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+       
+        const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp));
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar);
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+         
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+    };
+
+    template<int M, int K>
+      struct CurveNiIntersectorK
+    {
+      typedef CurveNi<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculationsK<K> Precalculations;
+      
+      static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o)
+      {
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+
+        const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+        const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+        const Vec3fa org1 = (ray_org-offset)*scale;
+        const Vec3fa dir1 = ray_dir*scale;
+        
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+       
+        const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp));
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k]));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k]));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar);
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+      
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curveNi_mb.h b/thirdparty/embree/kernels/geometry/curveNi_mb.h
new file mode 100644
index 0000000000..5d972b43a0
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curveNi_mb.h
@@ -0,0 +1,278 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  template<int M>
+    struct CurveNiMB
+  {
+    struct Type : public PrimitiveType {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+
+    static __forceinline size_t bytes(size_t N)
+    {
+      const size_t f = N/M, r = N%M;
+      static_assert(sizeof(CurveNiMB) == 6+37*M+24, "internal data layout issue");
+      return f*sizeof(CurveNiMB) + (r!=0)*(6+37*r+24);
+    }
+
+  public:
+
+    /*! Default constructor. */
+    __forceinline CurveNiMB () {}
+
+    /*! fill curve from curve list */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range)
+    {
+      size_t end = min(begin+M,_end);
+      N = (unsigned char)(end-begin);
+      const unsigned int geomID0 = prims[begin].geomID();
+      this->geomID(N) = geomID0;
+      ty = (unsigned char) scene->get(geomID0)->getType();
+
+      /* encode all primitives */
+      LBBox3fa lbounds = empty;
+      for (size_t i=0; i<N; i++)
+      {
+        const PrimRefMB& prim = prims[begin+i];
+        const unsigned int geomID = prim.geomID(); assert(geomID == geomID0);
+        const unsigned int primID = prim.primID();
+        lbounds.extend(scene->get(geomID)->vlinearBounds(primID,time_range));
+      }
+      BBox3fa bounds = lbounds.bounds();
+
+      /* calculate offset and scale */
+      Vec3fa loffset = bounds.lower;
+      float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f)));
+      if (bounds.size() == Vec3fa(zero)) lscale = 0.0f;
+      *this->offset(N) = loffset;
+      *this->scale(N) = lscale;
+      this->time_offset(N) = time_range.lower;
+      this->time_scale(N) = 1.0f/time_range.size();
+      
+      /* encode all primitives */
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRefMB& prim = prims[begin];
+        const unsigned int geomID = prim.geomID();
+        const unsigned int primID = prim.primID();
+        const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpaceMB(primID,time_range);
+        
+        const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
+        const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range);
+        
+        // NOTE: this weird (char) (short) cast works around VS2015 Win32 compiler bug
+        bounds_vx_x(N)[i] = (char) (short) space3.vx.x;
+        bounds_vx_y(N)[i] = (char) (short) space3.vx.y;
+        bounds_vx_z(N)[i] = (char) (short) space3.vx.z;
+        bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f);
+        bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f);
+        bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f);
+        bounds_vx_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.x),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds0.lower.x) && floor(bounds.bounds0.lower.x) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds0.upper.x) && ceil (bounds.bounds0.upper.x) <= 32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f);
+        
+        bounds_vy_x(N)[i] = (char) (short) space3.vy.x;
+        bounds_vy_y(N)[i] = (char) (short) space3.vy.y;
+        bounds_vy_z(N)[i] = (char) (short) space3.vy.z;
+        bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f);
+        bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f);
+        bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f);
+        bounds_vy_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.y),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds0.lower.y) && floor(bounds.bounds0.lower.y) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds0.upper.y) && ceil (bounds.bounds0.upper.y) <= 32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f);
+
+        bounds_vz_x(N)[i] = (char) (short) space3.vz.x;
+        bounds_vz_y(N)[i] = (char) (short) space3.vz.y;
+        bounds_vz_z(N)[i] = (char) (short) space3.vz.z;
+        bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f);
+        bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f);
+        bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f);
+        bounds_vz_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.z),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds0.lower.z) && floor(bounds.bounds0.lower.z) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds0.upper.z) && ceil (bounds.bounds0.upper.z) <= 32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds1.lower.z) && floor(bounds.bounds1.lower.z) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds1.upper.z) && ceil (bounds.bounds1.upper.z) <= 32767.0f);
+               
+        this->primID(N)[i] = primID;
+      }
+      
+      return lbounds;
+    }
+
+    template<typename BVH, typename SetMB, typename Allocator>
+    __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+    {
+      size_t start = prims.begin();
+      size_t end   = prims.end();
+      size_t items = CurveNiMB::blocks(prims.size());
+      size_t numbytes = CurveNiMB::bytes(prims.size());
+      CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment);
+      const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items);
+      
+      LBBox3fa bounds = empty;
+      for (size_t i=0; i<items; i++)
+        bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range));
+      
+      return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range);
+    };
+
+    
+  public:
+    
+    // 27.6 - 46 bytes per primitive
+    unsigned char ty;
+    unsigned char N;
+    unsigned char data[4+37*M+24];
+
+    /*
+    struct Layout
+    {
+      unsigned int geomID;
+      unsigned int primID[N];
+      
+      char bounds_vx_x[N];
+      char bounds_vx_y[N];
+      char bounds_vx_z[N];
+      short bounds_vx_lower0[N];
+      short bounds_vx_upper0[N];
+      short bounds_vx_lower1[N];
+      short bounds_vx_upper1[N];
+      
+      char bounds_vy_x[N];
+      char bounds_vy_y[N];
+      char bounds_vy_z[N];
+      short bounds_vy_lower0[N];
+      short bounds_vy_upper0[N];
+      short bounds_vy_lower1[N];
+      short bounds_vy_upper1[N];
+      
+      char bounds_vz_x[N];
+      char bounds_vz_y[N];
+      char bounds_vz_z[N];
+      short bounds_vz_lower0[N];
+      short bounds_vz_upper0[N];
+      short bounds_vz_lower1[N];
+      short bounds_vz_upper1[N];
+      
+      Vec3f offset;
+      float scale;
+
+      float time_offset;
+      float time_scale;
+    };
+    */
+    
+    __forceinline       unsigned int& geomID(size_t N)       { return *(unsigned int*)((char*)this+2); }
+    __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); }
+    
+    __forceinline       unsigned int* primID(size_t N)       { return (unsigned int*)((char*)this+6); }
+    __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); }
+    
+    __forceinline       char* bounds_vx_x(size_t N)       { return (char*)((char*)this+6+4*N); }
+    __forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); }
+    
+    __forceinline       char* bounds_vx_y(size_t N)       { return (char*)((char*)this+6+5*N); }
+    __forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); }
+    
+    __forceinline       char* bounds_vx_z(size_t N)       { return (char*)((char*)this+6+6*N); }
+    __forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); }
+    
+    __forceinline       short* bounds_vx_lower0(size_t N)       { return (short*)((char*)this+6+7*N); }
+    __forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((char*)this+6+7*N); }
+    
+    __forceinline       short* bounds_vx_upper0(size_t N)       { return (short*)((char*)this+6+9*N); }
+    __forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((char*)this+6+9*N); }
+
+    __forceinline       short* bounds_vx_lower1(size_t N)       { return (short*)((char*)this+6+11*N); }
+    __forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((char*)this+6+11*N); }
+    
+    __forceinline       short* bounds_vx_upper1(size_t N)       { return (short*)((char*)this+6+13*N); }
+    __forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((char*)this+6+13*N); }
+
+    __forceinline       char* bounds_vy_x(size_t N)       { return (char*)((char*)this+6+15*N); }
+    __forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+15*N); }
+    
+    __forceinline       char* bounds_vy_y(size_t N)       { return (char*)((char*)this+6+16*N); }
+    __forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+16*N); }
+    
+    __forceinline       char* bounds_vy_z(size_t N)       { return (char*)((char*)this+6+17*N); }
+    __forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+17*N); }
+    
+    __forceinline       short* bounds_vy_lower0(size_t N)       { return (short*)((char*)this+6+18*N); }
+    __forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((char*)this+6+18*N); }
+    
+    __forceinline       short* bounds_vy_upper0(size_t N)       { return (short*)((char*)this+6+20*N); }
+    __forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((char*)this+6+20*N); }
+
+    __forceinline       short* bounds_vy_lower1(size_t N)       { return (short*)((char*)this+6+22*N); }
+    __forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((char*)this+6+22*N); }
+    
+    __forceinline       short* bounds_vy_upper1(size_t N)       { return (short*)((char*)this+6+24*N); }
+    __forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((char*)this+6+24*N); }
+    
+    __forceinline       char* bounds_vz_x(size_t N)       { return (char*)((char*)this+6+26*N); }
+    __forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+26*N); }
+    
+    __forceinline       char* bounds_vz_y(size_t N)       { return (char*)((char*)this+6+27*N); }
+    __forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+27*N); }
+    
+    __forceinline       char* bounds_vz_z(size_t N)       { return (char*)((char*)this+6+28*N); }
+    __forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+28*N); }
+    
+    __forceinline       short* bounds_vz_lower0(size_t N)       { return (short*)((char*)this+6+29*N); }
+    __forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((char*)this+6+29*N); }
+    
+    __forceinline       short* bounds_vz_upper0(size_t N)       { return (short*)((char*)this+6+31*N); }
+    __forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((char*)this+6+31*N); }
+
+    __forceinline       short* bounds_vz_lower1(size_t N)       { return (short*)((char*)this+6+33*N); }
+    __forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((char*)this+6+33*N); }
+    
+    __forceinline       short* bounds_vz_upper1(size_t N)       { return (short*)((char*)this+6+35*N); }
+    __forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((char*)this+6+35*N); }
+
+    __forceinline       Vec3f* offset(size_t N)       { return (Vec3f*)((char*)this+6+37*N); }
+    __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+37*N); }
+    
+    __forceinline       float* scale(size_t N)       { return (float*)((char*)this+6+37*N+12); }
+    __forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+37*N+12); }
+
+    __forceinline       float& time_offset(size_t N)       { return *(float*)((char*)this+6+37*N+16); }
+    __forceinline const float& time_offset(size_t N) const { return *(float*)((char*)this+6+37*N+16); }
+    
+    __forceinline       float& time_scale(size_t N)       { return *(float*)((char*)this+6+37*N+20); }
+    __forceinline const float& time_scale(size_t N) const { return *(float*)((char*)this+6+37*N+20); }
+
+    __forceinline       char* end(size_t N)       { return (char*)this+6+37*N+24; }
+    __forceinline const char* end(size_t N) const { return (char*)this+6+37*N+24; }
+  };
+
+  template<int M>
+    typename CurveNiMB<M>::Type CurveNiMB<M>::type;
+
+  typedef CurveNiMB<4> Curve4iMB;
+  typedef CurveNiMB<8> Curve8iMB;
+}
diff --git a/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h b/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h
new file mode 100644
index 0000000000..bab796b33b
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h
@@ -0,0 +1,516 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi_mb.h"
+#include "../subdiv/linear_bezier_patch.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct CurveNiMBIntersector1
+    {
+      typedef CurveNiMB<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o)
+      {
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+        const Vec3fa org1 = (ray.org-offset)*scale;
+        const Vec3fa dir1 = ray.dir*scale;
+        
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+
+        const vfloat<M> ltime = (ray.time()-prim.time_offset(N))*prim.time_scale(N);
+        const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N));
+        const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N));
+        const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0);
+        const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N));
+        const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N));
+        const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0);
+
+        const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N));
+        const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N));
+        const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0);
+        const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N));
+        const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N));
+        const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0);
+        
+        const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N));
+        const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N));
+        const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0);
+        const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N));
+        const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N));
+        const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0);
+       
+        const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp));
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar);
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time());
+
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time());
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+              return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+          Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)))
+              return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time());
+          Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time());
+          if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)))
+              return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+          Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+          if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)))
+              return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+    };
+
+    template<int M, int K>
+      struct CurveNiMBIntersectorK
+    {
+      typedef CurveNiMB<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o)
+      {
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+
+        const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+        const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+        const Vec3fa org1 = (ray_org-offset)*scale;
+        const Vec3fa dir1 = ray_dir*scale;
+        
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+
+        const vfloat<M> ltime = (ray.time()[k]-prim.time_offset(N))*prim.time_scale(N);
+        const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N));
+        const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N));
+        const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0);
+        const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N));
+        const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N));
+        const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0);
+
+        const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N));
+        const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N));
+        const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0);
+        const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N));
+        const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N));
+        const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0);
+        
+        const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N));
+        const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N));
+        const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0);
+        const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N));
+        const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N));
+        const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0);
+       
+        const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp));
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k]));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k]));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar);
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]);
+
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]);
+
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+          Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+          
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)))
+            return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]);
+          Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]);
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)))
+            return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+          Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)))
+            return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curveNv.h b/thirdparty/embree/kernels/geometry/curveNv.h
new file mode 100644
index 0000000000..e41a381706
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curveNv.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi.h"
+
+namespace embree
+{
+  template<int M>
+    struct CurveNv : public CurveNi<M>
+  {
+    using CurveNi<M>::N;
+      
+    struct Type : public PrimitiveType {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+
+    static __forceinline size_t bytes(size_t N)
+    {
+      const size_t f = N/M, r = N%M;
+      static_assert(sizeof(CurveNv) == 22+25*M+4*16*M, "internal data layout issue");
+      return f*sizeof(CurveNv) + (r!=0)*(22 + 25*r + 4*16*r);
+    }
+
+  public:
+
+    /*! Default constructor. */
+    __forceinline CurveNv () {}
+
+    /*! fill curve from curve list */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene)
+    {
+      size_t end = min(begin+M,_end);
+      size_t N = end-begin;
+
+      /* encode all primitives */
+      for (size_t i=0; i<N; i++)
+      {
+        const PrimRef& prim = prims[begin+i];
+        const unsigned int geomID = prim.geomID();
+        const unsigned int primID = prim.primID();
+        CurveGeometry* mesh = (CurveGeometry*) scene->get(geomID);
+        const unsigned vtxID = mesh->curve(primID);
+        Vec3fa::storeu(&this->vertices(i,N)[0],mesh->vertex(vtxID+0));
+        Vec3fa::storeu(&this->vertices(i,N)[1],mesh->vertex(vtxID+1));
+        Vec3fa::storeu(&this->vertices(i,N)[2],mesh->vertex(vtxID+2));
+        Vec3fa::storeu(&this->vertices(i,N)[3],mesh->vertex(vtxID+3));
+      }
+    }
+
+    template<typename BVH, typename Allocator>
+      __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+    {
+      if (set.size() == 0)
+        return BVH::emptyNode;
+      
+      /* fall back to CurveNi for oriented curves */
+      unsigned int geomID = prims[set.begin()].geomID();
+      if (bvh->scene->get(geomID)->getCurveType() == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) {
+        return CurveNi<M>::createLeaf(bvh,prims,set,alloc);
+      }
+      if (bvh->scene->get(geomID)->getCurveBasis() == Geometry::GTY_BASIS_HERMITE) {
+        return CurveNi<M>::createLeaf(bvh,prims,set,alloc);
+      }
+      
+      size_t start = set.begin();
+      size_t items = CurveNv::blocks(set.size());
+      size_t numbytes = CurveNv::bytes(set.size());
+      CurveNv* accel = (CurveNv*) alloc.malloc1(numbytes,BVH::byteAlignment);
+      for (size_t i=0; i<items; i++) {
+        accel[i].CurveNv<M>::fill(prims,start,set.end(),bvh->scene);
+        accel[i].CurveNi<M>::fill(prims,start,set.end(),bvh->scene);
+      }
+      return bvh->encodeLeaf((char*)accel,items);
+    };
+    
+  public:
+    unsigned char data[4*16*M];
+    __forceinline       Vec3fa* vertices(size_t i, size_t N)       { return (Vec3fa*)CurveNi<M>::end(N)+4*i; }
+    __forceinline const Vec3fa* vertices(size_t i, size_t N) const { return (Vec3fa*)CurveNi<M>::end(N)+4*i; }
+  };
+
+  template<int M>
+    typename CurveNv<M>::Type CurveNv<M>::type;
+
+  typedef CurveNv<4> Curve4v;
+  typedef CurveNv<8> Curve8v;
+}
diff --git a/thirdparty/embree/kernels/geometry/curveNv_intersector.h b/thirdparty/embree/kernels/geometry/curveNv_intersector.h
new file mode 100644
index 0000000000..2742725aec
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curveNv_intersector.h
@@ -0,0 +1,181 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNv.h"
+#include "curveNi_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct CurveNvIntersector1 : public CurveNiIntersector1<M>
+    {
+      typedef CurveNv<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]);
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]);
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+          
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+    };
+
+    template<int M, int K>
+      struct CurveNvIntersectorK : public CurveNiIntersectorK<M,K>
+    {
+      typedef CurveNv<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]);
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+
+          size_t mask1 = mask;
+          const size_t i1 = bscf(mask1);
+          if (mask) {
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]);
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector.h b/thirdparty/embree/kernels/geometry/curve_intersector.h
new file mode 100644
index 0000000000..1e8ac26125
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../subdiv/bezier_curve.h"
+#include "../common/primref.h"
+#include "bezier_hair_intersector.h"
+#include "bezier_ribbon_intersector.h"
+#include "bezier_curve_intersector.h"
+#include "oriented_curve_intersector.h"
+#include "../bvh/node_intersector1.h"
+
+// FIXME: this file seems replicate of curve_intersector_virtual.h
+
+namespace embree
+{
+  namespace isa
+  {
+    struct VirtualCurveIntersector1
+    {
+      typedef unsigned char Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      
+      template<int N, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        RTCGeometryType ty = (RTCGeometryType)(*prim);
+        assert(This->leafIntersector);
+        VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+        leafIntersector.intersect<1>(&pre,&ray,context,prim);
+      }
+      
+      template<int N, bool robust>        
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        RTCGeometryType ty = (RTCGeometryType)(*prim);
+        assert(This->leafIntersector);
+        VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+        return leafIntersector.occluded<1>(&pre,&ray,context,prim);
+      }
+    };
+
+    template<int K>
+      struct VirtualCurveIntersectorK 
+      {
+        typedef unsigned char Primitive;
+        typedef CurvePrecalculationsK<K> Precalculations;
+        
+        static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+          size_t mask = movemask(valid_i);
+          while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim);
+        }
+        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+          vbool<K> valid_o = false;
+          size_t mask = movemask(valid_i);
+          while (mask) {
+            size_t k = bscf(mask);
+            if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim))
+              set(valid_o, k);
+          }
+          return valid_o;
+        }
+        
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+          leafIntersector.intersect<K>(&pre,&ray,k,context,prim);
+        }
+        
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+          return leafIntersector.occluded<K>(&pre,&ray,k,context,prim);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_distance.h b/thirdparty/embree/kernels/geometry/curve_intersector_distance.h
new file mode 100644
index 0000000000..748a9511a5
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_distance.h
@@ -0,0 +1,129 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename NativeCurve3fa, int M>
+    struct DistanceCurveHit
+    {
+      __forceinline DistanceCurveHit() {}
+
+      __forceinline DistanceCurveHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N,
+                                     const NativeCurve3fa& curve3D)
+        : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {}
+      
+      __forceinline void finalize() 
+      {
+        vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N));
+        vv = V;
+        vt = T;
+      }
+      
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { 
+        return curve3D.eval_du(vu[i]);
+      }
+      
+    public:
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> T;
+      int i, N;
+      NativeCurve3fa curve3D;
+      
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+    };
+    
+    template<typename NativeCurve3fa>
+    struct DistanceCurve1Intersector1
+    {
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculations1& pre,Ray& ray,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2, const Vec3fa& v3,
+                                   const Epilog& epilog)
+      {
+        const int N = geom->tessellationRate;
+        
+        /* transform control points into ray space */
+        const NativeCurve3fa curve3Di(v0,v1,v2,v3);
+        const NativeCurve3fa curve3D = enlargeRadiusToMinWidth(context,geom,ray.org,curve3Di);
+        const NativeCurve3fa curve2D = curve3D.xfm_pr(pre.ray_space,ray.org);
+      
+        /* evaluate the bezier curve */
+        vboolx valid = vfloatx(step) < vfloatx(float(N));
+        const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N);
+        const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N);
+
+        /* approximative intersection with cone */
+        const Vec4vfx v = p1-p0;
+        const Vec4vfx w = -p0;
+        const vfloatx d0 = madd(w.x,v.x,w.y*v.y);
+        const vfloatx d1 = madd(v.x,v.x,v.y*v.y);
+        const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one));
+        const Vec4vfx p = madd(u,v,p0);
+        const vfloatx t = p.z*pre.depth_scale;
+        const vfloatx d2 = madd(p.x,p.x,p.y*p.y); 
+        const vfloatx r = p.w;
+        const vfloatx r2 = r*r;
+        valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar));
+        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) 
+          valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
+
+        /* update hit information */
+        bool ishit = false;
+        if (unlikely(any(valid))) {
+          DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,0,N,curve3D);
+          ishit = ishit | epilog(valid,hit);
+        }
+
+        if (unlikely(VSIZEX < N)) 
+        {
+          /* process SIMD-size many segments per iteration */
+          for (int i=VSIZEX; i<N; i+=VSIZEX)
+          {
+            /* evaluate the bezier curve */
+            vboolx valid = vintx(i)+vintx(step) < vintx(N);
+            const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
+            const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
+            
+            /* approximative intersection with cone */
+            const Vec4vfx v = p1-p0;
+            const Vec4vfx w = -p0;
+            const vfloatx d0 = madd(w.x,v.x,w.y*v.y);
+            const vfloatx d1 = madd(v.x,v.x,v.y*v.y);
+            const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one));
+            const Vec4vfx p = madd(u,v,p0);
+            const vfloatx t = p.z*pre.depth_scale;
+            const vfloatx d2 = madd(p.x,p.x,p.y*p.y); 
+            const vfloatx r = p.w;
+            const vfloatx r2 = r*r;
+            valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar));
+            if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+              valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
+
+             /* update hit information */
+            if (unlikely(any(valid))) {
+              DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,i,N,curve3D);
+              ishit = ishit | epilog(valid,hit);
+            }
+          }
+        }
+        return ishit;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
new file mode 100644
index 0000000000..3d8900c2aa
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
@@ -0,0 +1,417 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+#include "curve_intersector_sweep.h"
+#include "../subdiv/linear_bezier_patch.h"
+
+#define DBG(x)
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Ray, typename Epilog>
+      struct TensorLinearCubicBezierSurfaceIntersector
+      {
+        const LinearSpace3fa& ray_space;
+        Ray& ray;
+        TensorLinearCubicBezierSurface3fa curve3d;
+        TensorLinearCubicBezierSurface2fa curve2d;
+        float eps;
+        const Epilog& epilog;
+        bool isHit;
+
+        __forceinline TensorLinearCubicBezierSurfaceIntersector (const LinearSpace3fa& ray_space, Ray& ray, const TensorLinearCubicBezierSurface3fa& curve3d, const Epilog& epilog)
+          : ray_space(ray_space), ray(ray), curve3d(curve3d), epilog(epilog), isHit(false)
+        {
+          const TensorLinearCubicBezierSurface3fa curve3dray = curve3d.xfm(ray_space,ray.org);
+          curve2d = TensorLinearCubicBezierSurface2fa(CubicBezierCurve2fa(curve3dray.L),CubicBezierCurve2fa(curve3dray.R));
+          const BBox2fa b2 = curve2d.bounds();
+          eps = 8.0f*float(ulp)*reduce_max(max(abs(b2.lower),abs(b2.upper)));
+        }
+        
+        __forceinline Interval1f solve_linear(const float u0, const float u1, const float& p0, const float& p1)
+        {
+          if (p1 == p0) {
+            if (p0 == 0.0f) return Interval1f(u0,u1);
+            else return Interval1f(empty);
+          }
+          const float t = -p0/(p1-p0);
+          const float tt = lerp(u0,u1,t);
+          return Interval1f(tt);
+        }
+
+        __forceinline void solve_linear(const float u0, const float u1, const Interval1f& p0, const Interval1f& p1, Interval1f& u)
+        {
+          if (sign(p0.lower) != sign(p0.upper)) u.extend(u0);
+          if (sign(p0.lower) != sign(p1.lower)) u.extend(solve_linear(u0,u1,p0.lower,p1.lower));
+          if (sign(p0.upper) != sign(p1.upper)) u.extend(solve_linear(u0,u1,p0.upper,p1.upper));
+          if (sign(p1.lower) != sign(p1.upper)) u.extend(u1);
+        }
+
+        __forceinline Interval1f bezier_clipping(const CubicBezierCurve<Interval1f>& curve)
+        {
+          Interval1f u = empty;
+          solve_linear(0.0f/3.0f,1.0f/3.0f,curve.v0,curve.v1,u);
+          solve_linear(0.0f/3.0f,2.0f/3.0f,curve.v0,curve.v2,u);
+          solve_linear(0.0f/3.0f,3.0f/3.0f,curve.v0,curve.v3,u);
+          solve_linear(1.0f/3.0f,2.0f/3.0f,curve.v1,curve.v2,u);
+          solve_linear(1.0f/3.0f,3.0f/3.0f,curve.v1,curve.v3,u);
+          solve_linear(2.0f/3.0f,3.0f/3.0f,curve.v2,curve.v3,u);
+          return intersect(u,Interval1f(0.0f,1.0f));
+        }
+        
+        __forceinline Interval1f bezier_clipping(const LinearBezierCurve<Interval1f>& curve)
+        {
+          Interval1f v = empty;
+          solve_linear(0.0f,1.0f,curve.v0,curve.v1,v);
+          return intersect(v,Interval1f(0.0f,1.0f));
+        }
+
+        __forceinline void solve_bezier_clipping(BBox1f cu, BBox1f cv, const TensorLinearCubicBezierSurface2fa& curve2)
+        {
+          BBox2fa bounds = curve2.bounds();
+          if (bounds.upper.x < 0.0f) return;
+          if (bounds.upper.y < 0.0f) return;
+          if (bounds.lower.x > 0.0f) return;
+          if (bounds.lower.y > 0.0f) return;
+          
+          if (max(cu.size(),cv.size()) < 1E-4f)
+          {
+            const float u = cu.center();
+            const float v = cv.center();
+            TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org);
+            const float t = curve_z.eval(u,v);
+            if (ray.tnear() <= t && t <= ray.tfar) {
+              const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v));
+              BezierCurveHit hit(t,u,v,Ng);
+              isHit |= epilog(hit);
+            }
+            return;
+          }
+          
+          const Vec2fa dv = curve2.axis_v();
+          const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv);
+          LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+          if (!curve0v.hasRoot()) return;
+          
+          const Interval1f v = bezier_clipping(curve0v);
+          if (isEmpty(v)) return;
+          TensorLinearCubicBezierSurface2fa curve2a = curve2.clip_v(v);
+          cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper));
+
+          const Vec2fa du = curve2.axis_u();
+          const TensorLinearCubicBezierSurface1f curve1u = curve2a.xfm(du);
+          CubicBezierCurve<Interval1f> curve0u = curve1u.reduce_v();         
+          int roots = curve0u.maxRoots();
+          if (roots == 0) return;
+          
+          if (roots == 1)
+          {
+            const Interval1f u = bezier_clipping(curve0u);
+            if (isEmpty(u)) return;
+            TensorLinearCubicBezierSurface2fa curve2b = curve2a.clip_u(u);
+            cu = BBox1f(lerp(cu.lower,cu.upper,u.lower),lerp(cu.lower,cu.upper,u.upper));
+            solve_bezier_clipping(cu,cv,curve2b);
+            return;
+          }
+
+          TensorLinearCubicBezierSurface2fa curve2l, curve2r;
+          curve2a.split_u(curve2l,curve2r);
+          solve_bezier_clipping(BBox1f(cu.lower,cu.center()),cv,curve2l);
+          solve_bezier_clipping(BBox1f(cu.center(),cu.upper),cv,curve2r);
+        }
+        
+        __forceinline bool solve_bezier_clipping()
+        {
+          solve_bezier_clipping(BBox1f(0.0f,1.0f),BBox1f(0.0f,1.0f),curve2d);
+          return isHit;
+        }
+
+        __forceinline void solve_newton_raphson(BBox1f cu, BBox1f cv)
+        {
+          Vec2fa uv(cu.center(),cv.center());
+          const Vec2fa dfdu = curve2d.eval_du(uv.x,uv.y);
+          const Vec2fa dfdv = curve2d.eval_dv(uv.x,uv.y);
+          const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv));
+          solve_newton_raphson_loop(cu,cv,uv,dfdu,dfdv,rcp_J);
+        }
+
+        __forceinline void solve_newton_raphson_loop(BBox1f cu, BBox1f cv, const Vec2fa& uv_in, const Vec2fa& dfdu, const Vec2fa& dfdv, const LinearSpace2fa& rcp_J)
+        {
+          Vec2fa uv = uv_in;
+          
+          for (size_t i=0; i<200; i++)
+          {
+            const Vec2fa f = curve2d.eval(uv.x,uv.y);
+            const Vec2fa duv = rcp_J*f;
+            uv -= duv;
+
+            if (max(abs(f.x),abs(f.y)) < eps)
+            {
+              const float u = uv.x;
+              const float v = uv.y;
+              if (!(u >= 0.0f && u <= 1.0f)) return; // rejects NaNs
+              if (!(v >= 0.0f && v <= 1.0f)) return; // rejects NaNs
+              const TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org);
+              const float t = curve_z.eval(u,v);
+              if (!(ray.tnear() <= t && t <= ray.tfar)) return; // rejects NaNs
+              const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v));
+              BezierCurveHit hit(t,u,v,Ng);
+              isHit |= epilog(hit);
+              return;
+            }
+          }       
+        }
+
+        __forceinline bool clip_v(BBox1f& cu, BBox1f& cv)
+        {
+          const Vec2fa dv = curve2d.eval_dv(cu.lower,cv.lower);
+          const TensorLinearCubicBezierSurface1f curve1v = curve2d.xfm(dv).clip(cu,cv);
+          LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+          if (!curve0v.hasRoot()) return false;
+          Interval1f v = bezier_clipping(curve0v);
+          if (isEmpty(v)) return false;
+          v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f));
+          cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper));
+          return true;
+        }
+
+        __forceinline bool solve_krawczyk(bool very_small, BBox1f& cu, BBox1f& cv)
+        {
+          /* perform bezier clipping in v-direction to get tight v-bounds */
+          TensorLinearCubicBezierSurface2fa curve2 = curve2d.clip(cu,cv);
+          const Vec2fa dv = curve2.axis_v();
+          const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv);
+          LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+          if (unlikely(!curve0v.hasRoot())) return true;
+          Interval1f v = bezier_clipping(curve0v);
+          if (unlikely(isEmpty(v))) return true;
+          v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f));
+          curve2 = curve2.clip_v(v);
+          cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper));
+
+          /* perform one newton raphson iteration */
+          Vec2fa c(cu.center(),cv.center());
+          Vec2fa f,dfdu,dfdv; curve2d.eval(c.x,c.y,f,dfdu,dfdv);
+          const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv));
+          const Vec2fa c1 = c - rcp_J*f;
+          
+          /* calculate bounds of derivatives */
+          const BBox2fa bounds_du = (1.0f/cu.size())*curve2.derivative_u().bounds();
+          const BBox2fa bounds_dv = (1.0f/cv.size())*curve2.derivative_v().bounds();
+
+          /* calculate krawczyk test */
+          LinearSpace2<Vec2<Interval1f>> I(Interval1f(1.0f), Interval1f(0.0f),
+                                           Interval1f(0.0f), Interval1f(1.0f));
+
+          LinearSpace2<Vec2<Interval1f>> G(Interval1f(bounds_du.lower.x,bounds_du.upper.x), Interval1f(bounds_dv.lower.x,bounds_dv.upper.x),
+                                           Interval1f(bounds_du.lower.y,bounds_du.upper.y), Interval1f(bounds_dv.lower.y,bounds_dv.upper.y));
+
+          const LinearSpace2<Vec2f> rcp_J2(rcp_J);
+          const LinearSpace2<Vec2<Interval1f>> rcp_Ji(rcp_J2);
+          
+          const Vec2<Interval1f> x(cu,cv);
+          const Vec2<Interval1f> K = Vec2<Interval1f>(Vec2f(c1)) + (I - rcp_Ji*G)*(x-Vec2<Interval1f>(Vec2f(c)));
+
+          /* test if there is no solution */
+          const Vec2<Interval1f> KK = intersect(K,x);
+          if (unlikely(isEmpty(KK.x) || isEmpty(KK.y))) return true;
+
+          /* exit if convergence cannot get proven, but terminate if we are very small */
+          if (unlikely(!subset(K,x) && !very_small)) return false;
+
+          /* solve using newton raphson iteration of convergence is guarenteed */
+          solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J);
+          return true;
+        }
+
+        __forceinline void solve_newton_raphson_no_recursion(BBox1f cu, BBox1f cv)
+        {
+           if (!clip_v(cu,cv)) return;
+           return solve_newton_raphson(cu,cv);
+        }
+        
+        __forceinline void solve_newton_raphson_recursion(BBox1f cu, BBox1f cv)
+        {
+          unsigned int sptr = 0;
+          const unsigned int stack_size = 4;
+          unsigned int mask_stack[stack_size];
+          BBox1f cu_stack[stack_size];
+          BBox1f cv_stack[stack_size];
+          goto entry;
+          
+          /* terminate if stack is empty */
+          while (sptr)
+          {
+            /* pop from stack */
+            {
+              sptr--;
+              size_t mask = mask_stack[sptr];
+              cu = cu_stack[sptr];
+              cv = cv_stack[sptr];
+              const size_t i = bscf(mask);
+              mask_stack[sptr] = mask;
+              if (mask) sptr++; // there are still items on the stack
+              
+              /* process next element recurse into each hit curve segment */
+              const float u0 = float(i+0)*(1.0f/(VSIZEX-1));
+              const float u1 = float(i+1)*(1.0f/(VSIZEX-1));
+              const BBox1f cui(lerp(cu.lower,cu.upper,u0),lerp(cu.lower,cu.upper,u1));
+              cu = cui;
+            }
+
+#if 0
+            solve_newton_raphson_no_recursion(cu,cv);
+            continue;
+            
+#else
+            /* we assume convergence for small u ranges and verify using krawczyk */
+            if (cu.size() < 1.0f/6.0f) {
+              const bool very_small = cu.size() < 0.001f || sptr >= stack_size;
+              if (solve_krawczyk(very_small,cu,cv)) {
+                continue;
+              }
+            }
+#endif
+
+          entry:
+          
+            /* split the curve into VSIZEX-1 segments in u-direction */
+            vboolx valid = true;
+            TensorLinearCubicBezierSurface<Vec2vfx> subcurves = curve2d.clip_v(cv).vsplit_u(valid,cu);
+            
+            /* slabs test in u-direction */
+            Vec2vfx ndv = cross(subcurves.axis_v());
+            BBox<vfloatx> boundsv = subcurves.vxfm(ndv).bounds();
+            valid &= boundsv.lower <= eps;
+            valid &= boundsv.upper >= -eps;
+            if (none(valid)) continue;
+
+            /* slabs test in v-direction */
+            Vec2vfx ndu = cross(subcurves.axis_u());
+            BBox<vfloatx> boundsu = subcurves.vxfm(ndu).bounds();
+            valid &= boundsu.lower <= eps;
+            valid &= boundsu.upper >= -eps;
+            if (none(valid)) continue;
+
+            /* push valid segments to stack */
+            assert(sptr < stack_size);
+            mask_stack [sptr] = movemask(valid);
+            cu_stack   [sptr] = cu;
+            cv_stack   [sptr] = cv;
+            sptr++;
+          }
+        }
+        
+        __forceinline bool solve_newton_raphson_main()
+        {
+          BBox1f vu(0.0f,1.0f);
+          BBox1f vv(0.0f,1.0f);
+          solve_newton_raphson_recursion(vu,vv);
+          return isHit;
+        }
+      };
+
+
+    template<template<typename Ty> class SourceCurve>
+      struct OrientedCurve1Intersector1
+    {
+      //template<typename Ty> using Curve = SourceCurve<Ty>;
+      typedef SourceCurve<Vec3ff> SourceCurve3ff;
+      typedef SourceCurve<Vec3fa> SourceCurve3fa;
+      
+      __forceinline OrientedCurve1Intersector1() {}
+      
+      __forceinline OrientedCurve1Intersector1(const Ray& ray, const void* ptr) {}
+      
+      template<typename Epilog>
+      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                IntersectContext* context,
+                                const CurveGeometry* geom, const unsigned int primID, 
+                                const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
+                                const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
+                                const Epilog& epilog) const
+      {
+        STAT3(normal.trav_prims,1,1,1);
+
+        SourceCurve3ff ccurve(v0i,v1i,v2i,v3i);
+        SourceCurve3fa ncurve(n0i,n1i,n2i,n3i);
+        ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve);
+        TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+        //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
+      }
+
+      template<typename Epilog>
+      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                IntersectContext* context,
+                                const CurveGeometry* geom, const unsigned int primID,
+                                const TensorLinearCubicBezierSurface3fa& curve, const Epilog& epilog) const
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
+      }
+    };
+
+    template<template<typename Ty> class SourceCurve, int K>
+      struct OrientedCurve1IntersectorK
+    {
+      //template<typename Ty> using Curve = SourceCurve<Ty>;
+      typedef SourceCurve<Vec3ff> SourceCurve3ff;
+      typedef SourceCurve<Vec3fa> SourceCurve3fa;
+      
+      struct Ray1
+      {
+        __forceinline Ray1(RayK<K>& ray, size_t k)
+          : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {}
+
+        Vec3fa org;
+        Vec3fa dir;
+        float _tnear;
+        float& tfar;
+
+        __forceinline float& tnear() { return _tnear; }
+        //__forceinline float& tfar()  { return _tfar; }
+        __forceinline const float& tnear() const { return _tnear; }
+        //__forceinline const float& tfar()  const { return _tfar; }
+      };
+
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
+                                   const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
+                                   const Epilog& epilog)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Ray1 ray(vray,k);
+        SourceCurve3ff ccurve(v0i,v1i,v2i,v3i);
+        SourceCurve3fa ncurve(n0i,n1i,n2i,n3i);
+        ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve);
+        TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+        //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main();
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const TensorLinearCubicBezierSurface3fa& curve,
+                                   const Epilog& epilog)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Ray1 ray(vray,k);
+        //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main();
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_precalculations.h b/thirdparty/embree/kernels/geometry/curve_intersector_precalculations.h
new file mode 100644
index 0000000000..de6b70be1b
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_precalculations.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/geometry.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct CurvePrecalculations1
+    {
+      float depth_scale;
+      LinearSpace3fa ray_space;
+           
+      __forceinline CurvePrecalculations1() {}
+
+      __forceinline CurvePrecalculations1(const Ray& ray, const void* ptr)
+      {
+        depth_scale = rsqrt(dot(ray.dir,ray.dir));
+        LinearSpace3fa space = frame(depth_scale*ray.dir);
+        space.vz *= depth_scale;
+        ray_space = space.transposed();
+      }
+    };
+    
+    template<int K>
+      struct CurvePrecalculationsK
+    {
+      vfloat<K> depth_scale;
+      LinearSpace3fa ray_space[K];
+
+      __forceinline CurvePrecalculationsK(const vbool<K>& valid, const RayK<K>& ray)
+      {
+        size_t mask = movemask(valid);
+        depth_scale = rsqrt(dot(ray.dir,ray.dir));
+        while (mask) {
+          size_t k = bscf(mask);
+          Vec3fa ray_dir_k = Vec3fa(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+          LinearSpace3fa ray_space_k = frame(depth_scale[k]*ray_dir_k);
+          ray_space_k.vz *= depth_scale[k];
+          ray_space[k] = ray_space_k.transposed();
+        }
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h b/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h
new file mode 100644
index 0000000000..c3272e99fd
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h
@@ -0,0 +1,216 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "quad_intersector.h"
+#include "curve_intersector_precalculations.h"
+
+#define Bezier1Intersector1 RibbonCurve1Intersector1
+#define Bezier1IntersectorK RibbonCurve1IntersectorK
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename NativeCurve3ff, int M>
+    struct RibbonHit
+    {
+      __forceinline RibbonHit() {}
+
+      __forceinline RibbonHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N,
+                              const NativeCurve3ff& curve3D)
+        : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {}
+      
+      __forceinline void finalize() 
+      {
+        vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N));
+        vv = V;
+        vt = T;
+      }
+      
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { return curve3D.eval_du(vu[i]); }
+
+      __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); }
+      __forceinline vfloat<M> t () const { return vt; }
+      __forceinline Vec3vf<M> Ng() const { return (Vec3vf<M>) curve3D.template veval_du<M>(vu); }
+      
+    public:
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> T;
+      int i, N;
+      NativeCurve3ff curve3D;
+      
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+    };
+
+    /* calculate squared distance of point p0 to line p1->p2 */
+    __forceinline std::pair<vfloatx,vfloatx> sqr_point_line_distance(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2)
+    {
+      const vfloatx num = det(p2-p1,p1-p0);
+      const vfloatx den2 = dot(p2-p1,p2-p1);
+      return std::make_pair(num*num,den2);
+    }
+    
+    /* performs culling against a cylinder */
+    __forceinline vboolx cylinder_culling_test(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2, const vfloatx& r)
+    {
+      const std::pair<vfloatx,vfloatx> d = sqr_point_line_distance(p0,p1,p2);
+      return d.first <= r*r*d.second;
+    }
+
+    template<typename NativeCurve3ff, typename Epilog>
+    __forceinline bool intersect_ribbon(const Vec3fa& ray_org, const Vec3fa& ray_dir, const float ray_tnear, const float& ray_tfar,
+                                        const LinearSpace3fa& ray_space, const float& depth_scale,
+                                        const NativeCurve3ff& curve3D, const int N,
+                                        const Epilog& epilog)
+    {
+      /* transform control points into ray space */
+      const NativeCurve3ff curve2D = curve3D.xfm_pr(ray_space,ray_org);
+      float eps = 4.0f*float(ulp)*reduce_max(max(abs(curve2D.v0),abs(curve2D.v1),abs(curve2D.v2),abs(curve2D.v3)));
+      
+      /* evaluate the bezier curve */
+      bool ishit = false;
+      vboolx valid = vfloatx(step) < vfloatx(float(N));
+      const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N);
+      const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N);
+      valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w));
+      
+      if (any(valid)) 
+      {
+        Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(0,N);
+        Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(0,N);
+        dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt);
+        dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
+        const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f);
+        const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
+        const Vec3vfx nn0 = normalize(n0);
+        const Vec3vfx nn1 = normalize(n1);
+        const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0));
+        const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
+        const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
+        const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+        
+        vfloatx vu,vv,vt;
+        vboolx valid0 = intersect_quad_backface_culling<VSIZEX>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
+
+        if (any(valid0))
+        {
+          /* ignore self intersections */
+          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
+            vfloatx r = lerp(p0.w, p1.w, vu);
+            valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
+          }
+          
+          if (any(valid0))
+          {
+            vv = madd(2.0f,vv,vfloatx(-1.0f));
+            RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,0,N,curve3D);
+            ishit |= epilog(bhit.valid,bhit);
+          }
+        }
+      }
+      
+      if (unlikely(VSIZEX < N)) 
+      {
+        /* process SIMD-size many segments per iteration */
+        for (int i=VSIZEX; i<N; i+=VSIZEX)
+        {
+          /* evaluate the bezier curve */
+          vboolx valid = vintx(i)+vintx(step) < vintx(N);
+          const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
+          const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
+          valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w));
+          if (none(valid)) continue;
+          
+          Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(i,N);
+          Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(i,N);
+          dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt);
+          dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
+          const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f);
+          const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
+          const Vec3vfx nn0 = normalize(n0);
+          const Vec3vfx nn1 = normalize(n1);
+          const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0));
+          const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
+          const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
+          const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+          
+          vfloatx vu,vv,vt;
+          vboolx valid0 = intersect_quad_backface_culling<VSIZEX>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
+
+          if (any(valid0))
+          {
+            /* ignore self intersections */
+            if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
+              vfloatx r = lerp(p0.w, p1.w, vu);
+              valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
+            }
+            
+            if (any(valid0))
+            {
+              vv = madd(2.0f,vv,vfloatx(-1.0f));
+              RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,i,N,curve3D);
+              ishit |= epilog(bhit.valid,bhit);
+            }
+          }
+        }
+      }
+      return ishit;
+    }
+        
+    template<template<typename Ty> class NativeCurve>
+    struct RibbonCurve1Intersector1
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+      
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                   const Epilog& epilog)
+      {
+        const int N = geom->tessellationRate;
+        NativeCurve3ff curve(v0,v1,v2,v3);
+        curve = enlargeRadiusToMinWidth(context,geom,ray.org,curve);
+        return intersect_ribbon<NativeCurve3ff>(ray.org,ray.dir,ray.tnear(),ray.tfar,
+                                                pre.ray_space,pre.depth_scale,
+                                                curve,N,
+                                                epilog);
+      }
+    };
+    
+    template<template<typename Ty> class NativeCurve, int K>
+    struct RibbonCurve1IntersectorK
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+      
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& ray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                   const Epilog& epilog)
+      {
+        const int N = geom->tessellationRate;
+        const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+        const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+        NativeCurve3ff curve(v0,v1,v2,v3);
+        curve = enlargeRadiusToMinWidth(context,geom,ray_org,curve);
+        return intersect_ribbon<NativeCurve3ff>(ray_org,ray_dir,ray.tnear()[k],ray.tfar[k],
+                                                pre.ray_space[k],pre.depth_scale[k],
+                                                curve,N,
+                                                epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
new file mode 100644
index 0000000000..2d4abd73ac
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
@@ -0,0 +1,364 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "cylinder.h"
+#include "plane.h"
+#include "line_intersector.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    static const size_t numJacobianIterations = 5;
+#if defined(__AVX__)
+    static const size_t numBezierSubdivisions = 2;
+#else
+    static const size_t numBezierSubdivisions = 3;
+#endif
+
+    struct BezierCurveHit
+    {
+      __forceinline BezierCurveHit() {}
+
+      __forceinline BezierCurveHit(const float t, const float u, const Vec3fa& Ng)
+        : t(t), u(u), v(0.0f), Ng(Ng) {}
+
+      __forceinline BezierCurveHit(const float t, const float u, const float v, const Vec3fa& Ng)
+        : t(t), u(u), v(v), Ng(Ng) {}
+      
+      __forceinline void finalize() {}
+      
+    public:
+      float t;
+      float u;
+      float v; 
+      Vec3fa Ng;
+    };
+    
+    template<typename NativeCurve3ff, typename Ray, typename Epilog>
+    __forceinline bool intersect_bezier_iterative_debug(const Ray& ray, const float dt, const NativeCurve3ff& curve, size_t i,
+                                                        const vfloatx& u, const BBox<vfloatx>& tp, const BBox<vfloatx>& h0, const BBox<vfloatx>& h1, 
+                                                        const Vec3vfx& Ng, const Vec4vfx& dP0du, const Vec4vfx& dP3du,
+                                                        const Epilog& epilog)
+    {
+      if (tp.lower[i]+dt > ray.tfar) return false;
+      Vec3fa Ng_o = Vec3fa(Ng.x[i],Ng.y[i],Ng.z[i]);
+      if (h0.lower[i] == tp.lower[i]) Ng_o = -Vec3fa(dP0du.x[i],dP0du.y[i],dP0du.z[i]);
+      if (h1.lower[i] == tp.lower[i]) Ng_o = +Vec3fa(dP3du.x[i],dP3du.y[i],dP3du.z[i]);
+      BezierCurveHit hit(tp.lower[i]+dt,u[i],Ng_o);
+      return epilog(hit);
+    }
+
+    template<typename NativeCurve3ff, typename Ray, typename Epilog> 
+     __forceinline bool intersect_bezier_iterative_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, float u, float t, const Epilog& epilog)
+    {
+      const Vec3fa org = zero;
+      const Vec3fa dir = ray.dir;
+      const float length_ray_dir = length(dir);
+
+      /* error of curve evaluations is propertional to largest coordinate */
+      const BBox3ff box = curve.bounds();
+      const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper)));
+     
+      for (size_t i=0; i<numJacobianIterations; i++) 
+      {
+        const Vec3fa Q = madd(Vec3fa(t),dir,org);
+        //const Vec3fa dQdu = zero;
+        const Vec3fa dQdt = dir;
+        const float Q_err = 16.0f*float(ulp)*length_ray_dir*t; // works as org=zero here
+           
+        Vec3ff P,dPdu,ddPdu; curve.eval(u,P,dPdu,ddPdu);
+        //const Vec3fa dPdt = zero;
+
+        const Vec3fa R = Q-P;
+        const float len_R = length(R); //reduce_max(abs(R));
+        const float R_err = max(Q_err,P_err);
+        const Vec3fa dRdu = /*dQdu*/-dPdu;
+        const Vec3fa dRdt = dQdt;//-dPdt;
+
+        const Vec3fa T = normalize(dPdu);
+        const Vec3fa dTdu = dnormalize(dPdu,ddPdu);
+        //const Vec3fa dTdt = zero;
+        const float cos_err = P_err/length(dPdu);
+
+        /* Error estimate for dot(R,T):
+
+           dot(R,T) = cos(R,T) |R| |T|
+                    = (cos(R,T) +- cos_error) * (|R| +- |R|_err) * (|T| +- |T|_err)
+                    = cos(R,T)*|R|*|T| 
+                      +- cos(R,T)*(|R|*|T|_err + |T|*|R|_err)
+                      +- cos_error*(|R| + |T|)
+                      +- lower order terms
+           with cos(R,T) being in [0,1] and |T| = 1 we get:
+             dot(R,T)_err = |R|*|T|_err + |R|_err = cos_error*(|R|+1)
+        */
+              
+        const float f = dot(R,T);
+        const float f_err = len_R*P_err + R_err + cos_err*(1.0f+len_R);
+        const float dfdu = dot(dRdu,T) + dot(R,dTdu);
+        const float dfdt = dot(dRdt,T);// + dot(R,dTdt);
+
+        const float K = dot(R,R)-sqr(f);
+        const float dKdu = /*2.0f*/(dot(R,dRdu)-f*dfdu);
+        const float dKdt = /*2.0f*/(dot(R,dRdt)-f*dfdt);
+        const float rsqrt_K = rsqrt(K);
+
+        const float g = sqrt(K)-P.w;
+        const float g_err = R_err + f_err + 16.0f*float(ulp)*box.upper.w;
+        const float dgdu = /*0.5f*/dKdu*rsqrt_K-dPdu.w;
+        const float dgdt = /*0.5f*/dKdt*rsqrt_K;//-dPdt.w;
+
+        const LinearSpace2f J = LinearSpace2f(dfdu,dfdt,dgdu,dgdt);
+        const Vec2f dut = rcp(J)*Vec2f(f,g);
+        const Vec2f ut = Vec2f(u,t) - dut;
+        u = ut.x; t = ut.y;
+
+        if (abs(f) < f_err && abs(g) < g_err)
+        {
+          t+=dt;
+          if (!(ray.tnear() <= t && t <= ray.tfar)) return false; // rejects NaNs
+          if (!(u >= 0.0f && u <= 1.0f)) return false; // rejects NaNs
+          const Vec3fa R = normalize(Q-P);
+          const Vec3fa U = madd(Vec3fa(dPdu.w),R,dPdu);
+          const Vec3fa V = cross(dPdu,R);
+          BezierCurveHit hit(t,u,cross(V,U));
+          return epilog(hit);
+        }
+      }
+      return false;
+    }
+
+    template<typename NativeCurve3ff, typename Ray, typename Epilog>
+    bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve,
+                                             float u0, float u1, unsigned int depth, const Epilog& epilog)
+    {
+#if defined(__AVX__)
+      enum { VSIZEX_ = 8 };
+      typedef vbool8 vboolx; // maximally 8-wide to work around KNL issues
+      typedef vint8 vintx; 
+      typedef vfloat8 vfloatx;
+#else
+      enum { VSIZEX_ = 4 };
+      typedef vbool4 vboolx;
+      typedef vint4 vintx; 
+      typedef vfloat4 vfloatx;
+#endif
+      typedef Vec3<vfloatx> Vec3vfx;
+      typedef Vec4<vfloatx> Vec4vfx;
+    
+      unsigned int maxDepth = numBezierSubdivisions;
+      bool found = false;
+      const Vec3fa org = zero;
+      const Vec3fa dir = ray.dir;
+
+      unsigned int sptr = 0;
+      const unsigned int stack_size = numBezierSubdivisions+1; // +1 because of unstable workaround below
+      struct StackEntry {
+        vboolx valid;
+        vfloatx tlower;
+        float u0;
+        float u1;
+        unsigned int depth;
+      };
+      StackEntry stack[stack_size];
+      goto entry;
+
+       /* terminate if stack is empty */
+      while (sptr)
+      {
+        /* pop from stack */
+        {
+          sptr--;
+          vboolx valid = stack[sptr].valid;
+          const vfloatx tlower = stack[sptr].tlower;
+          valid &= tlower+dt <= ray.tfar;
+          if (none(valid)) continue;
+          u0 = stack[sptr].u0;
+          u1 = stack[sptr].u1;
+          depth = stack[sptr].depth;
+          const size_t i = select_min(valid,tlower); clear(valid,i);
+          stack[sptr].valid = valid;
+          if (any(valid)) sptr++; // there are still items on the stack
+
+          /* process next segment */
+          const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1)));
+          u0 = vu0[i+0];
+          u1 = vu0[i+1];
+        }
+      entry:
+
+        /* subdivide curve */
+        const float dscale = (u1-u0)*(1.0f/(3.0f*(vfloatx::size-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1)));
+        Vec4vfx P0, dP0du; curve.template veval<VSIZEX_>(vu0,P0,dP0du); dP0du = dP0du * Vec4vfx(dscale);
+        const Vec4vfx P3 = shift_right_1(P0);
+        const Vec4vfx dP3du = shift_right_1(dP0du); 
+        const Vec4vfx P1 = P0 + dP0du; 
+        const Vec4vfx P2 = P3 - dP3du;
+        
+        /* calculate bounding cylinders */
+        const vfloatx rr1 = sqr_point_to_line_distance(Vec3vfx(dP0du),Vec3vfx(P3-P0));
+        const vfloatx rr2 = sqr_point_to_line_distance(Vec3vfx(dP3du),Vec3vfx(P3-P0));
+        const vfloatx maxr12 = sqrt(max(rr1,rr2));
+        const vfloatx one_plus_ulp  = 1.0f+2.0f*float(ulp);
+        const vfloatx one_minus_ulp = 1.0f-2.0f*float(ulp);
+        vfloatx r_outer = max(P0.w,P1.w,P2.w,P3.w)+maxr12;
+        vfloatx r_inner = min(P0.w,P1.w,P2.w,P3.w)-maxr12;
+        r_outer = one_plus_ulp*r_outer;
+        r_inner = max(0.0f,one_minus_ulp*r_inner);
+        const CylinderN<vfloatx::size> cylinder_outer(Vec3vfx(P0),Vec3vfx(P3),r_outer);
+        const CylinderN<vfloatx::size> cylinder_inner(Vec3vfx(P0),Vec3vfx(P3),r_inner);
+        vboolx valid = true; clear(valid,vfloatx::size-1);
+        
+        /* intersect with outer cylinder */
+        BBox<vfloatx> tc_outer; vfloatx u_outer0; Vec3vfx Ng_outer0; vfloatx u_outer1; Vec3vfx Ng_outer1;
+        valid &= cylinder_outer.intersect(org,dir,tc_outer,u_outer0,Ng_outer0,u_outer1,Ng_outer1);
+        if (none(valid)) continue;
+        
+        /* intersect with cap-planes */
+        BBox<vfloatx> tp(ray.tnear()-dt,ray.tfar-dt);
+        tp = embree::intersect(tp,tc_outer);
+        BBox<vfloatx> h0 = HalfPlaneN<vfloatx::size>(Vec3vfx(P0),+Vec3vfx(dP0du)).intersect(org,dir);
+        tp = embree::intersect(tp,h0);
+        BBox<vfloatx> h1 = HalfPlaneN<vfloatx::size>(Vec3vfx(P3),-Vec3vfx(dP3du)).intersect(org,dir);
+        tp = embree::intersect(tp,h1);
+        valid &= tp.lower <= tp.upper;
+        if (none(valid)) continue;
+        
+        /* clamp and correct u parameter */
+        u_outer0 = clamp(u_outer0,vfloatx(0.0f),vfloatx(1.0f));
+        u_outer1 = clamp(u_outer1,vfloatx(0.0f),vfloatx(1.0f));
+        u_outer0 = lerp(u0,u1,(vfloatx(step)+u_outer0)*(1.0f/float(vfloatx::size)));
+        u_outer1 = lerp(u0,u1,(vfloatx(step)+u_outer1)*(1.0f/float(vfloatx::size)));
+        
+        /* intersect with inner cylinder */
+        BBox<vfloatx> tc_inner;
+        vfloatx u_inner0 = zero; Vec3vfx Ng_inner0 = zero; vfloatx u_inner1 = zero; Vec3vfx Ng_inner1 = zero;
+        const vboolx valid_inner = cylinder_inner.intersect(org,dir,tc_inner,u_inner0,Ng_inner0,u_inner1,Ng_inner1);
+        
+        /* at the unstable area we subdivide deeper */
+        const vboolx unstable0 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner0)) < 0.3f);
+        const vboolx unstable1 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner1)) < 0.3f);
+      
+        /* subtract the inner interval from the current hit interval */
+        BBox<vfloatx> tp0, tp1;
+        subtract(tp,tc_inner,tp0,tp1);
+        vboolx valid0 = valid & (tp0.lower <= tp0.upper);
+        vboolx valid1 = valid & (tp1.lower <= tp1.upper);
+        if (none(valid0 | valid1)) continue;
+        
+        /* iterate over all first hits front to back */
+        const vintx termDepth0 = select(unstable0,vintx(maxDepth+1),vintx(maxDepth));
+        vboolx recursion_valid0 = valid0 & (depth < termDepth0);
+        valid0 &= depth >= termDepth0;
+        
+        while (any(valid0))
+        {
+          const size_t i = select_min(valid0,tp0.lower); clear(valid0,i);
+          found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer0[i],tp0.lower[i],epilog);
+          //found = found | intersect_bezier_iterative_debug   (ray,dt,curve,i,u_outer0,tp0,h0,h1,Ng_outer0,dP0du,dP3du,epilog);
+          valid0 &= tp0.lower+dt <= ray.tfar;
+        }
+        valid1 &= tp1.lower+dt <= ray.tfar;
+        
+        /* iterate over all second hits front to back */
+        const vintx termDepth1 = select(unstable1,vintx(maxDepth+1),vintx(maxDepth));
+        vboolx recursion_valid1 = valid1 & (depth < termDepth1);
+        valid1 &= depth >= termDepth1;
+        while (any(valid1))
+        {
+          const size_t i = select_min(valid1,tp1.lower); clear(valid1,i);
+          found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer1[i],tp1.upper[i],epilog);
+          //found = found | intersect_bezier_iterative_debug   (ray,dt,curve,i,u_outer1,tp1,h0,h1,Ng_outer1,dP0du,dP3du,epilog);
+          valid1 &= tp1.lower+dt <= ray.tfar;
+        }
+
+        /* push valid segments to stack */
+        recursion_valid0 &= tp0.lower+dt <= ray.tfar;
+        recursion_valid1 &= tp1.lower+dt <= ray.tfar;
+        const vboolx recursion_valid = recursion_valid0 | recursion_valid1;
+        if (any(recursion_valid))
+        {
+          assert(sptr < stack_size);
+          stack[sptr].valid = recursion_valid;
+          stack[sptr].tlower = select(recursion_valid0,tp0.lower,tp1.lower);
+          stack[sptr].u0 = u0;
+          stack[sptr].u1 = u1;
+          stack[sptr].depth = depth+1;
+          sptr++;
+        }
+      }
+      return found;
+    }
+
+    template<template<typename Ty> class NativeCurve>
+    struct SweepCurve1Intersector1
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+      
+      template<typename Epilog>
+      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                IntersectContext* context,
+                                const CurveGeometry* geom, const unsigned int primID,
+                                const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                const Epilog& epilog)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+
+        /* move ray closer to make intersection stable */
+        NativeCurve3ff curve0(v0,v1,v2,v3);
+        curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0);
+        const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir));
+        const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f);
+        const NativeCurve3ff curve1 = curve0-ref;
+        return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog);
+      }
+    };
+
+    template<template<typename Ty> class NativeCurve, int K>
+    struct SweepCurve1IntersectorK
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+      
+      struct Ray1
+      {
+        __forceinline Ray1(RayK<K>& ray, size_t k)
+          : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {}
+
+        Vec3fa org;
+        Vec3fa dir;
+        float _tnear;
+        float& tfar;
+
+        __forceinline float& tnear() { return _tnear; }
+        //__forceinline float& tfar()  { return _tfar; }
+        __forceinline const float& tnear() const { return _tnear; }
+        //__forceinline const float& tfar()  const { return _tfar; }
+        
+      };
+
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                   const Epilog& epilog)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Ray1 ray(vray,k);
+
+        /* move ray closer to make intersection stable */
+        NativeCurve3ff curve0(v0,v1,v2,v3);
+        curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0);
+        const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir));
+        const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f);
+        const NativeCurve3ff curve1 = curve0-ref;
+        return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h
new file mode 100644
index 0000000000..cffa8e46ad
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h
@@ -0,0 +1,671 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../subdiv/bezier_curve.h"
+#include "../common/primref.h"
+#include "curve_intersector_precalculations.h"
+#include "../bvh/node_intersector1.h"
+#include "../bvh/node_intersector_packet.h"
+
+#include "intersector_epilog.h"
+
+#include "../subdiv/bezier_curve.h"
+#include "../subdiv/bspline_curve.h"
+#include "../subdiv/hermite_curve.h"
+#include "../subdiv/catmullrom_curve.h"
+
+#include "spherei_intersector.h"
+#include "disci_intersector.h"
+
+#include "linei_intersector.h"
+#include "roundlinei_intersector.h"
+#include "conelinei_intersector.h"
+
+#include "curveNi_intersector.h"
+#include "curveNv_intersector.h"
+#include "curveNi_mb_intersector.h"
+
+#include "curve_intersector_distance.h"
+#include "curve_intersector_ribbon.h"
+#include "curve_intersector_oriented.h"
+#include "curve_intersector_sweep.h"
+
+namespace embree
+{
+  struct VirtualCurveIntersector
+  {
+    typedef void (*Intersect1Ty)(void* pre, void* ray, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded1Ty )(void* pre, void* ray, IntersectContext* context, const void* primitive);
+    
+    typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    
+    typedef void (*Intersect8Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded8Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    
+    typedef void (*Intersect16Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded16Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+
+  public:
+    struct Intersectors
+    {
+      Intersectors() {} // WARNING: Do not zero initialize this, as we otherwise get problems with thread unsafe local static variable initialization (e.g. on VS2013) in curve_intersector_virtual.cpp.
+      
+      template<int K> void intersect(void* pre, void* ray, IntersectContext* context, const void* primitive);
+      template<int K> bool occluded (void* pre, void* ray, IntersectContext* context, const void* primitive);
+
+      template<int K> void intersect(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+      template<int K> bool occluded (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+
+    public:
+      Intersect1Ty intersect1;
+      Occluded1Ty  occluded1;
+      Intersect4Ty intersect4;
+      Occluded4Ty  occluded4;
+      Intersect8Ty intersect8;
+      Occluded8Ty  occluded8;
+      Intersect16Ty intersect16;
+      Occluded16Ty  occluded16;
+    };
+    
+    Intersectors vtbl[Geometry::GTY_END];
+  };
+
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(intersect1); intersect1(pre,ray,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1>  (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(occluded1); return occluded1(pre,ray,context,primitive); }
+      
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect4); intersect4(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded4); return occluded4(pre,ray,k,context,primitive); }
+      
+#if defined(__AVX__)
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect8); intersect8(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded8); return occluded8(pre,ray,k,context,primitive); }
+#endif
+  
+#if defined(__AVX512F__)
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect16); intersect16(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded16); return occluded16(pre,ray,k,context,primitive); }
+#endif
+  
+  namespace isa
+  {
+    struct VirtualCurveIntersector1
+    {
+      typedef unsigned char Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      
+      template<int N, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        RTCGeometryType ty = (RTCGeometryType)(*prim);
+        assert(This->leafIntersector);
+        VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+        leafIntersector.intersect<1>(&pre,&ray,context,prim);
+      }
+
+      template<int N, bool robust>      
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        RTCGeometryType ty = (RTCGeometryType)(*prim);
+        assert(This->leafIntersector);
+        VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+        return leafIntersector.occluded<1>(&pre,&ray,context,prim);
+      }
+    };
+
+    template<int K>
+      struct VirtualCurveIntersectorK 
+      {
+        typedef unsigned char Primitive;
+        typedef CurvePrecalculationsK<K> Precalculations;
+        
+        template<bool robust>        
+        static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+          size_t mask = movemask(valid_i);
+          while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim);
+        }
+        
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+          vbool<K> valid_o = false;
+          size_t mask = movemask(valid_i);
+          while (mask) {
+            size_t k = bscf(mask);
+            if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim))
+              set(valid_o, k);
+          }
+          return valid_o;
+        }
+        
+        template<int N, bool robust>              
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+          leafIntersector.intersect<K>(&pre,&ray,k,context,prim);
+        }
+        
+        template<int N, bool robust>      
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          RTCGeometryType ty = (RTCGeometryType)(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+          return leafIntersector.occluded<K>(&pre,&ray,k,context,prim);
+        }
+      };
+
+    template<int N>
+    static VirtualCurveIntersector::Intersectors LinearRoundConeNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &RoundLinearCurveMiIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &RoundLinearCurveMiIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+
+    template<int N>
+    static VirtualCurveIntersector::Intersectors LinearConeNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &ConeCurveMiIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &ConeCurveMiIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+
+    template<int N>
+    static VirtualCurveIntersector::Intersectors LinearRoundConeNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiMBIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &RoundLinearCurveMiMBIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiMBIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &RoundLinearCurveMiMBIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiMBIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiMBIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiMBIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiMBIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+
+    template<int N>
+    static VirtualCurveIntersector::Intersectors LinearConeNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiMBIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &ConeCurveMiMBIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiMBIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &ConeCurveMiMBIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiMBIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiMBIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiMBIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiMBIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+
+
+    template<int N>
+      static VirtualCurveIntersector::Intersectors LinearRibbonNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &FlatLinearCurveMiIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &FlatLinearCurveMiIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+    
+    template<int N>
+      static VirtualCurveIntersector::Intersectors LinearRibbonNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiMBIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &FlatLinearCurveMiMBIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiMBIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &FlatLinearCurveMiMBIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiMBIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiMBIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiMBIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiMBIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+    
+    template<int N>
+      static VirtualCurveIntersector::Intersectors SphereNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &SphereMiIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &SphereMiIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &SphereMiIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &SphereMiIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+    
+    template<int N>
+      static VirtualCurveIntersector::Intersectors SphereNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiMBIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &SphereMiMBIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiMBIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &SphereMiMBIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiMBIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &SphereMiMBIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiMBIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &SphereMiMBIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+    
+    template<int N>
+      static VirtualCurveIntersector::Intersectors DiscNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &DiscMiIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &DiscMiIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &DiscMiIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &DiscMiIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+    
+    template<int N>
+      static VirtualCurveIntersector::Intersectors DiscNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiMBIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &DiscMiMBIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiMBIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &DiscMiMBIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiMBIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &DiscMiMBIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiMBIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &DiscMiMBIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+    
+    template<int N>
+      static VirtualCurveIntersector::Intersectors OrientedDiscNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &OrientedDiscMiIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &OrientedDiscMiIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+    
+    template<int N>
+      static VirtualCurveIntersector::Intersectors OrientedDiscNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiMBIntersector1<N,true>::intersect;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &OrientedDiscMiMBIntersector1<N,true>::occluded;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiMBIntersectorK<N,4,true>::intersect;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &OrientedDiscMiMBIntersectorK<N,4,true>::occluded;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiMBIntersectorK<N,8,true>::intersect;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiMBIntersectorK<N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiMBIntersectorK<N,16,true>::intersect;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiMBIntersectorK<N,16,true>::occluded;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors RibbonNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &CurveNiIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors RibbonNvIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNvIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNvIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &CurveNvIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors RibbonNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiMBIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &CurveNiMBIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors CurveNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors CurveNvIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNvIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNvIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors CurveNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiMBIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors OrientedCurveNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiIntersector1<N>::template occluded_n <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_n <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_n <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_n <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors OrientedCurveNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiMBIntersector1<N>::template occluded_n <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_n <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_n <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_n <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteRibbonNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiIntersector1<N>::template occluded_h <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteRibbonNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiMBIntersector1<N>::template occluded_h <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteCurveNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiIntersector1<N>::template occluded_h <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteCurveNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiMBIntersector1<N>::template occluded_h <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiIntersector1<N>::template occluded_hn <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_hn <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_hn <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_hn <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return intersectors;
+    }
+    
+    template<template<typename Ty> class Curve, int N>
+      static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors intersectors;
+      intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      intersectors.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &CurveNiMBIntersector1<N>::template occluded_hn <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      intersectors.occluded4  = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_hn <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      intersectors.occluded8  = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_hn <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      intersectors.occluded16  = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_hn <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return intersectors;
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/cylinder.h b/thirdparty/embree/kernels/geometry/cylinder.h
new file mode 100644
index 0000000000..dab02989ce
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/cylinder.h
@@ -0,0 +1,223 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct Cylinder
+    {
+      const Vec3fa p0;  //!< start location
+      const Vec3fa p1;  //!< end position
+      const float rr;   //!< squared radius of cylinder
+
+      __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float r) 
+        : p0(p0), p1(p1), rr(sqr(r)) {}
+
+      __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float rr, bool) 
+        : p0(p0), p1(p1), rr(rr) {}
+
+      __forceinline bool intersect(const Vec3fa& org,
+                                   const Vec3fa& dir, 
+                                   BBox1f& t_o, 
+                                   float& u0_o, Vec3fa& Ng0_o,
+                                   float& u1_o, Vec3fa& Ng1_o) const
+      {
+        /* calculate quadratic equation to solve */
+        const float rl = rcp_length(p1-p0);
+        const Vec3fa P0 = p0, dP = (p1-p0)*rl;
+        const Vec3fa O = org-P0, dO = dir;
+        
+        const float dOdO = dot(dO,dO);
+        const float OdO = dot(dO,O);
+        const float OO = dot(O,O);
+        const float dOz = dot(dP,dO);
+        const float Oz = dot(dP,O);
+        
+        const float A = dOdO - sqr(dOz);
+        const float B = 2.0f * (OdO - dOz*Oz);
+        const float C = OO - sqr(Oz) - rr;
+        
+        /* we miss the cylinder if determinant is smaller than zero */
+        const float D = B*B - 4.0f*A*C;
+        if (D < 0.0f) {
+          t_o = BBox1f(pos_inf,neg_inf);
+          return false;
+        }
+        
+        /* special case for rays that are parallel to the cylinder */
+        const float eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        if (abs(A) < eps) 
+        {
+          if (C <= 0.0f) {
+            t_o = BBox1f(neg_inf,pos_inf);
+            return true;
+          } else {
+            t_o = BBox1f(pos_inf,neg_inf);
+            return false;
+          }
+        }
+        
+        /* standard case for rays that are not parallel to the cylinder */
+        const float Q = sqrt(D);
+        const float rcp_2A = rcp(2.0f*A);
+        const float t0 = (-B-Q)*rcp_2A;
+        const float t1 = (-B+Q)*rcp_2A;
+        
+        /* calculates u and Ng for near hit */
+        {
+          u0_o = madd(t0,dOz,Oz)*rl;
+          const Vec3fa Pr = t0*dir;
+          const Vec3fa Pl = madd(u0_o,p1-p0,p0);
+          Ng0_o = Pr-Pl;
+        }
+
+        /* calculates u and Ng for far hit */
+        {
+          u1_o = madd(t1,dOz,Oz)*rl;
+          const Vec3fa Pr = t1*dir;
+          const Vec3fa Pl = madd(u1_o,p1-p0,p0);
+          Ng1_o = Pr-Pl;
+        }
+
+        t_o.lower = t0;
+        t_o.upper = t1;
+        return true;
+      }
+
+      __forceinline bool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox1f& t_o) const
+      {
+        float u0_o; Vec3fa Ng0_o;
+        float u1_o; Vec3fa Ng1_o;
+        return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+
+      static bool verify(const size_t id, const Cylinder& cylinder, const RayHit& ray, bool shouldhit, const float t0, const float t1)
+      {
+        float eps = 0.001f;
+        BBox1f t; bool hit;
+        hit = cylinder.intersect(ray.org,ray.dir,t);
+
+        bool failed = hit != shouldhit;
+        if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : abs(t0-t.lower) > eps;
+        if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : abs(t1-t.upper) > eps;
+        if (!failed) return true;
+        embree_cout << "Cylinder test " << id << " failed: cylinder = " << cylinder << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; 
+        return false;
+      }
+
+      /* verify cylinder class */
+      static bool verify()
+      {
+        bool passed = true;
+        const Cylinder cylinder(Vec3fa(0.0f,0.0f,0.0f),Vec3fa(1.0f,0.0f,0.0f),1.0f);
+        passed &= verify(0,cylinder,RayHit(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(1,cylinder,RayHit(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(2,cylinder,RayHit(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(3,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(4,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(5,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        passed &= verify(6,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        return passed;
+      }
+
+      /*! output operator */
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cylinder& c) {
+        return cout << "Cylinder { p0 = " << c.p0 << ", p1 = " << c.p1 << ", r = " << sqrtf(c.rr) << "}";
+      }
+    };
+
+    template<int N>
+      struct CylinderN
+    { 
+      const Vec3vf<N> p0;     //!< start location
+      const Vec3vf<N> p1;     //!< end position
+      const vfloat<N> rr;   //!< squared radius of cylinder
+
+      __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& r)
+        : p0(p0), p1(p1), rr(sqr(r)) {}
+
+      __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& rr, bool)
+        : p0(p0), p1(p1), rr(rr) {}
+
+     
+      __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, 
+                                       BBox<vfloat<N>>& t_o, 
+                                       vfloat<N>& u0_o, Vec3vf<N>& Ng0_o,
+                                       vfloat<N>& u1_o, Vec3vf<N>& Ng1_o) const
+      {
+        /* calculate quadratic equation to solve */
+        const vfloat<N> rl = rcp_length(p1-p0);
+        const Vec3vf<N> P0 = p0, dP = (p1-p0)*rl;
+        const Vec3vf<N> O = Vec3vf<N>(org)-P0, dO = dir;
+        
+        const vfloat<N> dOdO = dot(dO,dO);
+        const vfloat<N> OdO = dot(dO,O);
+        const vfloat<N> OO = dot(O,O);
+        const vfloat<N> dOz = dot(dP,dO);
+        const vfloat<N> Oz = dot(dP,O);
+        
+        const vfloat<N> A = dOdO - sqr(dOz);
+        const vfloat<N> B = 2.0f * (OdO - dOz*Oz);
+        const vfloat<N> C = OO - sqr(Oz) - rr;
+        
+        /* we miss the cylinder if determinant is smaller than zero */
+        const vfloat<N> D = B*B - 4.0f*A*C;
+        vbool<N> valid = D >= 0.0f;
+        if (none(valid)) {
+          t_o = BBox<vfloat<N>>(empty);
+          return valid;
+        }
+
+        /* standard case for rays that are not parallel to the cylinder */
+        const vfloat<N> Q = sqrt(D);
+        const vfloat<N> rcp_2A = rcp(2.0f*A);
+        const vfloat<N> t0 = (-B-Q)*rcp_2A;
+        const vfloat<N> t1 = (-B+Q)*rcp_2A;
+        
+        /* calculates u and Ng for near hit */
+        {
+          u0_o = madd(t0,dOz,Oz)*rl;
+          const Vec3vf<N> Pr = t0*Vec3vf<N>(dir);
+          const Vec3vf<N> Pl = madd(u0_o,p1-p0,p0);
+          Ng0_o = Pr-Pl;
+        }
+        
+        /* calculates u and Ng for far hit */
+        {
+          u1_o = madd(t1,dOz,Oz)*rl;
+          const Vec3vf<N> Pr = t1*Vec3vf<N>(dir);
+          const Vec3vf<N> Pl = madd(u1_o,p1-p0,p0);
+          Ng1_o = Pr-Pl;
+        }
+
+        t_o.lower = select(valid, t0, vfloat<N>(pos_inf));
+        t_o.upper = select(valid, t1, vfloat<N>(neg_inf));
+
+        /* special case for rays that are parallel to the cylinder */
+        const vfloat<N> eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        vbool<N> validt = valid & (abs(A) < eps); 
+        if (unlikely(any(validt))) 
+        {
+          vbool<N> inside = C <= 0.0f;
+          t_o.lower = select(validt,select(inside,vfloat<N>(neg_inf),vfloat<N>(pos_inf)),t_o.lower);
+          t_o.upper = select(validt,select(inside,vfloat<N>(pos_inf),vfloat<N>(neg_inf)),t_o.upper);
+          valid &= !validt | inside;
+        }
+        return valid;
+      }
+
+      __forceinline vbool<N> intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const
+      {
+        vfloat<N> u0_o; Vec3vf<N> Ng0_o;
+        vfloat<N> u1_o; Vec3vf<N> Ng1_o;
+        return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+    };
+  }
+}
+
diff --git a/thirdparty/embree/kernels/geometry/disc_intersector.h b/thirdparty/embree/kernels/geometry/disc_intersector.h
new file mode 100644
index 0000000000..816c066899
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/disc_intersector.h
@@ -0,0 +1,216 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_points.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct DiscIntersectorHitM
+    {
+      __forceinline DiscIntersectorHitM() {}
+
+      __forceinline DiscIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng)
+      {
+      }
+
+      __forceinline void finalize() {}
+
+      __forceinline Vec2f uv(const size_t i) const
+      {
+        return Vec2f(vu[i], vv[i]);
+      }
+      __forceinline float t(const size_t i) const
+      {
+        return vt[i];
+      }
+      __forceinline Vec3fa Ng(const size_t i) const
+      {
+        return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]);
+      }
+
+     public:
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+
+    template<int M>
+    struct DiscIntersector1
+    {
+      typedef CurvePrecalculations1 Precalculations;
+
+      template<typename Epilog>
+      static __forceinline bool intersect(
+          const vbool<M>& valid_i,
+          Ray& ray,
+          IntersectContext* context,
+          const Points* geom,
+          const Precalculations& pre,
+          const Vec4vf<M>& v0i,
+          const Epilog& epilog)
+      {
+        vbool<M> valid = valid_i;
+
+        const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+        const vfloat<M> rd2    = rcp(dot(ray_dir, ray_dir));
+
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        const Vec3vf<M> c0     = center - ray_org;
+        const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+
+        valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar));
+        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+          valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale;  // ignore self intersections
+        if (unlikely(none(valid)))
+          return false;
+        
+        const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
+        const vfloat<M> l2     = dot(perp, perp);
+        const vfloat<M> r2     = radius * radius;
+        valid &= (l2 <= r2);
+        if (unlikely(none(valid)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
+        return epilog(valid, hit);
+      }
+
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i,
+                                          Ray& ray,
+                                          IntersectContext* context,
+                                          const Points* geom,
+                                          const Precalculations& pre,
+                                          const Vec4vf<M>& v0i,
+                                          const Vec3vf<M>& normal,
+                                          const Epilog& epilog)
+      {
+        vbool<M> valid         = valid_i;
+        const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        vfloat<M> divisor       = dot(Vec3vf<M>((Vec3fa)ray.dir), normal);
+        const vbool<M> parallel = divisor == vfloat<M>(0.f);
+        valid &= !parallel;
+        divisor = select(parallel, 1.f, divisor);  // prevent divide by zero
+
+        vfloat<M> t = dot(center - Vec3vf<M>((Vec3fa)ray.org), Vec3vf<M>(normal)) / divisor;
+
+        valid &= (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar));
+        if (unlikely(none(valid)))
+          return false;
+
+        Vec3vf<M> intersection = Vec3vf<M>((Vec3fa)ray.org) + Vec3vf<M>((Vec3fa)ray.dir) * t;
+        vfloat<M> dist2        = dot(intersection - center, intersection - center);
+        valid &= dist2 < radius * radius;
+        if (unlikely(none(valid)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, t, normal);
+        return epilog(valid, hit);
+      }
+    };
+
+    template<int M, int K>
+    struct DiscIntersectorK
+    {
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i,
+                                          RayK<K>& ray,
+                                          size_t k,
+                                          IntersectContext* context,
+                                          const Points* geom,
+                                          const Precalculations& pre,
+                                          const Vec4vf<M>& v0i,
+                                          const Epilog& epilog)
+      {
+        vbool<M> valid = valid_i;
+
+        const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+        const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+        const vfloat<M> rd2    = rcp(dot(ray_dir, ray_dir));
+
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        const Vec3vf<M> c0     = center - ray_org;
+        const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+
+        valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k]));
+        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+          valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k];  // ignore self intersections
+        if (unlikely(none(valid)))
+          return false;
+
+        const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
+        const vfloat<M> l2     = dot(perp, perp);
+        const vfloat<M> r2     = radius * radius;
+        valid &= (l2 <= r2);
+        if (unlikely(none(valid)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
+        return epilog(valid, hit);
+      }
+
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i,
+                                          RayK<K>& ray,
+                                          size_t k,
+                                          IntersectContext* context,
+                                          const Points* geom,
+                                          const Precalculations& pre,
+                                          const Vec4vf<M>& v0i,
+                                          const Vec3vf<M>& normal,
+                                          const Epilog& epilog)
+      {
+        vbool<M> valid         = valid_i;
+        const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+        const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+        
+        vfloat<M> divisor       = dot(Vec3vf<M>(ray_dir), normal);
+        const vbool<M> parallel = divisor == vfloat<M>(0.f);
+        valid &= !parallel;
+        divisor = select(parallel, 1.f, divisor);  // prevent divide by zero
+
+        vfloat<M> t = dot(center - Vec3vf<M>(ray_org), Vec3vf<M>(normal)) / divisor;
+
+        valid &= (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k]));
+        if (unlikely(none(valid)))
+          return false;
+
+        Vec3vf<M> intersection = Vec3vf<M>(ray_org) + Vec3vf<M>(ray_dir) * t;
+        vfloat<M> dist2        = dot(intersection - center, intersection - center);
+        valid &= dist2 < radius * radius;
+        if (unlikely(none(valid)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, t, normal);
+        return epilog(valid, hit);
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree/kernels/geometry/disci_intersector.h b/thirdparty/embree/kernels/geometry/disci_intersector.h
new file mode 100644
index 0000000000..bb9d396f6e
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/disci_intersector.h
@@ -0,0 +1,277 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "disc_intersector.h"
+#include "intersector_epilog.h"
+#include "pointi.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, bool filter>
+    struct DiscMiIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre,
+                                          RayHit& ray,
+                                          IntersectContext* context,
+                                          const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Disc.gather(v0, geom);
+        const vbool<M> valid = Disc.valid();
+        DiscIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre,
+                                         Ray& ray,
+                                         IntersectContext* context,
+                                         const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Disc.gather(v0, geom);
+        const vbool<M> valid = Disc.valid();
+        return DiscIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, bool filter>
+    struct DiscMiMBIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre,
+                                          RayHit& ray,
+                                          IntersectContext* context,
+                                          const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Disc.gather(v0, geom, ray.time());
+        const vbool<M> valid = Disc.valid();
+        DiscIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre,
+                                         Ray& ray,
+                                         IntersectContext* context,
+                                         const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Disc.gather(v0, geom, ray.time());
+        const vbool<M> valid = Disc.valid();
+        return DiscIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct DiscMiIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Disc.gather(v0, geom);
+        const vbool<M> valid = Disc.valid();
+        DiscIntersectorK<M, K>::intersect(
+            valid, ray, k, context, geom, pre, v0,
+            Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      static __forceinline bool occluded(
+          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Disc.gather(v0, geom);
+        const vbool<M> valid = Disc.valid();
+        return DiscIntersectorK<M, K>::intersect(
+          valid, ray, k, context, geom, pre, v0,
+          Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct DiscMiMBIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()[k]);
+        const vbool<M> valid = Disc.valid();
+        DiscIntersectorK<M, K>::intersect(
+          valid, ray, k, context, geom, pre, v0,
+          Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      static __forceinline bool occluded(
+          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()[k]);
+        const vbool<M> valid = Disc.valid();
+        return DiscIntersectorK<M, K>::intersect(
+          valid, ray, k, context, geom, pre, v0, Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, bool filter>
+    struct OrientedDiscMiIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre,
+                                          RayHit& ray,
+                                          IntersectContext* context,
+                                          const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Vec3vf<M> n0;
+        Disc.gather(v0, n0, geom);
+        const vbool<M> valid = Disc.valid();
+        DiscIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre,
+                                         Ray& ray,
+                                         IntersectContext* context,
+                                         const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Vec3vf<M> n0;
+        Disc.gather(v0, n0, geom);
+        const vbool<M> valid = Disc.valid();
+        return DiscIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, bool filter>
+    struct OrientedDiscMiMBIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre,
+                                          RayHit& ray,
+                                          IntersectContext* context,
+                                          const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Vec3vf<M> n0;
+        Disc.gather(v0, n0, geom, ray.time());
+        const vbool<M> valid = Disc.valid();
+        DiscIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre,
+                                         Ray& ray,
+                                         IntersectContext* context,
+                                         const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Vec3vf<M> n0;
+        Disc.gather(v0, n0, geom, ray.time());
+        const vbool<M> valid = Disc.valid();
+        return DiscIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct OrientedDiscMiIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Vec3vf<M> n0;
+        Disc.gather(v0, n0, geom);
+        const vbool<M> valid = Disc.valid();
+        DiscIntersectorK<M, K>::intersect(
+            valid, ray, k, context, geom, pre, v0, n0,
+            Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      static __forceinline bool occluded(
+          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Vec3vf<M> n0;
+        Disc.gather(v0, n0, geom);
+        const vbool<M> valid = Disc.valid();
+        return DiscIntersectorK<M, K>::intersect(
+            valid, ray, k, context, geom, pre, v0, n0,
+            Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct OrientedDiscMiMBIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Vec3vf<M> n0;
+        Disc.gather(v0, n0, geom, ray.time()[k]);
+        const vbool<M> valid = Disc.valid();
+        DiscIntersectorK<M, K>::intersect(
+            valid, ray, k, context, geom, pre, v0, n0,
+            Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      static __forceinline bool occluded(
+          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> v0; Vec3vf<M> n0;
+        Disc.gather(v0, n0, geom, ray.time()[k]);
+        const vbool<M> valid = Disc.valid();
+        return DiscIntersectorK<M, K>::intersect(
+            valid, ray, k, context, geom, pre, v0, n0,
+            Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree/kernels/geometry/filter.h b/thirdparty/embree/kernels/geometry/filter.h
new file mode 100644
index 0000000000..3b4d924ea7
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/filter.h
@@ -0,0 +1,204 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+#include "../common/ray.h"
+#include "../common/hit.h"
+#include "../common/context.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      if (geometry->intersectionFilterN)
+      {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->intersectionFilterN(args);
+
+        if (args->valid[0] == 0)
+          return false;
+      }
+            
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+
+        if (args->valid[0] == 0)
+          return false;
+      }
+      
+      copyHitToRay(*(RayHit*)args->ray,*(Hit*)args->hit);
+      return true;
+    }
+    
+    __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, IntersectContext* context, Hit& hit)
+    {
+      RTCFilterFunctionNArguments args;
+      int mask = -1;
+      args.valid = &mask;
+      args.geometryUserPtr = geometry->userPtr;
+      args.context = context->user;
+      args.ray = (RTCRayN*)&ray;
+      args.hit = (RTCHitN*)&hit;
+      args.N = 1;
+      return runIntersectionFilter1Helper(&args,geometry,context);
+    }
+
+    __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
+    {
+#if defined(EMBREE_FILTER_FUNCTION)
+      IntersectContext* MAYBE_UNUSED context = args->internal_context;
+      const Geometry* const geometry = args->geometry;
+      if (geometry->intersectionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->intersectionFilterN(filter_args);
+      }
+      
+      //if (args->valid[0] == 0)
+      //  return;
+
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(filter_args);
+      }
+#endif
+    }
+    
+    __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      if (geometry->occlusionFilterN)
+      {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->occlusionFilterN(args);
+
+        if (args->valid[0] == 0)
+          return false;
+      }
+      
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+
+        if (args->valid[0] == 0)
+          return false;
+      }
+      return true;
+    }
+
+    __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, IntersectContext* context, Hit& hit)
+    {
+      RTCFilterFunctionNArguments args;
+      int mask = -1;
+      args.valid = &mask;
+      args.geometryUserPtr = geometry->userPtr;
+      args.context = context->user;
+      args.ray = (RTCRayN*)&ray;
+      args.hit = (RTCHitN*)&hit;
+      args.N = 1;
+      return runOcclusionFilter1Helper(&args,geometry,context);
+    }
+
+    __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
+    {
+#if defined(EMBREE_FILTER_FUNCTION)
+      IntersectContext* MAYBE_UNUSED context = args->internal_context;
+      const Geometry* const geometry = args->geometry;
+      if (geometry->occlusionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->occlusionFilterN(filter_args);
+      }
+      
+      //if (args->valid[0] == 0)
+      //  return false;
+      
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(filter_args);
+      }
+#endif
+    }
+
+    template<int K>
+      __forceinline vbool<K> runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      vint<K>* mask = (vint<K>*) args->valid;
+      if (geometry->intersectionFilterN)
+      {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->intersectionFilterN(args);
+      }
+
+      vbool<K> valid_o = *mask != vint<K>(zero);
+      if (none(valid_o)) return valid_o;
+
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+      }
+
+      valid_o = *mask != vint<K>(zero);
+      if (none(valid_o)) return valid_o;
+      
+      copyHitToRay(valid_o,*(RayHitK<K>*)args->ray,*(HitK<K>*)args->hit);
+      return valid_o;
+    }
+    
+    template<int K>
+    __forceinline vbool<K> runIntersectionFilter(const vbool<K>& valid, const Geometry* const geometry, RayHitK<K>& ray, IntersectContext* context, HitK<K>& hit)
+    {
+      RTCFilterFunctionNArguments args;
+      vint<K> mask = valid.mask32();
+      args.valid = (int*)&mask;
+      args.geometryUserPtr = geometry->userPtr;
+      args.context = context->user;
+      args.ray = (RTCRayN*)&ray;
+      args.hit = (RTCHitN*)&hit;
+      args.N = K;
+      return runIntersectionFilterHelper<K>(&args,geometry,context);
+    }
+
+    template<int K>
+      __forceinline vbool<K> runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      vint<K>* mask = (vint<K>*) args->valid;
+      if (geometry->occlusionFilterN)
+      {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->occlusionFilterN(args);
+      }
+
+      vbool<K> valid_o = *mask != vint<K>(zero);
+      
+      if (none(valid_o)) return valid_o;
+
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+      }
+
+      valid_o = *mask != vint<K>(zero);
+
+      RayK<K>* ray = (RayK<K>*) args->ray;
+      ray->tfar = select(valid_o, vfloat<K>(neg_inf), ray->tfar);
+      return valid_o;
+    }
+
+    template<int K>
+      __forceinline vbool<K> runOcclusionFilter(const vbool<K>& valid, const Geometry* const geometry, RayK<K>& ray, IntersectContext* context, HitK<K>& hit)
+    {
+      RTCFilterFunctionNArguments args;
+      vint<K> mask = valid.mask32();
+      args.valid = (int*)&mask;
+      args.geometryUserPtr = geometry->userPtr;
+      args.context = context->user;
+      args.ray = (RTCRayN*)&ray;
+      args.hit = (RTCHitN*)&hit;
+      args.N = K;
+      return runOcclusionFilterHelper<K>(&args,geometry,context);
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/grid_intersector.h b/thirdparty/embree/kernels/geometry/grid_intersector.h
new file mode 100644
index 0000000000..9c59cef119
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/grid_intersector.h
@@ -0,0 +1,99 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "grid_soa_intersector1.h"
+#include "grid_soa_intersector_packet.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename T>
+      class SubdivPatch1Precalculations : public T
+    { 
+    public:
+      __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr)
+        : T(ray,ptr) {}
+    };
+
+    template<int K, typename T>
+      class SubdivPatch1PrecalculationsK : public T
+    { 
+    public:
+      __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray)
+        : T(valid,ray) {}
+    };
+
+    class Grid1Intersector1
+    {
+    public:
+      typedef GridSOA Primitive;
+      typedef Grid1Precalculations<GridSOAIntersector1::Precalculations> Precalculations;
+
+      /*! Intersect a ray with the primitive. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) 
+      {
+        GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
+      }
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        intersect(pre,ray,context,prim,ty,lazy_node);
+      }
+      
+      /*! Test if the ray is occluded by the primitive */
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node);
+      }
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        return occluded(pre,ray,context,prim,ty,lazy_node);
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        assert(false && "not implemented");
+        return false;
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        assert(false && "not implemented");
+        return false;
+      }
+    };
+
+    template <int K>
+      struct GridIntersectorK
+    {
+      typedef GridSOA Primitive;
+      typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
+      
+      
+      static __forceinline void intersect(const vbool<K>& valid, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+      }
+      
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+      }
+      
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+      }
+    };
+
+    typedef Grid1IntersectorK<4>  SubdivPatch1Intersector4;
+    typedef Grid1IntersectorK<8>  SubdivPatch1Intersector8;
+    typedef Grid1IntersectorK<16> SubdivPatch1Intersector16;
+
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/grid_soa.h b/thirdparty/embree/kernels/geometry/grid_soa.h
new file mode 100644
index 0000000000..cea90aedf6
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/grid_soa.h
@@ -0,0 +1,275 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_subdiv_mesh.h"
+#include "../bvh/bvh.h"
+#include "../subdiv/tessellation.h"
+#include "../subdiv/tessellation_cache.h"
+#include "subdivpatch1.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    class GridSOA
+    {
+    public:
+
+      /*! GridSOA constructor */
+      GridSOA(const SubdivPatch1Base* patches, const unsigned time_steps,
+              const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight,
+              const SubdivMesh* const geom, const size_t totalBvhBytes, const size_t gridBytes, BBox3fa* bounds_o = nullptr);
+
+      /*! Subgrid creation */
+      template<typename Allocator>
+        static GridSOA* create(const SubdivPatch1Base* patches, const unsigned time_steps,
+                               unsigned x0, unsigned x1, unsigned y0, unsigned y1, 
+                               const Scene* scene, Allocator& alloc, BBox3fa* bounds_o = nullptr)
+      {
+        const unsigned width = x1-x0+1;  
+        const unsigned height = y1-y0+1; 
+        const GridRange range(0,width-1,0,height-1);
+        size_t bvhBytes = 0;
+        if (time_steps == 1) 
+          bvhBytes = getBVHBytes(range,sizeof(BVH4::AABBNode),0);
+        else {
+          bvhBytes = (time_steps-1)*getBVHBytes(range,sizeof(BVH4::AABBNodeMB),0);
+          bvhBytes += getTemporalBVHBytes(make_range(0,int(time_steps-1)),sizeof(BVH4::AABBNodeMB4D));
+        }
+        const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float);  
+        size_t rootBytes = time_steps*sizeof(BVH4::NodeRef);
+#if !defined(__64BIT__)
+        rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding.
+#endif
+        void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes);
+        assert(data);
+        return new (data) GridSOA(patches,time_steps,x0,x1,y0,y1,patches->grid_u_res,patches->grid_v_res,scene->get<SubdivMesh>(patches->geomID()),bvhBytes,gridBytes,bounds_o);
+      }
+
+      /*! Grid creation */
+      template<typename Allocator>
+        static GridSOA* create(const SubdivPatch1Base* const patches, const unsigned time_steps,
+                               const Scene* scene, const Allocator& alloc, BBox3fa* bounds_o = nullptr) 
+      {
+        return create(patches,time_steps,0,patches->grid_u_res-1,0,patches->grid_v_res-1,scene,alloc,bounds_o);
+      }
+
+       /*! returns reference to root */
+      __forceinline       BVH4::NodeRef& root(size_t t = 0)       { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
+      __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
+
+      /*! returns pointer to BVH array */
+      __forceinline       char* bvhData()       { return &data[0]; }
+      __forceinline const char* bvhData() const { return &data[0]; }
+
+      /*! returns pointer to Grid array */
+      __forceinline       float* gridData(size_t t = 0)       { return (float*) &data[gridOffset + t*gridBytes]; }
+      __forceinline const float* gridData(size_t t = 0) const { return (float*) &data[gridOffset + t*gridBytes]; }
+      
+      __forceinline void* encodeLeaf(size_t u, size_t v) {
+        return (void*) (16*(v * width + u + 1)); // +1 to not create empty leaf
+      }
+      __forceinline float* decodeLeaf(size_t t, const void* ptr) {
+        return gridData(t) + (((size_t) (ptr) >> 4) - 1);
+      }
+
+      /*! returns the size of the BVH over the grid in bytes */
+      static size_t getBVHBytes(const GridRange& range, const size_t nodeBytes, const size_t leafBytes);
+
+      /*! returns the size of the temporal BVH over the time range BVHs */
+      static size_t getTemporalBVHBytes(const range<int> time_range, const size_t nodeBytes);
+
+      /*! calculates bounding box of grid range */
+      __forceinline BBox3fa calculateBounds(size_t time, const GridRange& range) const
+      {
+        const float* const grid_array = gridData(time);
+        const float* const grid_x_array = grid_array + 0 * dim_offset;
+        const float* const grid_y_array = grid_array + 1 * dim_offset;
+        const float* const grid_z_array = grid_array + 2 * dim_offset;
+        
+        /* compute the bounds just for the range! */
+        BBox3fa bounds( empty );
+        for (unsigned v = range.v_start; v<=range.v_end; v++) 
+        {
+          for (unsigned u = range.u_start; u<=range.u_end; u++)
+          {
+            const float x = grid_x_array[ v * width + u];
+            const float y = grid_y_array[ v * width + u];
+            const float z = grid_z_array[ v * width + u];
+            bounds.extend( Vec3fa(x,y,z) );
+          }
+        }
+        assert(is_finite(bounds));
+        return bounds;
+      }
+
+      /*! Evaluates grid over patch and builds BVH4 tree over the grid. */
+      std::pair<BVH4::NodeRef,BBox3fa> buildBVH(BBox3fa* bounds_o);
+      
+      /*! Create BVH4 tree over grid. */
+      std::pair<BVH4::NodeRef,BBox3fa> buildBVH(const GridRange& range, size_t& allocator);
+
+      /*! Evaluates grid over patch and builds MSMBlur BVH4 tree over the grid. */
+      std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, BBox3fa* bounds_o);
+      
+      /*! Create MBlur BVH4 tree over grid. */
+      std::pair<BVH4::NodeRef,LBBox3fa> buildMBlurBVH(size_t time, const GridRange& range, size_t& allocator);
+
+      /*! Create MSMBlur BVH4 tree over grid. */
+      std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, size_t& allocator, BBox3fa* bounds_o);
+
+      template<typename Loader>
+        struct MapUV
+      {
+        typedef typename Loader::vfloat vfloat;
+        const float* const grid_uv;
+        size_t line_offset;
+        size_t lines;
+
+        __forceinline MapUV(const float* const grid_uv, size_t line_offset, const size_t lines)
+          : grid_uv(grid_uv), line_offset(line_offset), lines(lines) {}
+
+        __forceinline void operator() (vfloat& u, vfloat& v, Vec3<vfloat>& Ng) const {
+          const Vec3<vfloat> tri_v012_uv = Loader::gather(grid_uv,line_offset,lines);	
+          const Vec2<vfloat> uv0 = GridSOA::decodeUV(tri_v012_uv[0]);
+          const Vec2<vfloat> uv1 = GridSOA::decodeUV(tri_v012_uv[1]);
+          const Vec2<vfloat> uv2 = GridSOA::decodeUV(tri_v012_uv[2]);        
+          const Vec2<vfloat> uv = u * uv1 + v * uv2 + (1.0f-u-v) * uv0;        
+          u = uv[0];v = uv[1]; 
+        }
+      };
+
+      struct Gather2x3
+      {
+        enum { M = 4 };
+        typedef vbool4 vbool;
+        typedef vint4 vint;
+        typedef vfloat4 vfloat;
+        
+        static __forceinline const Vec3vf4 gather(const float* const grid, const size_t line_offset, const size_t lines)
+        {
+          vfloat4 r0 = vfloat4::loadu(grid + 0*line_offset);
+          vfloat4 r1 = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid
+          if (unlikely(line_offset == 2))
+          {
+            r0 = shuffle<0,1,1,1>(r0);
+            r1 = shuffle<0,1,1,1>(r1);
+          }
+          return Vec3vf4(unpacklo(r0,r1),       // r00, r10, r01, r11
+                         shuffle<1,1,2,2>(r0),  // r01, r01, r02, r02
+                         shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12
+        }
+
+        static __forceinline void gather(const float* const grid_x, 
+                                         const float* const grid_y, 
+                                         const float* const grid_z, 
+                                         const size_t line_offset,
+                                         const size_t lines,
+                                         Vec3vf4& v0_o,
+                                         Vec3vf4& v1_o,
+                                         Vec3vf4& v2_o)
+        {
+          const Vec3vf4 tri_v012_x = gather(grid_x,line_offset,lines);
+          const Vec3vf4 tri_v012_y = gather(grid_y,line_offset,lines);
+          const Vec3vf4 tri_v012_z = gather(grid_z,line_offset,lines);
+          v0_o = Vec3vf4(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]);
+          v1_o = Vec3vf4(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]);
+          v2_o = Vec3vf4(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]);
+        }
+      };
+      
+#if defined (__AVX__)
+      struct Gather3x3
+      {
+        enum { M = 8 };
+        typedef vbool8 vbool;
+        typedef vint8 vint;
+        typedef vfloat8 vfloat;
+        
+        static __forceinline const Vec3vf8 gather(const float* const grid, const size_t line_offset, const size_t lines)
+        {
+          vfloat4 ra = vfloat4::loadu(grid + 0*line_offset);
+          vfloat4 rb = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid
+          vfloat4 rc;
+          if (likely(lines > 2)) 
+            rc = vfloat4::loadu(grid + 2*line_offset);
+          else                   
+            rc = rb;
+
+          if (unlikely(line_offset == 2))
+          {
+            ra = shuffle<0,1,1,1>(ra);
+            rb = shuffle<0,1,1,1>(rb);
+            rc = shuffle<0,1,1,1>(rc);
+          }
+          
+          const vfloat8 r0 = vfloat8(ra,rb);
+          const vfloat8 r1 = vfloat8(rb,rc);
+          return Vec3vf8(unpacklo(r0,r1),         // r00, r10, r01, r11, r10, r20, r11, r21
+                         shuffle<1,1,2,2>(r0),    // r01, r01, r02, r02, r11, r11, r12, r12
+                         shuffle<0,1,1,2>(r1));   // r10, r11, r11, r12, r20, r21, r21, r22
+        }
+
+        static __forceinline void gather(const float* const grid_x, 
+                                         const float* const grid_y, 
+                                         const float* const grid_z, 
+                                         const size_t line_offset,
+                                         const size_t lines,
+                                         Vec3vf8& v0_o,
+                                         Vec3vf8& v1_o,
+                                         Vec3vf8& v2_o)
+        {
+          const Vec3vf8 tri_v012_x = gather(grid_x,line_offset,lines);
+          const Vec3vf8 tri_v012_y = gather(grid_y,line_offset,lines);
+          const Vec3vf8 tri_v012_z = gather(grid_z,line_offset,lines);
+          v0_o = Vec3vf8(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]);
+          v1_o = Vec3vf8(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]);
+          v2_o = Vec3vf8(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]);
+        }
+      };
+#endif
+
+      template<typename vfloat>
+      static __forceinline Vec2<vfloat> decodeUV(const vfloat& uv)
+      {
+        typedef typename vfloat::Int vint;
+        const vint iu  = asInt(uv) & 0xffff;
+        const vint iv  = srl(asInt(uv),16);
+	const vfloat u = (vfloat)iu * vfloat(8.0f/0x10000);
+	const vfloat v = (vfloat)iv * vfloat(8.0f/0x10000);
+	return Vec2<vfloat>(u,v);
+      }
+      
+      __forceinline unsigned int geomID() const  {
+        return _geomID;
+      } 
+      
+      __forceinline unsigned int primID() const  {
+        return _primID;
+      } 
+
+    public:
+      BVH4::NodeRef troot;
+#if !defined(__64BIT__)
+      unsigned align1;
+#endif
+      unsigned time_steps;
+      unsigned width;
+
+      unsigned height;
+      unsigned dim_offset;
+      unsigned _geomID;
+      unsigned _primID;
+
+      unsigned align2;
+      unsigned gridOffset;
+      unsigned gridBytes;
+      unsigned rootOffset;
+
+      char data[1];        //!< after the struct we first store the BVH, then the grid, and finally the roots
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h b/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h
new file mode 100644
index 0000000000..8fbf0d4bdf
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h
@@ -0,0 +1,207 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "../common/ray.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    class GridSOAIntersector1
+    {
+    public:
+      typedef void Primitive;
+      
+      class Precalculations
+      { 
+      public:
+        __forceinline Precalculations (const Ray& ray, const void* ptr)
+          : grid(nullptr) {}
+        
+      public:
+        GridSOA* grid;
+        int itime;
+        float ftime;
+      };
+      
+      template<typename Loader>
+        static __forceinline void intersect(RayHit& ray,
+                                            IntersectContext* context, 
+                                            const float* const grid_x,
+                                            const size_t line_offset,
+                                            const size_t lines,
+                                            Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+        Vec3<vfloat> v0, v1, v2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);       
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      };
+      
+      template<typename Loader>
+        static __forceinline bool occluded(Ray& ray,
+                                           IntersectContext* context, 
+                                           const float* const grid_x,
+                                           const size_t line_offset,
+                                           const size_t lines,
+                                           Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> v0, v1, v2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+        
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+      
+      /*! Intersect a ray with the primitive. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) 
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+        
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, context, grid_x            , line_offset, lines, pre);
+        if (likely(lines > 2))
+          intersect<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+      
+      /*! Test if the ray is occluded by the primitive */
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+        
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, context, grid_x            , line_offset, lines, pre)) return true;
+        if (likely(lines > 2))
+          if (occluded<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false;
+      }      
+    };
+
+    class GridSOAMBIntersector1
+    {
+    public:
+      typedef void Primitive;
+      typedef GridSOAIntersector1::Precalculations Precalculations;
+      
+      template<typename Loader>
+        static __forceinline void intersect(RayHit& ray, const float ftime,
+                                            IntersectContext* context, 
+                                            const float* const grid_x,
+                                            const size_t line_offset,
+                                            const size_t lines,
+                                            Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2;
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      };
+      
+      template<typename Loader>
+        static __forceinline bool occluded(Ray& ray, const float ftime,
+                                           IntersectContext* context, 
+                                           const float* const grid_x,
+                                           const size_t line_offset,
+                                           const size_t lines,
+                                           Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2;
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+       
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+        
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+      
+      /*! Intersect a ray with the primitive. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) 
+      { 
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(pre.itime,prim);
+        
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+        if (likely(lines > 2))
+          intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+      
+      /*! Test if the ray is occluded by the primitive */
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(pre.itime,prim);
+        
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x            , line_offset, lines, pre)) return true;
+        if (likely(lines > 2))
+          if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false;
+      }      
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h b/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h
new file mode 100644
index 0000000000..14cacab5fe
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h
@@ -0,0 +1,445 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "../common/ray.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int K>
+      struct MapUV0
+    {
+      const float* const grid_uv;
+      size_t ofs00, ofs01, ofs10, ofs11;
+      
+      __forceinline MapUV0(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11)
+        : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {}
+      
+      __forceinline void operator() (vfloat<K>& u, vfloat<K>& v, Vec3vf<K>& Ng) const {
+        const vfloat<K> uv00(grid_uv[ofs00]);
+        const vfloat<K> uv01(grid_uv[ofs01]);
+        const vfloat<K> uv10(grid_uv[ofs10]);
+        const vfloat<K> uv11(grid_uv[ofs11]);
+        const Vec2vf<K> uv0 = GridSOA::decodeUV(uv00);
+        const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01);
+        const Vec2vf<K> uv2 = GridSOA::decodeUV(uv10);
+        const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0));
+        u = uv[0]; v = uv[1];
+      }
+    };
+    
+    template<int K>
+      struct MapUV1
+    {
+      const float* const grid_uv;
+      size_t ofs00, ofs01, ofs10, ofs11;
+      
+      __forceinline MapUV1(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11)
+        : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {}
+      
+      __forceinline void operator() (vfloat<K>& u, vfloat<K>& v, Vec3vf<K>& Ng) const {
+        const vfloat<K> uv00(grid_uv[ofs00]);
+        const vfloat<K> uv01(grid_uv[ofs01]);
+        const vfloat<K> uv10(grid_uv[ofs10]);
+        const vfloat<K> uv11(grid_uv[ofs11]);
+        const Vec2vf<K> uv0 = GridSOA::decodeUV(uv10);
+        const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01);
+        const Vec2vf<K> uv2 = GridSOA::decodeUV(uv11);
+        const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0));
+        u = uv[0]; v = uv[1];
+      }
+    };
+    
+    template<int K>
+      class GridSOAIntersectorK
+    {
+    public:
+      typedef void Primitive;
+
+      class Precalculations
+      {
+#if defined(__AVX__)
+        static const int M = 8;
+#else
+        static const int M = 4;
+#endif
+
+      public:
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray)
+          : grid(nullptr), intersector(valid,ray) {}
+
+      public:
+        GridSOA* grid;
+        PlueckerIntersectorK<M,K> intersector; // FIXME: use quad intersector
+      };
+
+      /*! Intersect a ray with the primitive. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2;
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2;
+        for (size_t y=0; y<max_y; y++)        
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            const size_t ofs00 = (y+0)*line_offset+(x+0);
+            const size_t ofs01 = (y+0)*line_offset+(x+1);
+            const size_t ofs10 = (y+1)*line_offset+(x+0);
+            const size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+
+            pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+            pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+          }
+        }
+      }
+
+      /*! Test if the ray is occluded by the primitive */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        vbool<K> valid = valid_i;
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2;
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2;
+        for (size_t y=0; y<max_y; y++)        
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            const size_t ofs00 = (y+0)*line_offset+(x+0);
+            const size_t ofs01 = (y+0)*line_offset+(x+1);
+            const size_t ofs10 = (y+1)*line_offset+(x+0);
+            const size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+
+            pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+            if (none(valid)) break;
+            pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+            if (none(valid)) break;
+          }
+        }
+        return !valid;
+      }
+
+      template<typename Loader>
+        static __forceinline void intersect(RayHitK<K>& ray, size_t k,
+                                            IntersectContext* context,
+                                            const float* const grid_x,
+                                            const size_t line_offset,
+                                            const size_t lines,
+                                            Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+        Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+        pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      };
+
+      template<typename Loader>
+        static __forceinline bool occluded(RayK<K>& ray, size_t k,
+                                           IntersectContext* context,
+                                           const float* const grid_x,
+                                           const size_t line_offset,
+                                           const size_t lines,
+                                           Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+        Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+        return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+
+      /*! Intersect a ray with the primitive. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, k, context, grid_x            , line_offset, lines, pre);
+        if (likely(lines > 2))
+          intersect<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+
+      /*! Test if the ray is occluded by the primitive */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x            , line_offset, lines, pre)) return true;
+        if (likely(lines > 2))
+          if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false;
+      }
+    };
+
+    template<int K>
+    class GridSOAMBIntersectorK
+    {
+    public:
+      typedef void Primitive;
+      typedef typename GridSOAIntersectorK<K>::Precalculations Precalculations;
+
+      /*! Intersect a ray with the primitive. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        vfloat<K> vftime;
+        vint<K> vitime = getTimeSegment<K>(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
+
+        vbool<K> valid1 = valid_i;
+        while (any(valid1)) {
+          const size_t j = bsf(movemask(valid1));
+          const int itime = vitime[j];
+          const vbool<K> valid2 = valid1 & (itime == vitime);
+          valid1 = valid1 & !valid2;
+          intersect(valid2,pre,ray,vftime,itime,context,prim,lazy_node);
+        }
+      }
+
+      /*! Intersect a ray with the primitive. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t grid_offset   = pre.grid->gridBytes >> 2;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2;
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2;
+        for (size_t y=0; y<max_y; y++)        
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            size_t ofs00 = (y+0)*line_offset+(x+0);
+            size_t ofs01 = (y+0)*line_offset+(x+1);
+            size_t ofs10 = (y+1)*line_offset+(x+0);
+            size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            ofs00 += grid_offset;
+            ofs01 += grid_offset;
+            ofs10 += grid_offset;
+            ofs11 += grid_offset;
+            const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            const Vec3vf<K> p00 = lerp(a00,b00,ftime);
+            const Vec3vf<K> p01 = lerp(a01,b01,ftime);
+            const Vec3vf<K> p10 = lerp(a10,b10,ftime);
+            const Vec3vf<K> p11 = lerp(a11,b11,ftime);
+
+            pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+            pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+          }
+        }
+      }
+
+      /*! Test if the ray is occluded by the primitive */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        vfloat<K> vftime;
+        vint<K> vitime = getTimeSegment<K>(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
+
+        vbool<K> valid_o = valid_i;
+        vbool<K> valid1 = valid_i;
+        while (any(valid1)) {
+          const int j = int(bsf(movemask(valid1)));
+          const int itime = vitime[j];
+          const vbool<K> valid2 = valid1 & (itime == vitime);
+          valid1 = valid1 & !valid2;
+          valid_o &= !valid2 | occluded(valid2,pre,ray,vftime,itime,context,prim,lazy_node);
+        }
+        return !valid_o;
+      }
+
+      /*! Test if the ray is occluded by the primitive */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t grid_offset   = pre.grid->gridBytes >> 2;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        vbool<K> valid = valid_i;
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2;
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2;
+        for (size_t y=0; y<max_y; y++)        
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            size_t ofs00 = (y+0)*line_offset+(x+0);
+            size_t ofs01 = (y+0)*line_offset+(x+1);
+            size_t ofs10 = (y+1)*line_offset+(x+0);
+            size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            ofs00 += grid_offset;
+            ofs01 += grid_offset;
+            ofs10 += grid_offset;
+            ofs11 += grid_offset;
+            const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            const Vec3vf<K> p00 = lerp(a00,b00,ftime);
+            const Vec3vf<K> p01 = lerp(a01,b01,ftime);
+            const Vec3vf<K> p10 = lerp(a10,b10,ftime);
+            const Vec3vf<K> p11 = lerp(a11,b11,ftime);
+
+            pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+            if (none(valid)) break;
+            pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+            if (none(valid)) break;
+          }
+        }
+        return valid;
+      }
+
+      template<typename Loader>
+        static __forceinline void intersect(RayHitK<K>& ray, size_t k,
+                                            const float ftime,
+                                            IntersectContext* context,
+                                            const float* const grid_x,
+                                            const size_t line_offset,
+                                            const size_t lines,
+                                            Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2;
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      };
+
+      template<typename Loader>
+        static __forceinline bool occluded(RayK<K>& ray, size_t k,
+                                           const float ftime,
+                                           IntersectContext* context,
+                                           const float* const grid_x,
+                                           const size_t line_offset,
+                                           const size_t lines,
+                                           Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2;
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+
+      /*! Intersect a ray with the primitive. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      { 
+        float ftime;
+        int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
+
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre);
+        if (likely(lines > 2))
+          intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+
+      /*! Test if the ray is occluded by the primitive */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        float ftime;
+        int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
+
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre)) return true;
+        if (likely(lines > 2))
+          if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/instance.h b/thirdparty/embree/kernels/geometry/instance.h
new file mode 100644
index 0000000000..7c0e7e0f49
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/instance.h
@@ -0,0 +1,78 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene_instance.h"
+
+namespace embree
+{
+  struct InstancePrimitive
+  {
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return 1; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return N; }
+
+  public:
+
+    InstancePrimitive (const Instance* instance, unsigned int instID) 
+    : instance(instance) 
+    , instID_(instID)
+    {}
+
+    __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene)
+    {
+      assert(end-i == 1);
+      const PrimRef& prim = prims[i]; i++;
+      const unsigned int geomID = prim.geomID();
+      const Instance* instance = scene->get<Instance>(geomID);
+      new (this) InstancePrimitive(instance, geomID);
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime)
+    {
+      assert(end-i == 1);
+      const PrimRef& prim = prims[i]; i++;
+      const unsigned int geomID = prim.geomID();
+      const Instance* instance = scene->get<Instance>(geomID);
+      new (this) InstancePrimitive(instance,geomID);
+      return instance->linearBounds(0,itime);
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      assert(end-i == 1);
+      const PrimRefMB& prim = prims[i]; i++;
+      const unsigned int geomID = prim.geomID();
+      const Instance* instance = scene->get<Instance>(geomID);
+      new (this) InstancePrimitive(instance,geomID);
+      return instance->linearBounds(0,time_range);
+    }
+
+    /* Updates the primitive */
+    __forceinline BBox3fa update(Instance* instance) {
+      return instance->bounds(0);
+    }
+
+  public:
+    const Instance* instance;
+    const unsigned int instID_ = std::numeric_limits<unsigned int>::max ();
+  };
+}
diff --git a/thirdparty/embree/kernels/geometry/instance_intersector.h b/thirdparty/embree/kernels/geometry/instance_intersector.h
new file mode 100644
index 0000000000..28a7b728e5
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/instance_intersector.h
@@ -0,0 +1,84 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "instance.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct InstanceIntersector1
+    {
+      typedef InstancePrimitive Primitive;
+
+      struct Precalculations {
+        __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+      };
+      
+      static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+      static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+    };
+
+    struct InstanceIntersector1MB
+    {
+      typedef InstancePrimitive Primitive;
+
+      struct Precalculations {
+        __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+      };
+      
+      static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+      static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+    };
+
+    template<int K>
+      struct InstanceIntersectorK
+    {
+      typedef InstancePrimitive Primitive;
+      
+      struct Precalculations {
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+      };
+      
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+      }
+      
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+        return ray.tfar[k] < 0.0f; 
+      }
+    };
+
+    template<int K>
+      struct InstanceIntersectorKMB
+    {
+      typedef InstancePrimitive Primitive;
+      
+      struct Precalculations {
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+      };
+      
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+      }
+      
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+        return ray.tfar[k] < 0.0f; 
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/intersector_epilog.h b/thirdparty/embree/kernels/geometry/intersector_epilog.h
new file mode 100644
index 0000000000..7bf134cc54
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/intersector_epilog.h
@@ -0,0 +1,979 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/context.h"
+#include "filter.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct UVIdentity {
+      __forceinline void operator() (vfloat<M>& u, vfloat<M>& v, Vec3vf<M>& Ng) const {}
+    };
+
+
+    template<bool filter>
+    struct Intersect1Epilog1
+    {
+      RayHit& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Intersect1Epilog1(RayHit& ray,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+        hit.finalize();
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+            const float old_t = ray.tfar;
+            ray.tfar = hit.t;
+            bool found = runIntersectionFilter1(geometry,ray,context,h);
+            if (!found) ray.tfar = old_t;
+            return found;
+          }
+        }
+#endif
+
+        /* update hit information */
+        ray.tfar = hit.t;
+        ray.Ng = hit.Ng;
+        ray.u = hit.u;
+        ray.v = hit.v;
+        ray.primID = primID;
+        ray.geomID = geomID;
+        instance_id_stack::copy_UU(context->user->instID, ray.instID);
+        return true;
+      }
+    };
+
+    template<bool filter>
+    struct Occluded1Epilog1
+    {
+      Ray& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Occluded1Epilog1(Ray& ray,
+                                     IntersectContext* context,
+                                     const unsigned int geomID,
+                                     const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+        hit.finalize();
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) {
+            HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+            const float old_t = ray.tfar;
+            ray.tfar = hit.t;
+            const bool found = runOcclusionFilter1(geometry,ray,context,h);
+            if (!found) ray.tfar = old_t;
+            return found;
+          }
+        }
+#endif
+        return true;
+      }
+    };
+
+    template<int K, bool filter>
+    struct Intersect1KEpilog1
+    {
+      RayHitK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Intersect1KEpilog1(RayHitK<K>& ray, size_t k,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+        hit.finalize();
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+            const float old_t = ray.tfar[k];
+            ray.tfar[k] = hit.t;
+            const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h));
+            if (!found) ray.tfar[k] = old_t;
+            return found;
+          }
+        }
+#endif
+
+        /* update hit information */
+        ray.tfar[k] = hit.t;
+        ray.Ng.x[k] = hit.Ng.x;
+        ray.Ng.y[k] = hit.Ng.y;
+        ray.Ng.z[k] = hit.Ng.z;
+        ray.u[k] = hit.u;
+        ray.v[k] = hit.v;
+        ray.primID[k] = primID;
+        ray.geomID[k] = geomID;
+        instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k);
+        return true;
+      }
+    };
+    
+    template<int K, bool filter>
+    struct Occluded1KEpilog1
+    {
+      RayK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Occluded1KEpilog1(RayK<K>& ray, size_t k,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) {
+            hit.finalize();
+            HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+            const float old_t = ray.tfar[k];
+            ray.tfar[k] = hit.t;
+            const bool found = any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h));
+            if (!found) ray.tfar[k] = old_t;
+            return found;
+          }
+        }
+#endif 
+        return true;
+      }
+    };
+    
+    template<int M, bool filter>
+    struct Intersect1EpilogM
+    {
+      RayHit& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+
+      __forceinline Intersect1EpilogM(RayHit& ray,
+                                      IntersectContext* context,
+                                      const vuint<M>& geomIDs,
+                                      const vuint<M>& primIDs)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        vbool<M> valid = valid_i;
+        hit.finalize();
+        size_t i = select_min(valid,hit.vt);
+        unsigned int geomID = geomIDs[i];
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        bool foundhit = false;
+        goto entry;
+        while (true)
+        {
+          if (unlikely(none(valid))) return foundhit;
+          i = select_min(valid,hit.vt);
+
+          geomID = geomIDs[i];
+        entry:
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask) == 0) {
+            clear(valid,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION) 
+          /* call intersection filter function */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+              const Vec2f uv = hit.uv(i);
+              HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar;
+              ray.tfar = hit.t(i);
+              const bool found = runIntersectionFilter1(geometry,ray,context,h);
+              if (!found) ray.tfar = old_t;
+              foundhit |= found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+              continue;
+            }
+          }
+#endif
+          break;
+        }
+#endif
+
+        /* update hit information */
+        const Vec2f uv = hit.uv(i);
+        ray.tfar = hit.vt[i];
+        ray.Ng.x = hit.vNg.x[i];
+        ray.Ng.y = hit.vNg.y[i];
+        ray.Ng.z = hit.vNg.z[i];
+        ray.u = uv.x;
+        ray.v = uv.y;
+        ray.primID = primIDs[i];
+        ray.geomID = geomID;
+        instance_id_stack::copy_UU(context->user->instID, ray.instID);
+        return true;
+
+      }
+    };
+
+    template<int M, bool filter>
+    struct Occluded1EpilogM
+    {
+      Ray& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+
+      __forceinline Occluded1EpilogM(Ray& ray,
+                                     IntersectContext* context,
+                                     const vuint<M>& geomIDs,
+                                     const vuint<M>& primIDs)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        if (unlikely(filter))
+          hit.finalize(); /* called only once */
+
+        vbool<M> valid = valid_i;
+        size_t m=movemask(valid);
+        goto entry;
+        while (true)
+        {
+          if (unlikely(m == 0)) return false;
+        entry:
+          size_t i=bsf(m);
+
+          const unsigned int geomID = geomIDs[i];
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask) == 0) {
+            m=btc(m,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+          /* if we have no filter then the test passed */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+            {
+              const Vec2f uv = hit.uv(i);
+              HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar;
+              ray.tfar = hit.t(i);
+              if (runOcclusionFilter1(geometry,ray,context,h)) return true;
+              ray.tfar = old_t;
+              m=btc(m,i);
+              continue;
+            }
+          }
+#endif
+          break;
+        }
+#endif
+
+        return true;
+      }
+    };
+
+    template<int M, bool filter>
+    struct Intersect1EpilogMU
+    {
+      RayHit& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Intersect1EpilogMU(RayHit& ray,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+
+        vbool<M> valid = valid_i;
+        hit.finalize();
+
+        size_t i = select_min(valid,hit.vt);
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter()))
+        {
+          bool foundhit = false;
+          while (true)
+          {
+            /* call intersection filter function */
+            Vec2f uv = hit.uv(i);
+            const float old_t = ray.tfar;
+            ray.tfar = hit.t(i);
+            HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+            const bool found = runIntersectionFilter1(geometry,ray,context,h);
+            if (!found) ray.tfar = old_t;
+            foundhit |= found;
+            clear(valid,i);
+            valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+            if (unlikely(none(valid))) break;
+            i = select_min(valid,hit.vt);
+          }
+          return foundhit;
+        }
+#endif
+
+        /* update hit information */
+        const Vec2f uv = hit.uv(i);
+        const Vec3fa Ng = hit.Ng(i);
+        ray.tfar = hit.t(i);
+        ray.Ng.x = Ng.x;
+        ray.Ng.y = Ng.y;
+        ray.Ng.z = Ng.z;
+        ray.u = uv.x;
+        ray.v = uv.y;
+        ray.primID = primID;
+        ray.geomID = geomID;
+        instance_id_stack::copy_UU(context->user->instID, ray.instID);
+        return true;
+      }
+    };
+    
+    template<int M, bool filter>
+    struct Occluded1EpilogMU
+    {
+      Ray& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Occluded1EpilogMU(Ray& ray,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid, Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+        {
+          hit.finalize();
+          for (size_t m=movemask(valid), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m))
+          {
+            const Vec2f uv = hit.uv(i);
+            const float old_t = ray.tfar;
+            ray.tfar = hit.t(i);
+            HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+            if (runOcclusionFilter1(geometry,ray,context,h)) return true;
+            ray.tfar = old_t;
+          }
+          return false;
+        }
+#endif
+        return true;
+      }
+    };
+        
+    template<int M, int K, bool filter>
+    struct IntersectKEpilogM
+    {
+      RayHitK<K>& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+      const size_t i;
+
+      __forceinline IntersectKEpilogM(RayHitK<K>& ray,
+                                      IntersectContext* context,
+                                     const vuint<M>& geomIDs,
+                                     const vuint<M>& primIDs,
+                                     size_t i)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {}
+
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+
+        vfloat<K> u, v, t;
+        Vec3vf<K> Ng;
+        vbool<K> valid = valid_i;
+
+        std::tie(u,v,t,Ng) = hit();
+
+        const unsigned int geomID = geomIDs[i];
+        const unsigned int primID = primIDs[i];
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+        /* ray masking test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return false;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar;
+            ray.tfar = select(valid,t,ray.tfar);
+            const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(m_accept,ray.tfar,old_t);
+            return m_accept;
+          }
+        }
+#endif
+
+        /* update hit information */
+        vfloat<K>::store(valid,&ray.tfar,t);
+        vfloat<K>::store(valid,&ray.Ng.x,Ng.x);
+        vfloat<K>::store(valid,&ray.Ng.y,Ng.y);
+        vfloat<K>::store(valid,&ray.Ng.z,Ng.z);
+        vfloat<K>::store(valid,&ray.u,u);
+        vfloat<K>::store(valid,&ray.v,v);
+        vuint<K>::store(valid,&ray.primID,primID);
+        vuint<K>::store(valid,&ray.geomID,geomID);
+        instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, valid);
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct OccludedKEpilogM
+    {
+      vbool<K>& valid0;
+      RayK<K>& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+      const size_t i;
+
+      __forceinline OccludedKEpilogM(vbool<K>& valid0,
+                                     RayK<K>& ray,
+                                     IntersectContext* context,
+                                     const vuint<M>& geomIDs,
+                                     const vuint<M>& primIDs,
+                                     size_t i)
+        : valid0(valid0), ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {}
+
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+      {
+        vbool<K> valid = valid_i;
+
+        /* ray masking test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        const unsigned int geomID MAYBE_UNUSED = geomIDs[i];
+        const unsigned int primID MAYBE_UNUSED = primIDs[i];
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return valid;
+#endif
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+          {
+            vfloat<K> u, v, t;
+            Vec3vf<K> Ng;
+            std::tie(u,v,t,Ng) = hit();
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar;
+            ray.tfar = select(valid,t,ray.tfar);
+            valid = runOcclusionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(valid,ray.tfar,old_t);
+          }
+        }
+#endif
+
+        /* update occlusion */
+        valid0 = valid0 & !valid;
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct IntersectKEpilogMU
+    {
+      RayHitK<K>& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline IntersectKEpilogMU(RayHitK<K>& ray,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_org, const Hit& hit) const
+      {
+        vbool<K> valid = valid_org;
+        vfloat<K> u, v, t;
+        Vec3vf<K> Ng;
+        std::tie(u,v,t,Ng) = hit();
+
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+        /* ray masking test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return false;
+#endif
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar;
+            ray.tfar = select(valid,t,ray.tfar);
+            const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(m_accept,ray.tfar,old_t);
+            return m_accept;
+          }
+        }
+#endif
+
+        /* update hit information */
+        vfloat<K>::store(valid,&ray.tfar,t);
+        vfloat<K>::store(valid,&ray.Ng.x,Ng.x);
+        vfloat<K>::store(valid,&ray.Ng.y,Ng.y);
+        vfloat<K>::store(valid,&ray.Ng.z,Ng.z);
+        vfloat<K>::store(valid,&ray.u,u);
+        vfloat<K>::store(valid,&ray.v,v);
+        vuint<K>::store(valid,&ray.primID,primID);
+        vuint<K>::store(valid,&ray.geomID,geomID);
+        instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, valid);
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct OccludedKEpilogMU
+    {
+      vbool<K>& valid0;
+      RayK<K>& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline OccludedKEpilogMU(vbool<K>& valid0,
+                                      RayK<K>& ray,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : valid0(valid0), ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+      {
+        vbool<K> valid = valid_i;
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return false;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+          {
+            vfloat<K> u, v, t;
+            Vec3vf<K> Ng;
+            std::tie(u,v,t,Ng) = hit();
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar;
+            ray.tfar = select(valid,t,ray.tfar);
+            valid = runOcclusionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(valid,ray.tfar,old_t);
+          }
+        }
+#endif
+
+        /* update occlusion */
+        valid0 = valid0 & !valid;
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct Intersect1KEpilogM
+    {
+      RayHitK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+
+      __forceinline Intersect1KEpilogM(RayHitK<K>& ray, size_t k,
+                                       IntersectContext* context,
+                                       const vuint<M>& geomIDs,
+                                       const vuint<M>& primIDs)
+        : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        vbool<M> valid = valid_i;
+        hit.finalize();
+        size_t i = select_min(valid,hit.vt);
+        assert(i<M);
+        unsigned int geomID = geomIDs[i];
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        bool foundhit = false;
+        goto entry;
+        while (true)
+        {
+          if (unlikely(none(valid))) return foundhit;
+          i = select_min(valid,hit.vt);
+          assert(i<M);
+          geomID = geomIDs[i];
+        entry:
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask[k]) == 0) {
+            clear(valid,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION) 
+          /* call intersection filter function */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+              assert(i<M);
+              const Vec2f uv = hit.uv(i);
+              HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i);
+              const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h));
+              if (!found) ray.tfar[k] = old_t;
+              foundhit = foundhit | found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value
+              continue;
+            }
+          }
+#endif
+          break;
+        }
+#endif
+        assert(i<M);
+        /* update hit information */
+        const Vec2f uv = hit.uv(i);
+        ray.tfar[k] = hit.t(i);
+        ray.Ng.x[k] = hit.vNg.x[i];
+        ray.Ng.y[k] = hit.vNg.y[i];
+        ray.Ng.z[k] = hit.vNg.z[i];
+        ray.u[k] = uv.x;
+        ray.v[k] = uv.y;
+        ray.primID[k] = primIDs[i];
+        ray.geomID[k] = geomID;
+        instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k);
+        return true;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct Occluded1KEpilogM
+    {
+      RayK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+
+      __forceinline Occluded1KEpilogM(RayK<K>& ray, size_t k,
+                                      IntersectContext* context,
+                                      const vuint<M>& geomIDs,
+                                      const vuint<M>& primIDs)
+        : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        if (unlikely(filter))
+          hit.finalize(); /* called only once */
+
+        vbool<M> valid = valid_i;
+        size_t m=movemask(valid);
+        goto entry;
+        while (true)
+        {
+          if (unlikely(m == 0)) return false;
+        entry:
+          size_t i=bsf(m);
+
+          const unsigned int geomID = geomIDs[i];
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask[k]) == 0) {
+            m=btc(m,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+          /* execute occlusion filer */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+            {
+              const Vec2f uv = hit.uv(i);
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i);
+              HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true;
+              ray.tfar[k] = old_t;
+              m=btc(m,i);
+              continue;
+            }
+          }
+#endif
+          break;
+        }
+#endif
+        return true;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct Intersect1KEpilogMU
+    {
+      RayHitK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Intersect1KEpilogMU(RayHitK<K>& ray, size_t k,
+                                        IntersectContext* context,
+                                        const unsigned int geomID,
+                                        const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* ray mask test */
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+
+        /* finalize hit calculation */
+        vbool<M> valid = valid_i;
+        hit.finalize();
+        size_t i = select_min(valid,hit.vt);
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter()))
+          {
+            bool foundhit = false;
+            while (true)
+            {
+              const Vec2f uv = hit.uv(i);
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i);
+              HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+              const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h));
+              if (!found) ray.tfar[k] = old_t;
+              foundhit = foundhit | found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value
+              if (unlikely(none(valid))) break;
+              i = select_min(valid,hit.vt);
+            }
+            return foundhit;
+          }
+        }
+#endif
+
+        /* update hit information */
+        const Vec2f uv = hit.uv(i);
+        const Vec3fa Ng = hit.Ng(i);
+        ray.tfar[k] = hit.t(i);
+        ray.Ng.x[k] = Ng.x;
+        ray.Ng.y[k] = Ng.y;
+        ray.Ng.z[k] = Ng.z;
+        ray.u[k] = uv.x;
+        ray.v[k] = uv.y;
+        ray.primID[k] = primID;
+        ray.geomID[k] = geomID;
+        instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k);
+        return true;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct Occluded1KEpilogMU
+    {
+      RayK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Occluded1KEpilogMU(RayK<K>& ray, size_t k,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* ray mask test */
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+          {
+            hit.finalize();
+            for (size_t m=movemask(valid_i), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m))
+            {
+              const Vec2f uv = hit.uv(i);
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i);
+              HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+              if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true;
+              ray.tfar[k] = old_t;
+            }
+            return false;
+          }
+        }
+#endif 
+        return true;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/intersector_iterators.h b/thirdparty/embree/kernels/geometry/intersector_iterators.h
new file mode 100644
index 0000000000..9cac1cd25c
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/intersector_iterators.h
@@ -0,0 +1,172 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+#include "../bvh/node_intersector1.h"
+#include "../bvh/node_intersector_packet.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Intersector>
+    struct ArrayIntersector1
+    {
+      typedef typename Intersector::Primitive Primitive;
+      typedef typename Intersector::Precalculations Precalculations;
+
+      template<int N, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        for (size_t i=0; i<num; i++)
+          Intersector::intersect(pre,ray,context,prim[i]);
+      }
+
+      template<int N, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        for (size_t i=0; i<num; i++) {
+          if (Intersector::occluded(pre,ray,context,prim[i]))
+            return true;
+        }
+        return false;
+      }
+      
+      template<int N>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        bool changed = false;
+        for (size_t i=0; i<num; i++)
+          changed |= Intersector::pointQuery(query, context, prim[i]);
+        return changed;
+      }
+
+      template<int K>
+      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+      }
+
+      template<int K>
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+        return valid;
+      }
+    };
+
+    template<int K, typename Intersector>
+    struct ArrayIntersectorK_1
+    {
+      typedef typename Intersector::Primitive Primitive;
+      typedef typename Intersector::Precalculations Precalculations;
+
+      template<bool robust>
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        for (size_t i=0; i<num; i++) {
+          Intersector::intersect(valid,pre,ray,context,prim[i]);
+        }
+      }
+
+      template<bool robust>
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        vbool<K> valid0 = valid;
+        for (size_t i=0; i<num; i++) {
+          valid0 &= !Intersector::occluded(valid0,pre,ray,context,prim[i]);
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+
+      template<int N, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        for (size_t i=0; i<num; i++) {
+          Intersector::intersect(pre,ray,k,context,prim[i]);
+        }
+      }
+
+      template<int N, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        for (size_t i=0; i<num; i++) {
+          if (Intersector::occluded(pre,ray,k,context,prim[i]))
+            return true;
+        }
+        return false;
+      }
+    };
+
+    // =============================================================================================
+
+    template<int K, typename IntersectorK>
+    struct ArrayIntersectorKStream
+    {
+      typedef typename IntersectorK::Primitive PrimitiveK;
+      typedef typename IntersectorK::Precalculations PrecalculationsK;
+
+      static __forceinline void intersectK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
+
+        for (size_t i=0; i<num; i++) {
+          IntersectorK::intersect(valid,pre,ray,context,prim[i]);
+        }
+      }
+
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
+        vbool<K> valid0 = valid;
+        for (size_t i=0; i<num; i++) {
+          valid0 &= !IntersectorK::occluded(valid0,pre,ray,context,prim[i]);
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+
+      static __forceinline void intersect(const Accel::Intersectors* This, RayHitK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+        for (size_t i=0; i<num; i++) {
+          IntersectorK::intersect(pre,ray,k,context,prim[i]);
+        }
+      }
+
+      static __forceinline bool occluded(const Accel::Intersectors* This, RayK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+        for (size_t i=0; i<num; i++) {
+          if (IntersectorK::occluded(pre,ray,k,context,prim[i]))
+            return true;
+        }
+        return false;
+      }
+
+      static __forceinline size_t occluded(const Accel::Intersectors* This, size_t cur_mask, RayK<K>** __restrict__ inputPackets, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        size_t m_occluded = 0;
+        for (size_t i=0; i<num; i++) {
+          size_t bits = cur_mask & (~m_occluded);
+          for (; bits!=0; )
+          {
+            const size_t rayID = bscf(bits);
+            RayHitK<K> &ray = *inputPackets[rayID / K];
+            const size_t k = rayID % K;
+            PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+            if (IntersectorK::occluded(pre,ray,k,context,prim[i]))
+            {
+              m_occluded |= (size_t)1 << rayID;
+              ray.tfar[k] = neg_inf;
+            }
+          }
+        }
+        return m_occluded;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/line_intersector.h b/thirdparty/embree/kernels/geometry/line_intersector.h
new file mode 100644
index 0000000000..41096d8794
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/line_intersector.h
@@ -0,0 +1,145 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct LineIntersectorHitM
+      {
+        __forceinline LineIntersectorHitM() {}
+
+        __forceinline LineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+        
+        __forceinline void finalize() {}
+        
+        __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+        __forceinline float t  (const size_t i) const { return vt[i]; }
+        __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+        __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); }
+        __forceinline vfloat<M> t () const { return vt; }
+        __forceinline Vec3vf<M> Ng() const { return vNg; }
+        
+      public:
+        vfloat<M> vu;
+        vfloat<M> vv;
+        vfloat<M> vt;
+        Vec3vf<M> vNg;
+      };
+    
+    template<int M>
+      struct FlatLinearCurveIntersector1
+      {
+        typedef CurvePrecalculations1 Precalculations;
+        
+        template<typename Ray, typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            Ray& ray,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Epilog& epilog)
+        {
+          /* transform end points into ray space */
+          vbool<M> valid = valid_i;
+          vfloat<M> depth_scale = pre.depth_scale;
+          LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space;
+
+          const Vec3vf<M> ray_org ((Vec3fa)ray.org);
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i);
+          
+          Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w);
+          Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w);
+          
+          /* approximative intersection with cone */
+          const Vec4vf<M> v = p1-p0;
+          const Vec4vf<M> w = -p0;
+          const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y);
+          const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y);
+          const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one));
+          const Vec4vf<M> p = madd(u,v,p0);
+          const vfloat<M> t = p.z;
+          const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y);
+          const vfloat<M> r = p.w;
+          const vfloat<M> r2 = r*r;
+          valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar));
+          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) 
+            valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections
+          if (unlikely(none(valid))) return false;
+          
+          /* ignore denormalized segments */
+          const Vec3vf<M> T = v1.xyz()-v0.xyz();
+          valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero));
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information */
+          LineIntersectorHitM<M> hit(u,zero,t,T);
+          return epilog(valid,hit);
+        }
+      };
+    
+    template<int M, int K>
+      struct FlatLinearCurveIntersectorK
+      {
+        typedef CurvePrecalculationsK<K> Precalculations;
+        
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            RayK<K>& ray, size_t k,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Epilog& epilog)
+        {
+          /* transform end points into ray space */
+          vbool<M> valid = valid_i;
+          vfloat<M> depth_scale = pre.depth_scale[k];
+          LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space[k];
+          const Vec3vf<M> ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+          const Vec3vf<M> ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i);
+          
+          Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w);
+          Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w);
+          
+          /* approximative intersection with cone */
+          const Vec4vf<M> v = p1-p0;
+          const Vec4vf<M> w = -p0;
+          const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y);
+          const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y);
+          const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one));
+          const Vec4vf<M> p = madd(u,v,p0);
+          const vfloat<M> t = p.z;
+          const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y);
+          const vfloat<M> r = p.w;
+          const vfloat<M> r2 = r*r;
+          valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k]));
+          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) 
+            valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections
+          if (unlikely(none(valid))) return false;
+          
+          /* ignore denormalized segments */
+          const Vec3vf<M> T = v1.xyz()-v0.xyz();
+          valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero));
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information */
+          LineIntersectorHitM<M> hit(u,zero,t,T);
+          return epilog(valid,hit);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/linei.h b/thirdparty/embree/kernels/geometry/linei.h
new file mode 100644
index 0000000000..3ee70ac012
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/linei.h
@@ -0,0 +1,705 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  template<int M>
+  struct LineMi
+  {
+    /* Virtual interface to query information about the line segment type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;      
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored line segments */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N line segments */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+    /* Returns required number of bytes for N line segments */
+    static __forceinline size_t bytes(size_t N) { return blocks(N)*sizeof(LineMi); }
+
+  public:
+
+    /* Default constructor */
+    __forceinline LineMi() {  }
+
+    /* Construction from vertices and IDs */
+    __forceinline LineMi(const vuint<M>& v0, unsigned short leftExists, unsigned short rightExists, const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype)
+      : gtype((unsigned char)gtype), m((unsigned char)popcnt(vuint<M>(primIDs) != vuint<M>(-1))), sharedGeomID(geomIDs[0]), leftExists (leftExists), rightExists(rightExists), v0(v0), primIDs(primIDs)
+    {
+      assert(all(vuint<M>(geomID()) == geomIDs));
+    }
+
+    /* Returns a mask that tells which line segments are valid */
+    __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+    /* Returns if the specified line segment is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+    /* Returns the number of stored line segments */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    //template<class T>
+    //static __forceinline T unmask(T &index) { return index & 0x3fffffff; }
+
+    __forceinline     unsigned int geomID(unsigned int i = 0) const { return sharedGeomID; }
+    //__forceinline       vuint<M> geomID()       { return unmask(geomIDs); }
+    //__forceinline const vuint<M> geomID() const { return unmask(geomIDs); }
+    //__forceinline unsigned int geomID(const size_t i) const { assert(i<M); return unmask(geomIDs[i]); }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* gather the line segments */
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              const LineSegments* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0,
+                               Vec4vf<M>& p1,
+                               const LineSegments* geom,
+                               const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              const LineSegments* geom,
+                              float time) const;
+
+    /* gather the line segments with lateral info */
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              Vec4vf<M>& pL,
+                              Vec4vf<M>& pR,
+                              const LineSegments* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0,
+                               Vec4vf<M>& p1,
+                               Vec4vf<M>& pL,
+                               Vec4vf<M>& pR,
+                               const LineSegments* geom,
+                               const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              Vec4vf<M>& pL,
+                              Vec4vf<M>& pR,
+                              const LineSegments* geom,
+                              float time) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              vbool<M>& cL,
+                              vbool<M>& cR,
+                              const LineSegments* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0,
+                               Vec4vf<M>& p1,
+                               vbool<M>& cL,
+                               vbool<M>& cR,
+                               const LineSegments* geom,
+                               const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              vbool<M>& cL,
+                              vbool<M>& cR,
+                              const LineSegments* geom,
+                              float time) const;
+
+    /* Calculate the bounds of the line segments */
+    __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(geomID(i));
+        const Vec3ff& p0 = geom->vertex(v0[i]+0,itime);
+        const Vec3ff& p1 = geom->vertex(v0[i]+1,itime);
+        BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1));
+        b = enlarge(b,Vec3fa(max(p0.w,p1.w)));
+        bounds.extend(b);
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive */
+    __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) {
+      return LBBox3fa(bounds(scene,itime+0), bounds(scene,itime+1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(geomID(i));
+        allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return allBounds;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) 
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(geomID((unsigned int)i));
+        allBounds.extend(geom->linearBounds(primID(i), time_range));
+      }
+      return allBounds;
+    }
+
+    /* Fill line segment from line segment list */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      Geometry::GType gty = scene->get(prims[begin].geomID())->getType();
+      vuint<M> geomID, primID;
+      vuint<M> v0;
+      unsigned short leftExists = 0;
+      unsigned short rightExists = 0;
+      const PrimRefT* prim = &prims[begin];
+
+      for (size_t i=0; i<M; i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(prim->geomID());
+        if (begin<end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+          v0[i] = geom->segment(prim->primID());
+          leftExists |= geom->segmentLeftExists(primID[i]) << i;
+          rightExists |= geom->segmentRightExists(primID[i]) << i;         
+          begin++;
+        } else {
+          assert(i);
+          if (i>0) {
+            geomID[i] = geomID[i-1];
+            primID[i] = -1;
+            v0[i] = v0[i-1];
+          }
+        }
+        if (begin<end) prim = &prims[begin]; // FIXME: remove this line
+      }
+      new (this) LineMi(v0,leftExists,rightExists,geomID,primID,gty); // FIXME: use non temporal store
+    }
+
+     template<typename BVH, typename Allocator>
+      __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+    {
+      size_t start = set.begin();
+      size_t items = LineMi::blocks(set.size());
+      size_t numbytes = LineMi::bytes(set.size());
+      LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float));
+      for (size_t i=0; i<items; i++) {
+        accel[i].fill(prims,start,set.end(),bvh->scene);
+      }
+      return bvh->encodeLeaf((char*)accel,items);
+    };
+    
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      fill(prims,begin,end,scene);
+      return linearBounds(scene,itime);
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      fill(prims,begin,end,scene);
+      return linearBounds(scene,time_range);
+    }
+
+      template<typename BVH, typename SetMB, typename Allocator>
+    __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+    {
+      size_t start = prims.begin();
+      size_t end   = prims.end();
+      size_t items = LineMi::blocks(prims.size());
+      size_t numbytes = LineMi::bytes(prims.size());
+      LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float));
+      const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items);
+      
+      LBBox3fa bounds = empty;
+      for (size_t i=0; i<items; i++)
+        bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range));
+      
+      return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range);
+    };
+
+    /* Updates the primitive */
+    __forceinline BBox3fa update(LineSegments* geom)
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const Vec3ff& p0 = geom->vertex(v0[i]+0);
+        const Vec3ff& p1 = geom->vertex(v0[i]+1);
+        BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1));
+        b = enlarge(b,Vec3fa(max(p0.w,p1.w)));
+        bounds.extend(b);
+      }
+      return bounds;
+    }
+
+    /*! output operator */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const LineMi& line) {
+      return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}";
+    }
+    
+  public:
+    unsigned char gtype;
+    unsigned char m;
+    unsigned int sharedGeomID;
+    unsigned short leftExists, rightExists;
+    vuint<M> v0;      // index of start vertex
+  private:
+    vuint<M> primIDs; // primitive ID
+  };
+
+  template<>
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                         Vec4vf4& p1,
+                                         const LineSegments* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w);
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<>
+  __forceinline void LineMi<4>::gatheri(Vec4vf4& p0,
+                                       Vec4vf4& p1,
+                                       const LineSegments* geom,
+                                       const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w);
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<>
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                         Vec4vf4& p1,
+                                         const LineSegments* geom,
+                                         float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+
+    Vec4vf4 a0,a1;
+    gatheri(a0,a1,geom,itime);
+    Vec4vf4 b0,b1;
+    gatheri(b0,b1,geom,itime+1);
+    p0 = lerp(a0,b0,vfloat4(ftime));
+    p1 = lerp(a1,b1,vfloat4(ftime));
+  }
+
+  template<>
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                         Vec4vf4& p1,
+                                         vbool4&  cL,
+                                         vbool4&  cR,
+                                         const LineSegments* geom) const
+  {
+    gather(p0,p1,geom);
+    cL = !vbool4(leftExists);
+    cR = !vbool4(rightExists);
+  }
+
+  template<>
+    __forceinline void LineMi<4>::gatheri(Vec4vf4& p0,
+                                          Vec4vf4& p1,
+                                          vbool4&  cL,
+                                          vbool4&  cR,
+                                          const LineSegments* geom,
+                                          const int itime) const
+  {
+    gatheri(p0,p1,geom,itime);
+    cL = !vbool4(leftExists);
+    cR = !vbool4(rightExists);
+  }
+
+  template<>
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                         Vec4vf4& p1,
+                                         vbool4&  cL,
+                                         vbool4&  cR,
+                                         const LineSegments* geom,
+                                         float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+    
+    Vec4vf4 a0,a1;
+    gatheri(a0,a1,geom,itime);
+    Vec4vf4 b0,b1;
+    gatheri(b0,b1,geom,itime+1);
+    p0 = lerp(a0,b0,vfloat4(ftime));
+    p1 = lerp(a1,b1,vfloat4(ftime));
+    cL = !vbool4(leftExists);
+    cR = !vbool4(rightExists);
+  }
+
+  template<>
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                              Vec4vf4& p1,
+                                              Vec4vf4& pL,
+                                              Vec4vf4& pR,
+                                              const LineSegments* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w);
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+    
+    const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf);
+    const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf);
+    const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf);
+    const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf);
+    transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w);
+    
+    const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf);
+    const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf);
+    const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf);
+    const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf);
+    transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<>
+    __forceinline void LineMi<4>::gatheri(Vec4vf4& p0,
+                                              Vec4vf4& p1,
+                                              Vec4vf4& pL,
+                                              Vec4vf4& pR,
+                                              const LineSegments* geom,
+                                              const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w);
+    
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+    
+    const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf);
+    const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf);
+    const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf);
+    const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf);
+    transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w);
+    
+    const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf);
+    const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf);
+    const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf);
+    const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf);
+    transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<>
+    __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+                                              Vec4vf4& p1,
+                                              Vec4vf4& pL,
+                                              Vec4vf4& pR,
+                                              const LineSegments* geom,
+                                              float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+    
+    Vec4vf4 a0,a1,aL,aR;
+    gatheri(a0,a1,aL,aR,geom,itime);
+    Vec4vf4 b0,b1,bL,bR;
+    gatheri(b0,b1,bL,bR,geom,itime+1);
+    p0 = lerp(a0,b0,vfloat4(ftime));
+    p1 = lerp(a1,b1,vfloat4(ftime));
+    pL = lerp(aL,bL,vfloat4(ftime));
+    pR = lerp(aR,bR,vfloat4(ftime));
+  }
+
+#if defined(__AVX__)
+
+  template<>
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                         Vec4vf8& p1,
+                                         const LineSegments* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4]));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5]));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6]));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7]));
+    transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w);
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1));
+    const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1));
+    const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1));
+    const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1));
+    transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<>
+  __forceinline void LineMi<8>::gatheri(Vec4vf8& p0,
+                                       Vec4vf8& p1,
+                                       const LineSegments* geom,
+                                       const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime));
+    transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w);
+
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime));
+    const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime));
+    const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime));
+    const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime));
+    transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<>
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                         Vec4vf8& p1,
+                                         const LineSegments* geom,
+                                         float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+
+    Vec4vf8 a0,a1;
+    gatheri(a0,a1,geom,itime);
+    Vec4vf8 b0,b1;
+    gatheri(b0,b1,geom,itime+1);
+    p0 = lerp(a0,b0,vfloat8(ftime));
+    p1 = lerp(a1,b1,vfloat8(ftime));
+  }
+  
+  template<>
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              Vec4vf8& pL,
+                                              Vec4vf8& pR,
+                                              const LineSegments* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4]));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5]));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6]));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7]));
+    transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w);
+    
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1));
+    const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1));
+    const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1));
+    const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1));
+    transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+    
+    const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf);
+    const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf);
+    const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf);
+    const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf);
+    const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1)) : vfloat4(inf);
+    const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1)) : vfloat4(inf);
+    const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1)) : vfloat4(inf);
+    const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1)) : vfloat4(inf);
+    transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w);
+    
+    const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf);
+    const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf);
+    const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf);
+    const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf);
+    const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2)) : vfloat4(inf);
+    const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2)) : vfloat4(inf);
+    const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2)) : vfloat4(inf);
+    const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2)) : vfloat4(inf);
+    transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<>
+    __forceinline void LineMi<8>::gatheri(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              Vec4vf8& pL,
+                                              Vec4vf8& pR,
+                                              const LineSegments* geom,
+                                              const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime));
+    transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w);
+    
+    const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+    const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime));
+    const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime));
+    const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime));
+    const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime));
+    transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+    
+    const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf);
+    const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf);
+    const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf);
+    const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf);
+    const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1,itime)) : vfloat4(inf);
+    const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1,itime)) : vfloat4(inf);
+    const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1,itime)) : vfloat4(inf);
+    const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1,itime)) : vfloat4(inf);
+    transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w);
+    
+    const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf);
+    const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf);
+    const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf);
+    const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf);
+    const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2,itime)) : vfloat4(inf);
+    const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2,itime)) : vfloat4(inf);
+    const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2,itime)) : vfloat4(inf);
+    const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2,itime)) : vfloat4(inf);
+    transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<>
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              Vec4vf8& pL,
+                                              Vec4vf8& pR,
+                                              const LineSegments* geom,
+                                              float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+    
+    Vec4vf8 a0,a1,aL,aR;
+    gatheri(a0,a1,aL,aR,geom,itime);
+    Vec4vf8 b0,b1,bL,bR;
+    gatheri(b0,b1,bL,bR,geom,itime+1);
+    p0 = lerp(a0,b0,vfloat8(ftime));
+    p1 = lerp(a1,b1,vfloat8(ftime));
+    pL = lerp(aL,bL,vfloat8(ftime));
+    pR = lerp(aR,bR,vfloat8(ftime));
+  }
+
+  template<>
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                         Vec4vf8& p1,
+                                         vbool8& cL,
+                                         vbool8& cR,
+                                         const LineSegments* geom) const
+  {
+    gather(p0,p1,geom);
+    cL = !vbool8(leftExists);
+    cR = !vbool8(rightExists);
+  }
+  
+  template<>
+    __forceinline void LineMi<8>::gatheri(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              vbool8& cL,
+                                              vbool8& cR,
+                                              const LineSegments* geom,
+                                              const int itime) const
+  {
+    gatheri(p0,p1,geom,itime);
+    cL = !vbool8(leftExists);
+    cR = !vbool8(rightExists);
+  }
+  
+  template<>
+    __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+                                              Vec4vf8& p1,
+                                              vbool8& cL,
+                                              vbool8& cR,
+                                              const LineSegments* geom,
+                                              float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+    
+    Vec4vf8 a0,a1;
+    gatheri(a0,a1,geom,itime);
+    Vec4vf8 b0,b1;
+    gatheri(b0,b1,geom,itime+1);
+    p0 = lerp(a0,b0,vfloat8(ftime));
+    p1 = lerp(a1,b1,vfloat8(ftime));
+    cL = !vbool8(leftExists);
+    cR = !vbool8(rightExists);
+  }
+  
+#endif
+  
+  template<int M>
+  typename LineMi<M>::Type LineMi<M>::type;
+
+  typedef LineMi<4> Line4i;
+  typedef LineMi<8> Line8i;
+}
diff --git a/thirdparty/embree/kernels/geometry/linei_intersector.h b/thirdparty/embree/kernels/geometry/linei_intersector.h
new file mode 100644
index 0000000000..5992827f5b
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/linei_intersector.h
@@ -0,0 +1,124 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "linei.h"
+#include "line_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, bool filter>
+    struct FlatLinearCurveMiIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+        const vbool<M> valid = line.valid();
+        FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+        const vbool<M> valid = line.valid();
+        return FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, bool filter>
+    struct FlatLinearCurveMiMBIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time());
+        const vbool<M> valid = line.valid();
+        FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time());
+        const vbool<M> valid = line.valid();
+        return FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct FlatLinearCurveMiIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+        const vbool<M> valid = line.valid();
+        FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+        const vbool<M> valid = line.valid();
+        return FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct FlatLinearCurveMiMBIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]);
+        const vbool<M> valid = line.valid();
+        FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]);
+        const vbool<M> valid = line.valid();
+        return FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/object.h b/thirdparty/embree/kernels/geometry/object.h
new file mode 100644
index 0000000000..2a61829ffd
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/object.h
@@ -0,0 +1,84 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  struct Object
+  {
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return 1; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return N; }
+
+  public:
+
+    /*! constructs a virtual object */
+    Object (unsigned geomID, unsigned primID) 
+    : _geomID(geomID), _primID(primID) {}
+
+    __forceinline unsigned geomID() const {
+      return _geomID;
+    }
+
+    __forceinline unsigned primID() const {
+      return _primID;
+    }
+
+    /*! fill triangle from triangle list */
+    __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene)
+    {
+      const PrimRef& prim = prims[i]; i++;
+      new (this) Object(prim.geomID(), prim.primID());
+    }
+
+    /*! fill triangle from triangle list */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime)
+    {
+      const PrimRef& prim = prims[i]; i++;
+      const unsigned geomID = prim.geomID();
+      const unsigned primID = prim.primID();
+      new (this) Object(geomID, primID);
+      AccelSet* accel = (AccelSet*) scene->get(geomID);
+      return accel->linearBounds(primID,itime);
+    }
+
+    /*! fill triangle from triangle list */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      const PrimRefMB& prim = prims[i]; i++;
+      const unsigned geomID = prim.geomID();
+      const unsigned primID = prim.primID();
+      new (this) Object(geomID, primID);
+      AccelSet* accel = (AccelSet*) scene->get(geomID);
+      return accel->linearBounds(primID,time_range);
+    }
+
+    /* Updates the primitive */
+    __forceinline BBox3fa update(AccelSet* mesh) {
+      return mesh->bounds(primID());
+    }
+
+  private:
+    unsigned int _geomID;  //!< geometry ID
+    unsigned int _primID;  //!< primitive ID
+  };
+}
diff --git a/thirdparty/embree/kernels/geometry/object_intersector.h b/thirdparty/embree/kernels/geometry/object_intersector.h
new file mode 100644
index 0000000000..11ceb2f7fe
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/object_intersector.h
@@ -0,0 +1,127 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "object.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<bool mblur>
+    struct ObjectIntersector1
+    {
+      typedef Object Primitive;
+     
+      static const bool validIntersectorK = false;
+
+      struct Precalculations {
+        __forceinline Precalculations() {}
+        __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+      };
+      
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) 
+      {
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        if ((ray.mask & accel->mask) == 0) 
+          return;
+#endif
+
+        accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1);
+      }
+      
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        if ((ray.mask & accel->mask) == 0) 
+          return false;
+#endif
+
+        accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        return ray.tfar < 0.0f;
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim)
+      {
+        AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID());
+        context->geomID = prim.geomID();
+        context->primID = prim.primID();
+        return accel->pointQuery(query, context);
+      }
+      
+      template<int K>
+      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+        assert(false);
+      }
+
+      template<int K>
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+        assert(false);
+        return valid;
+      }
+    };
+
+    template<int K, bool mblur>
+    struct ObjectIntersectorK
+    {
+      typedef Object Primitive;
+      
+      struct Precalculations {
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+      };
+      
+      static __forceinline void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vbool<K> valid = valid_i;
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+        
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (ray.mask & accel->mask) != 0;
+        if (none(valid)) return;
+#endif
+        accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1);
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vbool<K> valid = valid_i;
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+        
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (ray.mask & accel->mask) != 0;
+        if (none(valid)) return false;
+#endif
+        accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        return ray.tfar < 0.0f;
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+      }
+      
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+        return ray.tfar[k] < 0.0f; 
+      }
+    };
+
+    typedef ObjectIntersectorK<4,false>  ObjectIntersector4;
+    typedef ObjectIntersectorK<8,false>  ObjectIntersector8;
+    typedef ObjectIntersectorK<16,false> ObjectIntersector16;
+
+    typedef ObjectIntersectorK<4,true>  ObjectIntersector4MB;
+    typedef ObjectIntersectorK<8,true>  ObjectIntersector8MB;
+    typedef ObjectIntersectorK<16,true> ObjectIntersector16MB;
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/plane.h b/thirdparty/embree/kernels/geometry/plane.h
new file mode 100644
index 0000000000..e447122eab
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/plane.h
@@ -0,0 +1,57 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct HalfPlane
+    {
+      const Vec3fa P;  //!< plane origin
+      const Vec3fa N;  //!< plane normal
+
+      __forceinline HalfPlane(const Vec3fa& P, const Vec3fa& N) 
+        : P(P), N(N) {}
+      
+      __forceinline BBox1f intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const
+      {
+        Vec3fa O = Vec3fa(ray_org) - P;
+        Vec3fa D = Vec3fa(ray_dir);
+        float ON = dot(O,N);
+        float DN = dot(D,N);
+        bool eps = abs(DN) < min_rcp_input;
+        float t = -ON*rcp(DN);
+        float lower = select(eps || DN < 0.0f, float(neg_inf), t);
+        float upper = select(eps || DN > 0.0f, float(pos_inf), t);
+        return BBox1f(lower,upper);
+      }
+    };
+
+    template<int M>
+      struct HalfPlaneN
+      {
+        const Vec3vf<M> P;  //!< plane origin
+        const Vec3vf<M> N;  //!< plane normal
+
+        __forceinline HalfPlaneN(const Vec3vf<M>& P, const Vec3vf<M>& N)
+          : P(P), N(N) {}
+
+        __forceinline BBox<vfloat<M>> intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const
+        {
+          Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray_org) - P;
+          Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray_dir);
+          vfloat<M> ON = dot(O,N);
+          vfloat<M> DN = dot(D,N);
+          vbool<M> eps = abs(DN) < min_rcp_input;
+          vfloat<M> t = -ON*rcp(DN);
+          vfloat<M> lower = select(eps | DN < 0.0f, vfloat<M>(neg_inf), t);
+          vfloat<M> upper = select(eps | DN > 0.0f, vfloat<M>(pos_inf), t);
+          return BBox<vfloat<M>>(lower,upper);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/pointi.h b/thirdparty/embree/kernels/geometry/pointi.h
new file mode 100644
index 0000000000..bed04116b0
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/pointi.h
@@ -0,0 +1,412 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  template<int M>
+  struct PointMi
+  {
+    /* Virtual interface to query information about the line segment type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+   public:
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored line segments */
+    static __forceinline size_t max_size()
+    {
+      return M;
+    }
+
+    /* Returns required number of primitive blocks for N line segments */
+    static __forceinline size_t blocks(size_t N)
+    {
+      return (N + max_size() - 1) / max_size();
+    }
+
+    /* Returns required number of bytes for N line segments */
+    static __forceinline size_t bytes(size_t N)
+    {
+      return blocks(N) * sizeof(PointMi);
+    }
+
+   public:
+    /* Default constructor */
+    __forceinline PointMi() {}
+
+    /* Construction from vertices and IDs */
+    __forceinline PointMi(const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype, uint32_t numPrimitives)
+        : gtype((unsigned char)gtype),
+          numPrimitives(numPrimitives),
+          sharedGeomID(geomIDs[0]),
+          primIDs(primIDs)
+    {
+      assert(all(vuint<M>(geomID()) == geomIDs));
+    }
+
+    /* Returns a mask that tells which line segments are valid */
+    __forceinline vbool<M> valid() const {
+      return vint<M>(step) < vint<M>(numPrimitives);
+    }
+
+    /* Returns if the specified line segment is valid */
+    __forceinline bool valid(const size_t i) const
+    {
+      assert(i < M);
+      return i < numPrimitives;
+    }
+
+    /* Returns the number of stored line segments */
+    __forceinline size_t size() const {
+      return numPrimitives;
+    }
+
+    __forceinline unsigned int geomID(unsigned int i = 0) const {
+      return sharedGeomID;
+    }
+
+    __forceinline vuint<M>& primID() {
+      return primIDs;
+    }
+    __forceinline const vuint<M>& primID() const {
+      return primIDs;
+    }
+    __forceinline unsigned int primID(const size_t i) const {
+      assert(i < M);
+      return primIDs[i];
+    }
+
+    /* gather the line segments */
+    __forceinline void gather(Vec4vf<M>& p0, const Points* geom) const;
+    __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0, const Points* geom, const int itime) const;
+    __forceinline void gatheri(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0, const Points* geom, float time) const;
+    __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, float time) const;
+
+    /* Calculate the bounds of the line segments */
+    __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i = 0; i < M && valid(i); i++) {
+        const Points* geom = scene->get<Points>(geomID(i));
+        bounds.extend(geom->bounds(primID(i),itime));
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive */
+    __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) {
+      return LBBox3fa(bounds(scene, itime + 0), bounds(scene, itime + 1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene* const scene, size_t itime, size_t numTimeSteps)
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i = 0; i < M && valid(i); i++) {
+        const Points* geom = scene->get<Points>(geomID(i));
+        allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return allBounds;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene* const scene, const BBox1f time_range)
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i = 0; i < M && valid(i); i++) {
+        const Points* geom = scene->get<Points>(geomID((unsigned int)i));
+        allBounds.extend(geom->linearBounds(primID(i), time_range));
+      }
+      return allBounds;
+    }
+
+    /* Fill line segment from line segment list */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      Geometry::GType gty = scene->get(prims[begin].geomID())->getType();
+      vuint<M> geomID, primID;
+      vuint<M> v0;
+      const PrimRefT* prim = &prims[begin];
+
+      int numPrimitives = 0;
+      for (size_t i = 0; i < M; i++) {
+        if (begin < end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+          begin++;
+          numPrimitives++;
+        } else {
+          assert(i);
+          if (i > 0) {
+            geomID[i] = geomID[i - 1];
+            primID[i] = primID[i - 1];
+          }
+        }
+        if (begin < end)
+          prim = &prims[begin];  // FIXME: remove this line
+      }
+      new (this) PointMi(geomID, primID, gty, numPrimitives);  // FIXME: use non temporal store
+    }
+
+    template<typename BVH, typename Allocator>
+    __forceinline static typename BVH::NodeRef createLeaf(BVH* bvh,
+                                                          const PrimRef* prims,
+                                                          const range<size_t>& set,
+                                                          const Allocator& alloc)
+    {
+      size_t start    = set.begin();
+      size_t items    = PointMi::blocks(set.size());
+      size_t numbytes = PointMi::bytes(set.size());
+      PointMi* accel  = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float));
+      for (size_t i = 0; i < items; i++) {
+        accel[i].fill(prims, start, set.end(), bvh->scene);
+      }
+      return bvh->encodeLeaf((char*)accel, items);
+    };
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, itime);
+    }
+
+    __forceinline LBBox3fa fillMB(
+        const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, time_range);
+    }
+
+    template<typename BVH, typename SetMB, typename Allocator>
+    __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+    {
+      size_t start                     = prims.object_range.begin();
+      size_t end                       = prims.object_range.end();
+      size_t items                     = PointMi::blocks(prims.object_range.size());
+      size_t numbytes                  = PointMi::bytes(prims.object_range.size());
+      PointMi* accel                   = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float));
+      const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel, items);
+
+      LBBox3fa bounds = empty;
+      for (size_t i = 0; i < items; i++)
+        bounds.extend(accel[i].fillMB(prims.prims->data(), start, end, bvh->scene, prims.time_range));
+
+      return typename BVH::NodeRecordMB4D(node, bounds, prims.time_range);
+    };
+
+    /*! output operator */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PointMi& line)
+    {
+      return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}";
+    }
+
+   public:
+    unsigned char gtype;
+    unsigned char numPrimitives;
+    unsigned int sharedGeomID;
+
+   private:
+    vuint<M> primIDs;  // primitive ID
+  };
+
+  template<>
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom) const
+  {
+    const vfloat4 a0   = vfloat4::loadu(geom->vertexPtr(primID(0)));
+    const vfloat4 a1   = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 a2   = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 a3   = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom) const
+  {
+    const vfloat4 a0   = vfloat4::loadu(geom->vertexPtr(primID(0)));
+    const vfloat4 a1   = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 a2   = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 a3   = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);
+    const vfloat4 b0 = vfloat4(geom->normal(primID(0)));
+    const vfloat4 b1 = vfloat4(geom->normal(primID(1)));
+    const vfloat4 b2 = vfloat4(geom->normal(primID(2)));
+    const vfloat4 b3 = vfloat4(geom->normal(primID(3)));
+    transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z);
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, const Points* geom, const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);
+    const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime));
+    const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime));
+    const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime));
+    const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime));
+    transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z);
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom, float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+
+    Vec4vf4 a0; gatheri(a0, geom, itime);
+    Vec4vf4 b0; gatheri(b0, geom, itime + 1);
+    p0 = lerp(a0, b0, vfloat4(ftime));
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+
+    Vec4vf4 a0, b0;
+    Vec3vf4 norm0, norm1;
+    gatheri(a0, norm0, geom, itime);
+    gatheri(b0, norm1, geom, itime + 1);
+    p0 = lerp(a0, b0, vfloat4(ftime));
+    n0 = lerp(norm0, norm1, vfloat4(ftime));
+  }
+
+#if defined(__AVX__)
+
+  template<>
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4)));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5)));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6)));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7)));
+    transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4)));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5)));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6)));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7)));
+    transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);
+    const vfloat4 b0 = vfloat4(geom->normal(primID(0)));
+    const vfloat4 b1 = vfloat4(geom->normal(primID(1)));
+    const vfloat4 b2 = vfloat4(geom->normal(primID(2)));
+    const vfloat4 b3 = vfloat4(geom->normal(primID(3)));
+    const vfloat4 b4 = vfloat4(geom->normal(primID(4)));
+    const vfloat4 b5 = vfloat4(geom->normal(primID(5)));
+    const vfloat4 b6 = vfloat4(geom->normal(primID(6)));
+    const vfloat4 b7 = vfloat4(geom->normal(primID(7)));
+    transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z);
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, const Points* geom, const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime));
+    transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, const int itime) const
+  {
+    const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+    const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime));
+    const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime));
+    const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime));
+    const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime));
+    transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);
+    const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime));
+    const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime));
+    const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime));
+    const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime));
+    const vfloat4 b4 = vfloat4(geom->normal(primID(4), itime));
+    const vfloat4 b5 = vfloat4(geom->normal(primID(5), itime));
+    const vfloat4 b6 = vfloat4(geom->normal(primID(6), itime));
+    const vfloat4 b7 = vfloat4(geom->normal(primID(7), itime));
+    transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z);
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom, float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+
+    Vec4vf8 a0;
+    gatheri(a0, geom, itime);
+    Vec4vf8 b0;
+    gatheri(b0, geom, itime + 1);
+    p0 = lerp(a0, b0, vfloat8(ftime));
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, float time) const
+  {
+    float ftime;
+    const int itime = geom->timeSegment(time, ftime);
+
+    Vec4vf8 a0, b0;
+    Vec3vf8 norm0, norm1;
+    gatheri(a0, norm0, geom, itime);
+    gatheri(b0, norm1, geom, itime + 1);
+    p0 = lerp(a0, b0, vfloat8(ftime));
+    n0 = lerp(norm0, norm1, vfloat8(ftime));
+  }
+#endif
+
+  template<int M>
+  typename PointMi<M>::Type PointMi<M>::type;
+
+  typedef PointMi<4> Point4i;
+  typedef PointMi<8> Point8i;
+  
+}  // namespace embree
diff --git a/thirdparty/embree/kernels/geometry/primitive.h b/thirdparty/embree/kernels/geometry/primitive.h
new file mode 100644
index 0000000000..608d981dd7
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/primitive.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/scene.h"
+#include "../../common/simd/simd.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+
+namespace embree
+{
+  struct PrimitiveType
+  {
+    /*! returns name of this primitive type */
+    virtual const char* name() const = 0;
+    
+    /*! Returns the number of stored active primitives in a block. */
+    virtual size_t sizeActive(const char* This) const = 0;
+
+    /*! Returns the number of stored active and inactive primitives in a block. */
+    virtual size_t sizeTotal(const char* This) const = 0;
+
+    /*! Returns the number of bytes of block. */
+    virtual size_t getBytes(const char* This) const = 0;
+  };
+  
+  template<typename Primitive>
+  struct PrimitivePointQuery1
+  {
+    static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim)
+    {
+      bool changed = false;
+      for (size_t i = 0; i < Primitive::max_size(); i++)
+      {
+        if (!prim.valid(i)) break;
+        STAT3(point_query.trav_prims,1,1,1);
+        AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID(i));
+        context->geomID = prim.geomID(i);
+        context->primID = prim.primID(i);
+        changed |= accel->pointQuery(query, context);
+      }
+      return changed;
+    }
+    
+    static __forceinline void pointQueryNoop(PointQuery* query, PointQueryContext* context, const Primitive& prim) { }
+  };
+}
diff --git a/thirdparty/embree/kernels/geometry/primitive4.cpp b/thirdparty/embree/kernels/geometry/primitive4.cpp
new file mode 100644
index 0000000000..9c953c5d35
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/primitive4.cpp
@@ -0,0 +1,379 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primitive.h"
+#include "curveNv.h"
+#include "curveNi.h"
+#include "curveNi_mb.h"
+#include "linei.h"
+#include "triangle.h"
+#include "trianglev.h"
+#include "trianglev_mb.h"
+#include "trianglei.h"
+#include "quadv.h"
+#include "quadi.h"
+#include "subdivpatch1.h"
+#include "object.h"
+#include "instance.h"
+#include "subgrid.h"
+
+namespace embree
+{
+  /********************** Curve4v **************************/
+
+  template<>
+  const char* Curve4v::Type::name () const {
+    return "curve4v";
+  }
+
+  template<>
+  size_t Curve4v::Type::sizeActive(const char* This) const
+  {
+    if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+      return ((Line4i*)This)->size();
+    else
+      return ((Curve4v*)This)->N;
+  }
+
+  template<>
+  size_t Curve4v::Type::sizeTotal(const char* This) const
+  {
+    if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+      return 4;
+    else
+      return ((Curve4v*)This)->N;
+  }
+
+  template<>
+  size_t Curve4v::Type::getBytes(const char* This) const
+  {
+     if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+      return Line4i::bytes(sizeActive(This));
+     else
+        return Curve4v::bytes(sizeActive(This));
+  }
+
+  /********************** Curve4i **************************/
+
+  template<>
+  const char* Curve4i::Type::name () const {
+    return "curve4i";
+  }
+
+  template<>
+  size_t Curve4i::Type::sizeActive(const char* This) const
+  {
+    if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+      return ((Line4i*)This)->size();
+    else
+      return ((Curve4i*)This)->N;
+  }
+
+  template<>
+  size_t Curve4i::Type::sizeTotal(const char* This) const
+  {
+    if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+      return 4;
+    else
+      return ((Curve4i*)This)->N;
+  }
+
+  template<>
+  size_t Curve4i::Type::getBytes(const char* This) const
+  {
+    if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+      return Line4i::bytes(sizeActive(This));
+    else
+      return Curve4i::bytes(sizeActive(This));
+  }
+
+  /********************** Curve4iMB **************************/
+
+  template<>
+  const char* Curve4iMB::Type::name () const {
+    return "curve4imb";
+  }
+
+  template<>
+  size_t Curve4iMB::Type::sizeActive(const char* This) const
+  {
+    if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+      return ((Line4i*)This)->size();
+    else
+      return ((Curve4iMB*)This)->N;
+  }
+
+  template<>
+  size_t Curve4iMB::Type::sizeTotal(const char* This) const
+  {
+    if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+      return 4;
+    else
+      return ((Curve4iMB*)This)->N;
+  }
+
+  template<>
+  size_t Curve4iMB::Type::getBytes(const char* This) const
+  {
+    if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+      return Line4i::bytes(sizeActive(This));
+    else
+      return Curve4iMB::bytes(sizeActive(This));
+  }
+
+  /********************** Line4i **************************/
+
+  template<>
+  const char* Line4i::Type::name () const {
+    return "line4i";
+  }
+
+  template<>
+  size_t Line4i::Type::sizeActive(const char* This) const {
+    return ((Line4i*)This)->size();
+  }
+
+  template<>
+  size_t Line4i::Type::sizeTotal(const char* This) const {
+    return 4;
+  }
+
+  template<>
+  size_t Line4i::Type::getBytes(const char* This) const {
+    return sizeof(Line4i);
+  }
+
+  /********************** Triangle4 **************************/
+
+  template<>
+  const char* Triangle4::Type::name () const {
+    return "triangle4";
+  }
+
+  template<>
+  size_t Triangle4::Type::sizeActive(const char* This) const {
+    return ((Triangle4*)This)->size();
+  }
+
+  template<>
+  size_t Triangle4::Type::sizeTotal(const char* This) const {
+    return 4;
+  }
+
+  template<>
+  size_t Triangle4::Type::getBytes(const char* This) const {
+    return sizeof(Triangle4);
+  }
+
+  /********************** Triangle4v **************************/
+
+  template<>
+  const char* Triangle4v::Type::name () const {
+    return "triangle4v";
+  }
+
+  template<>
+  size_t Triangle4v::Type::sizeActive(const char* This) const {
+    return ((Triangle4v*)This)->size();
+  }
+
+  template<>
+  size_t Triangle4v::Type::sizeTotal(const char* This) const {
+    return 4;
+  }
+
+  template<>
+  size_t Triangle4v::Type::getBytes(const char* This) const {
+    return sizeof(Triangle4v);
+  }
+
+  /********************** Triangle4i **************************/
+
+  template<>
+  const char* Triangle4i::Type::name () const {
+    return "triangle4i";
+  }
+
+  template<>
+  size_t Triangle4i::Type::sizeActive(const char* This) const {
+    return ((Triangle4i*)This)->size();
+  }
+
+  template<>
+  size_t Triangle4i::Type::sizeTotal(const char* This) const {
+    return 4;
+  }
+
+  template<>
+  size_t Triangle4i::Type::getBytes(const char* This) const {
+    return sizeof(Triangle4i);
+  }
+
+  /********************** Triangle4vMB **************************/
+
+  template<>
+  const char* Triangle4vMB::Type::name () const {
+    return  "triangle4vmb";
+  }
+
+  template<>
+  size_t Triangle4vMB::Type::sizeActive(const char* This) const {
+    return ((Triangle4vMB*)This)->size();
+  }
+
+  template<>
+  size_t Triangle4vMB::Type::sizeTotal(const char* This) const {
+    return 4;
+  }
+
+  template<>
+  size_t Triangle4vMB::Type::getBytes(const char* This) const {
+    return sizeof(Triangle4vMB);
+  }
+
+  /********************** Quad4v **************************/
+
+  template<>
+  const char* Quad4v::Type::name () const {
+    return "quad4v";
+  }
+
+  template<>
+  size_t Quad4v::Type::sizeActive(const char* This) const {
+    return ((Quad4v*)This)->size();
+  }
+
+  template<>
+  size_t Quad4v::Type::sizeTotal(const char* This) const {
+    return 4;
+  }
+
+  template<>
+  size_t Quad4v::Type::getBytes(const char* This) const {
+    return sizeof(Quad4v);
+  }
+
+  /********************** Quad4i **************************/
+
+  template<>
+  const char* Quad4i::Type::name () const {
+    return "quad4i";
+  }
+
+  template<>
+  size_t Quad4i::Type::sizeActive(const char* This) const {
+    return ((Quad4i*)This)->size();
+  }
+
+  template<>
+  size_t Quad4i::Type::sizeTotal(const char* This) const {
+    return 4;
+  }
+
+  template<>
+  size_t Quad4i::Type::getBytes(const char* This) const {
+    return sizeof(Quad4i);
+  }
+
+  /********************** SubdivPatch1 **************************/
+
+  const char* SubdivPatch1::Type::name () const {
+    return "subdivpatch1";
+  }
+
+  size_t SubdivPatch1::Type::sizeActive(const char* This) const {
+    return 1;
+  }
+
+  size_t SubdivPatch1::Type::sizeTotal(const char* This) const {
+    return 1;
+  }
+
+  size_t SubdivPatch1::Type::getBytes(const char* This) const {
+    return sizeof(SubdivPatch1);
+  }
+
+  SubdivPatch1::Type SubdivPatch1::type;
+
+  /********************** Virtual Object **************************/
+
+  const char* Object::Type::name () const {
+    return "object";
+  }
+
+  size_t Object::Type::sizeActive(const char* This) const {
+    return 1;
+  }
+
+  size_t Object::Type::sizeTotal(const char* This) const {
+    return 1;
+  }
+
+  size_t Object::Type::getBytes(const char* This) const {
+    return sizeof(Object);
+  }
+
+  Object::Type Object::type;
+
+  /********************** Instance **************************/
+
+  const char* InstancePrimitive::Type::name () const {
+    return "instance";
+  }
+
+  size_t InstancePrimitive::Type::sizeActive(const char* This) const {
+    return 1;
+  }
+
+  size_t InstancePrimitive::Type::sizeTotal(const char* This) const {
+    return 1;
+  }
+
+  size_t InstancePrimitive::Type::getBytes(const char* This) const {
+    return sizeof(InstancePrimitive);
+  }
+
+  InstancePrimitive::Type InstancePrimitive::type;
+
+  /********************** SubGrid **************************/
+
+  const char* SubGrid::Type::name () const {
+    return "subgrid";
+  }
+
+  size_t SubGrid::Type::sizeActive(const char* This) const {
+    return 1;
+  }
+
+  size_t SubGrid::Type::sizeTotal(const char* This) const {
+    return 1;
+  }
+
+  size_t SubGrid::Type::getBytes(const char* This) const {
+    return sizeof(SubGrid);
+  }
+
+  SubGrid::Type SubGrid::type;
+  
+  /********************** SubGridQBVH4 **************************/
+
+  template<>
+  const char* SubGridQBVH4::Type::name () const {
+    return "SubGridQBVH4";
+  }
+
+  template<>
+  size_t SubGridQBVH4::Type::sizeActive(const char* This) const {
+    return 1;
+  }
+
+  template<>
+  size_t SubGridQBVH4::Type::sizeTotal(const char* This) const {
+    return 1;
+  }
+
+  template<>
+  size_t SubGridQBVH4::Type::getBytes(const char* This) const {
+    return sizeof(SubGridQBVH4);
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/quad_intersector.h b/thirdparty/embree/kernels/geometry/quad_intersector.h
new file mode 100644
index 0000000000..93c9526912
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/quad_intersector.h
@@ -0,0 +1,76 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects a ray with a quad with backface culling
+     *  enabled. The quad v0,v1,v2,v3 is split into two triangles
+     *  v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two
+     *  triangles gets intersected. */
+    template<int N>
+    __forceinline vbool<N> intersect_quad_backface_culling(const vbool<N>& valid0,
+                                                           const Vec3fa& ray_org,
+                                                           const Vec3fa& ray_dir,
+                                                           const float ray_tnear,
+                                                           const float ray_tfar,
+                                                           const Vec3vf<N>& quad_v0,
+                                                           const Vec3vf<N>& quad_v1,
+                                                           const Vec3vf<N>& quad_v2,
+                                                           const Vec3vf<N>& quad_v3,
+                                                           vfloat<N>& u_o,
+                                                           vfloat<N>& v_o,
+                                                           vfloat<N>& t_o)
+    {
+      /* calculate vertices relative to ray origin */
+      vbool<N> valid = valid0;
+      const Vec3vf<N> O = Vec3vf<N>(ray_org);
+      const Vec3vf<N> D = Vec3vf<N>(ray_dir);
+      const Vec3vf<N> va = quad_v0-O;
+      const Vec3vf<N> vb = quad_v1-O;
+      const Vec3vf<N> vc = quad_v2-O;
+      const Vec3vf<N> vd = quad_v3-O;
+
+      const Vec3vf<N> edb = vb-vd;
+      const vfloat<N> WW = dot(cross(vd,edb),D);
+      const Vec3vf<N> v0 = select(WW <= 0.0f,va,vc);
+      const Vec3vf<N> v1 = select(WW <= 0.0f,vb,vd);
+      const Vec3vf<N> v2 = select(WW <= 0.0f,vd,vb);
+
+      /* calculate edges */
+      const Vec3vf<N> e0 = v2-v0;
+      const Vec3vf<N> e1 = v0-v1;
+
+      /* perform edge tests */
+      const vfloat<N> U = dot(cross(v0,e0),D);
+      const vfloat<N> V = dot(cross(v1,e1),D);
+      valid &= max(U,V) <= 0.0f;
+      if (unlikely(none(valid))) return false;
+
+      /* calculate geometry normal and denominator */
+      const Vec3vf<N> Ng = cross(e1,e0);
+      const vfloat<N> den = dot(Ng,D);
+      const vfloat<N> rcpDen = rcp(den);
+
+      /* perform depth test */
+      const vfloat<N> t = rcpDen*dot(v0,Ng);
+      valid &= vfloat<N>(ray_tnear) <= t & t <= vfloat<N>(ray_tfar);
+      if (unlikely(none(valid))) return false;
+
+      /* avoid division by 0 */
+      valid &= den != vfloat<N>(zero);
+      if (unlikely(none(valid))) return false;
+
+      /* update hit information */
+      t_o = t;
+      u_o = U * rcpDen;
+      v_o = V * rcpDen;
+      u_o = select(WW <= 0.0f,u_o,1.0f-u_o);
+      v_o = select(WW <= 0.0f,v_o,1.0f-v_o);
+      return valid;
+    }
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h b/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h
new file mode 100644
index 0000000000..3abc9d6f70
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h
@@ -0,0 +1,460 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadv.h"
+#include "triangle_intersector_moeller.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct QuadHitM
+    {
+      __forceinline QuadHitM() {}
+
+      __forceinline QuadHitM(const vbool<M>& valid,
+                             const vfloat<M>& U,
+                             const vfloat<M>& V,
+                             const vfloat<M>& T,
+                             const vfloat<M>& absDen,
+                             const Vec3vf<M>& Ng,
+                             const vbool<M>& flags)
+        : U(U), V(V), T(T), absDen(absDen), tri_Ng(Ng), valid(valid), flags(flags) {}
+
+      __forceinline void finalize()
+      {
+        const vfloat<M> rcpAbsDen = rcp(absDen);
+        vt = T * rcpAbsDen;
+        const vfloat<M> u = min(U * rcpAbsDen,1.0f);
+        const vfloat<M> v = min(V * rcpAbsDen,1.0f);
+        const vfloat<M> u1 = vfloat<M>(1.0f) - u;
+        const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+        vu = select(flags,u1,u);
+        vv = select(flags,v1,v);
+        vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+#else
+        const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f));
+        vv = select(flags,u1,v);
+        vu = select(flags,v1,u);
+        vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z);
+#endif
+      }
+
+      __forceinline Vec2f uv(const size_t i)
+      {
+        const float u = vu[i];
+        const float v = vv[i];
+        return Vec2f(u,v);
+      }
+
+      __forceinline float   t(const size_t i) { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+    private:
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> T;
+      vfloat<M> absDen;
+      Vec3vf<M> tri_Ng;
+
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+
+    public:
+      const vbool<M> flags;
+    };
+
+    template<int K>
+    struct QuadHitK
+    {
+      __forceinline QuadHitK(const vfloat<K>& U,
+                             const vfloat<K>& V,
+                             const vfloat<K>& T,
+                             const vfloat<K>& absDen,
+                             const Vec3vf<K>& Ng,
+                             const vbool<K>& flags)
+        : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng) {}
+
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vfloat<K> rcpAbsDen = rcp(absDen);
+        const vfloat<K> t = T * rcpAbsDen;
+        const vfloat<K> u0 = min(U * rcpAbsDen,1.0f);
+        const vfloat<K> v0 = min(V * rcpAbsDen,1.0f);
+        const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+        const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+        const vfloat<K> u = select(flags,u1,u0);
+        const vfloat<K> v = select(flags,v1,v0);
+        const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+        return std::make_tuple(u,v,t,Ng);
+      }
+
+    private:
+      const vfloat<K> U;
+      const vfloat<K> V;
+      const vfloat<K> T;
+      const vfloat<K> absDen;
+      const vbool<K> flags;
+      const Vec3vf<K> tri_Ng;
+    };
+
+    /* ----------------------------- */
+    /* -- single ray intersectors -- */
+    /* ----------------------------- */
+
+
+    template<int M, bool filter>
+    struct QuadMIntersector1MoellerTrumbore;
+
+    /*! Intersects M quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMIntersector1MoellerTrumbore
+    {
+      __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+
+      __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        UVIdentity<M> mapUV;
+        MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV);
+        MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+        Intersect1EpilogM<M,filter> epilog(ray,context,geomID,primID);
+
+        /* intersect first triangle */
+        if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) 
+          epilog(hit.valid,hit);
+
+        /* intersect second triangle */
+        if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) 
+        {
+          hit.U = hit.absDen - hit.U;
+          hit.V = hit.absDen - hit.V;
+          epilog(hit.valid,hit);
+        }
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                  const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        UVIdentity<M> mapUV;
+        MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV);
+        MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+        Occluded1EpilogM<M,filter> epilog(ray,context,geomID,primID);
+
+        /* intersect first triangle */
+        if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) 
+        {
+          if (epilog(hit.valid,hit))
+            return true;
+        }
+
+        /* intersect second triangle */
+        if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) 
+        {
+          hit.U = hit.absDen - hit.U;
+          hit.V = hit.absDen - hit.V;
+          if (epilog(hit.valid,hit))
+            return true;
+        }
+        return false;
+      }
+    };
+
+#if defined(__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<bool filter>
+    struct QuadMIntersector1MoellerTrumbore<4,filter>
+    {
+      __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+      
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));        
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        UVIdentity<8> mapUV;
+        MoellerTrumboreHitM<8,UVIdentity<8>> hit(mapUV);
+        MoellerTrumboreIntersector1<8> intersector(ray,nullptr);
+        const vbool8 flags(0,0,0,0,1,1,1,1);
+        if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,mapUV,hit)))
+        {
+          vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;
+
+#if !defined(EMBREE_BACKFACE_CULLING)
+          hit.U = select(flags,absDen-V,U);
+          hit.V = select(flags,absDen-U,V);
+          hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // FIXME: use XOR
+#else
+          hit.U = select(flags,absDen-U,U);
+          hit.V = select(flags,absDen-V,V);
+#endif
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+
+    struct MoellerTrumboreIntersector1KTriangleM
+    {
+      /*! Intersect k'th ray from ray packet of size K with M triangles. */
+      template<int M, int K, typename Epilog>
+      static  __forceinline bool intersect(RayK<K>& ray,
+                                           size_t k,
+                                           const Vec3vf<M>& tri_v0,
+                                           const Vec3vf<M>& tri_e1,
+                                           const Vec3vf<M>& tri_e2,
+                                           const Vec3vf<M>& tri_Ng,
+                                           const vbool<M>& flags,
+                                           const Epilog& epilog)
+      {
+        /* calculate denominator */
+        const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+        const Vec3vf<M> R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests */
+        const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+        const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+        if (likely(none(valid))) return false;
+        
+        /* calculate hit information */
+        QuadHitM<M> hit(valid,U,V,T,absDen,tri_Ng,flags);
+        return epilog(valid,hit);
+      }
+      
+      template<int M, int K, typename Epilog>
+      static __forceinline bool intersect1(RayK<K>& ray,
+                                           size_t k,
+                                           const Vec3vf<M>& v0,
+                                           const Vec3vf<M>& v1,
+                                           const Vec3vf<M>& v2,
+                                           const vbool<M>& flags,
+                                           const Epilog& epilog)
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        const Vec3vf<M> Ng = cross(e2,e1);
+        return intersect<M,K>(ray,k,v0,e1,e2,Ng,flags,epilog);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumboreBase
+    {
+      __forceinline QuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {}
+            
+      /*! Intersects K rays with one of M triangles. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_e1,
+                                        const Vec3vf<K>& tri_e2,
+                                        const Vec3vf<K>& tri_Ng,
+                                        const vbool<K>& flags,
+                                        const Epilog& epilog) const
+      { 
+        /* calculate denominator */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> C = tri_v0 - ray.org;
+        const Vec3vf<K> R = cross(C,ray.dir);
+        const vfloat<K> den = dot(tri_Ng,ray.dir);
+        const vfloat<K> absDen = abs(den);
+        const vfloat<K> sgnDen = signmsk(den);
+        
+        /* test against edge p2 p0 */
+        const vfloat<K> U = dot(R,tri_e2) ^ sgnDen;
+        valid &= U >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p0 p1 */
+        const vfloat<K> V = dot(R,tri_e1) ^ sgnDen;
+        valid &= V >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p1 p2 */
+        const vfloat<K> W = absDen-U-V;
+        valid &= W >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test */
+        const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+        valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar);
+        if (unlikely(none(valid))) return false;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= den < vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#else
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#endif
+        
+        /* calculate hit information */
+        QuadHitK<K> hit(U,V,T,absDen,tri_Ng,flags);
+        return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one of M quads. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const vbool<K>& flags,
+                                        const Epilog& epilog) const
+      {
+        const Vec3vf<K> e1 = tri_v0-tri_v1;
+        const Vec3vf<K> e2 = tri_v2-tri_v0;
+        const Vec3vf<K> Ng = cross(e2,e1);
+        return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,epilog);
+      }
+
+      /*! Intersects K rays with one of M quads. */
+      template<typename Epilog>
+      __forceinline bool intersectK(const vbool<K>& valid0, 
+                                    RayK<K>& ray,
+                                    const Vec3vf<K>& v0,
+                                    const Vec3vf<K>& v1,
+                                    const Vec3vf<K>& v2,
+                                    const Vec3vf<K>& v3,
+                                    const Epilog& epilog) const
+      {
+        intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog);
+        if (none(valid0)) return true;
+        intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog);
+        return none(valid0);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumbore : public QuadMIntersectorKMoellerTrumboreBase<M,K,filter>
+    {
+      __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
+
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Intersect1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID);
+        MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog);
+        MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog);
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Occluded1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID);
+        if (MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true;
+        if (MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true;
+        return false;
+      }
+    };
+
+
+#if defined(__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+      
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        const vbool8 flags(0,0,0,0,1,1,1,1);
+        return MoellerTrumboreIntersector1KTriangleM::intersect1<8,K>(ray,k,vtx0,vtx1,vtx2,flags,epilog); 
+      }
+      
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h
new file mode 100644
index 0000000000..9873ff76ac
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h
@@ -0,0 +1,438 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quad_intersector_moeller.h"
+
+/*! Modified Pluecker ray/triangle intersector. The test first shifts
+ *  the ray origin into the origin of the coordinate system and then
+ *  uses Pluecker coordinates for the intersection. Due to the shift,
+ *  the Pluecker coordinate calculation simplifies and the tests get
+ *  numerically stable. The edge equations are watertight along the
+ *  edge for neighboring triangles. */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct QuadHitPlueckerM
+    {
+      __forceinline QuadHitPlueckerM() {}
+
+      __forceinline QuadHitPlueckerM(const vbool<M>& valid,
+                                     const vfloat<M>& U,
+                                     const vfloat<M>& V,
+                                     const vfloat<M>& UVW,
+                                     const vfloat<M>& t,
+                                     const Vec3vf<M>& Ng,
+                                     const vbool<M>& flags)
+        : U(U), V(V), UVW(UVW), tri_Ng(Ng), valid(valid), vt(t), flags(flags) {}
+
+      __forceinline void finalize()
+      {
+        const vbool<M> invalid = abs(UVW) < min_rcp_input;
+        const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW));
+        const vfloat<M> u = min(U * rcpUVW,1.0f);
+        const vfloat<M> v = min(V * rcpUVW,1.0f);
+        const vfloat<M> u1 = vfloat<M>(1.0f) - u;
+        const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+        vu = select(flags,u1,u);
+        vv = select(flags,v1,v);
+        vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+#else
+        const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f));
+        vv = select(flags,u1,v);
+        vu = select(flags,v1,u);
+        vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z);
+#endif
+      }
+
+      __forceinline Vec2f uv(const size_t i)
+      {
+        const float u = vu[i];
+        const float v = vv[i];
+        return Vec2f(u,v);
+      }
+
+      __forceinline float   t(const size_t i) { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+    private:
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> UVW;
+      Vec3vf<M> tri_Ng;
+
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+
+    public:
+      const vbool<M> flags;
+    };
+
+    template<int K>
+    struct QuadHitPlueckerK
+    {
+      __forceinline QuadHitPlueckerK(const vfloat<K>& U,
+                                     const vfloat<K>& V,
+                                     const vfloat<K>& UVW,
+                                     const vfloat<K>& t,
+                                     const Vec3vf<K>& Ng,
+                                     const vbool<K>& flags)
+        : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng) {}
+
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vbool<K> invalid = abs(UVW) < min_rcp_input;
+        const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW));
+        const vfloat<K> u0 = min(U * rcpUVW,1.0f);
+        const vfloat<K> v0 = min(V * rcpUVW,1.0f);
+        const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+        const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+        const vfloat<K> u = select(flags,u1,u0);
+        const vfloat<K> v = select(flags,v1,v0);
+        const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+        return std::make_tuple(u,v,t,Ng);
+      }
+
+    private:
+      const vfloat<K> U;
+      const vfloat<K> V;
+      const vfloat<K> UVW;
+      const vfloat<K> t;
+      const vbool<K> flags;
+      const Vec3vf<K> tri_Ng;
+    };
+
+    struct PlueckerIntersectorTriangle1
+    {
+      template<int M, typename Epilog>
+      static __forceinline bool intersect(Ray& ray,
+                                          const Vec3vf<M>& tri_v0,
+                                          const Vec3vf<M>& tri_v1,
+                                          const Vec3vf<M>& tri_v2,
+                                          const vbool<M>& flags,
+                                          const Epilog& epilog)
+      {
+        /* calculate vertices relative to ray origin */
+        const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+        const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> v0 = tri_v0-O;
+        const Vec3vf<M> v1 = tri_v1-O;
+        const Vec3vf<M> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<M> e0 = v2-v0;
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v1-v2;
+
+        /* perform edge tests */
+        const vfloat<M> U = dot(cross(e0,v2+v0),D);
+        const vfloat<M> V = dot(cross(e1,v0+v1),D);
+        const vfloat<M> W = dot(cross(e2,v1+v2),D);
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = max(U,V,W) <= eps;
+#else
+        vbool<M> valid =  (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+        if (unlikely(none(valid))) return false;
+
+        /* calculate geometry normal and denominator */
+        const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<M> den = twice(dot(Ng,D));
+
+         /* perform depth test */
+        const vfloat<M> T = twice(dot(v0,Ng));
+        const vfloat<M> t = rcp(den)*T;
+        valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar);
+        valid &= den != vfloat<M>(zero);
+        if (unlikely(none(valid))) return false;
+
+        /* update hit information */
+        QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags);
+        return epilog(valid,hit);
+      }
+    };
+
+    /*! Intersects M quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMIntersector1Pluecker
+    {
+      __forceinline QuadMIntersector1Pluecker() {}
+
+      __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+
+      __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Intersect1EpilogM<M,filter> epilog(ray,context,geomID,primID);
+        PlueckerIntersectorTriangle1::intersect<M>(ray,v0,v1,v3,vbool<M>(false),epilog);
+        PlueckerIntersectorTriangle1::intersect<M>(ray,v2,v3,v1,vbool<M>(true),epilog);
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                  const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Occluded1EpilogM<M,filter> epilog(ray,context,geomID,primID);
+        if (PlueckerIntersectorTriangle1::intersect<M>(ray,v0,v1,v3,vbool<M>(false),epilog)) return true;
+        if (PlueckerIntersectorTriangle1::intersect<M>(ray,v2,v3,v1,vbool<M>(true ),epilog)) return true;
+        return false;
+      }
+    };
+
+#if defined(__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<bool filter>
+    struct QuadMIntersector1Pluecker<4,filter>
+    {
+      __forceinline QuadMIntersector1Pluecker() {}
+
+      __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+      
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        const vbool8 flags(0,0,0,0,1,1,1,1);
+        return PlueckerIntersectorTriangle1::intersect<8>(ray,vtx0,vtx1,vtx2,flags,epilog); 
+      }
+      
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+    struct PlueckerIntersector1KTriangleM
+    {
+      /*! Intersect k'th ray from ray packet of size K with M triangles. */
+      template<int M, int K, typename Epilog>
+      static  __forceinline bool intersect1(RayK<K>& ray,
+                                            size_t k,
+                                            const Vec3vf<M>& tri_v0,
+                                            const Vec3vf<M>& tri_v1,
+                                            const Vec3vf<M>& tri_v2,
+                                            const vbool<M>& flags,
+                                            const Epilog& epilog)
+      {
+        /* calculate vertices relative to ray origin */
+          const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+          const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+          const Vec3vf<M> v0 = tri_v0-O;
+          const Vec3vf<M> v1 = tri_v1-O;
+          const Vec3vf<M> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<M> e0 = v2-v0;
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v1-v2;
+	  
+          /* perform edge tests */
+          const vfloat<M> U = dot(cross(e0,v2+v0),D);
+          const vfloat<M> V = dot(cross(e1,v0+v1),D);
+          const vfloat<M> W = dot(cross(e2,v1+v2),D);
+	  
+          const vfloat<M> UVW = U+V+W;
+          const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+          vbool<M> valid = max(U,V,W) <= eps;
+#else
+          vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif	
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate geometry normal and denominator */
+          const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<M> den = twice(dot(Ng,D));
+
+          /* perform depth test */
+          const vfloat<M> T = twice(dot(v0,Ng));
+          const vfloat<M> t = rcp(den)*T;
+          valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]);
+          if (unlikely(none(valid))) return false;
+          
+          /* avoid division by 0 */
+          valid &= den != vfloat<M>(zero);
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information */
+          QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags);
+          return epilog(valid,hit);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct QuadMIntersectorKPlueckerBase
+    {
+      __forceinline QuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {}
+            
+      /*! Intersects K rays with one of M triangles. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const vbool<K>& flags,
+                                        const Epilog& epilog) const
+      {
+        /* calculate vertices relative to ray origin */
+          vbool<K> valid = valid0;
+          const Vec3vf<K> O = ray.org;
+          const Vec3vf<K> D = ray.dir;
+          const Vec3vf<K> v0 = tri_v0-O;
+          const Vec3vf<K> v1 = tri_v1-O;
+          const Vec3vf<K> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<K> e0 = v2-v0;
+          const Vec3vf<K> e1 = v0-v1;
+          const Vec3vf<K> e2 = v1-v2;
+           
+          /* perform edge tests */
+          const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+          const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+          const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+          const vfloat<K> UVW = U+V+W;
+          const vfloat<K> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+          valid &= max(U,V,W) <= eps;
+#else
+          valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+          if (unlikely(none(valid))) return false;
+          
+           /* calculate geometry normal and denominator */
+          const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+          /* perform depth test */
+          const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+          const vfloat<K> t = rcp(den)*T;
+          valid &= ray.tnear() <= t & t <= ray.tfar;
+          valid &= den != vfloat<K>(zero);
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate hit information */
+          QuadHitPlueckerK<K> hit(U,V,UVW,t,Ng,flags);
+          return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one of M quads. */
+      template<typename Epilog>
+      __forceinline bool intersectK(const vbool<K>& valid0, 
+                                    RayK<K>& ray,
+                                    const Vec3vf<K>& v0,
+                                    const Vec3vf<K>& v1,
+                                    const Vec3vf<K>& v2,
+                                    const Vec3vf<K>& v3,
+                                    const Epilog& epilog) const
+      {
+        intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog);
+        if (none(valid0)) return true;
+        intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog);
+        return none(valid0);
+      }
+    };
+
+    template<int M, int K, bool filter>
+      struct QuadMIntersectorKPluecker : public QuadMIntersectorKPlueckerBase<M,K,filter>
+    {
+      __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
+
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Intersect1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID);
+        PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog);
+        PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog);
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Occluded1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID);
+        if (PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true;
+        if (PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true;
+        return false;
+      }
+    };
+
+#if defined(__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<int K, bool filter>
+    struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {}
+      
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+        const vbool8 flags(0,0,0,0,1,1,1,1);
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        return PlueckerIntersector1KTriangleM::intersect1<8,K>(ray,k,vtx0,vtx1,vtx2,flags,epilog); 
+      }
+      
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/quadi.h b/thirdparty/embree/kernels/geometry/quadi.h
new file mode 100644
index 0000000000..70a7bdf158
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/quadi.h
@@ -0,0 +1,483 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  /* Stores M quads from an indexed face set */
+  template <int M>
+  struct QuadMi
+  {
+    /* Virtual interface to query information about the quad type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored quads */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+  public:
+
+    /* Default constructor */
+    __forceinline QuadMi() {  }
+
+    /* Construction from vertices and IDs */
+    __forceinline QuadMi(const vuint<M>& v0,
+                         const vuint<M>& v1,
+                         const vuint<M>& v2,
+                         const vuint<M>& v3,
+                         const vuint<M>& geomIDs,
+                         const vuint<M>& primIDs)
+#if defined(EMBREE_COMPACT_POLYS)
+      : geomIDs(geomIDs), primIDs(primIDs) {}
+#else
+     : v0_(v0),v1_(v1), v2_(v2), v3_(v3), geomIDs(geomIDs), primIDs(primIDs) {}
+#endif
+
+    /* Returns a mask that tells which quads are valid */
+    __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+    /* Returns if the specified quad is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+    /* Returns the number of stored quads */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); assert(geomIDs[i] != -1); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the quads */
+    __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) {
+        const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i));
+        bounds.extend(mesh->bounds(primID(i),itime));
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive */
+    __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) {
+      return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps)
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i));
+        allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return allBounds;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range)
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i));
+        allBounds.extend(mesh->linearBounds(primID(i), time_range));
+      }
+      return allBounds;
+    }
+
+    /* Fill quad from quad list */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> geomID = -1, primID = -1;
+      const PrimRefT* prim = &prims[begin];
+      vuint<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+
+      for (size_t i=0; i<M; i++)
+      {
+        if (begin<end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+#if !defined(EMBREE_COMPACT_POLYS)
+          const QuadMesh* mesh = scene->get<QuadMesh>(prim->geomID());
+          const QuadMesh::Quad& q = mesh->quad(prim->primID());
+          unsigned int_stride = mesh->vertices0.getStride()/4;
+          v0[i] = q.v[0] * int_stride;
+          v1[i] = q.v[1] * int_stride;
+          v2[i] = q.v[2] * int_stride;
+          v3[i] = q.v[3] * int_stride;
+#endif
+          begin++;
+        } else {
+          assert(i);
+          if (likely(i > 0)) {
+            geomID[i] = geomID[0]; // always valid geomIDs
+            primID[i] = -1;        // indicates invalid data
+            v0[i] = v0[0];
+            v1[i] = v0[0];
+            v2[i] = v0[0];
+            v3[i] = v0[0];
+          }
+        }
+        if (begin<end) prim = &prims[begin];
+      }
+      new (this) QuadMi(v0,v1,v2,v3,geomID,primID); // FIXME: use non temporal store
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, itime);
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, time_range);
+    }
+
+    friend embree_ostream operator<<(embree_ostream cout, const QuadMi& quad) {
+      return cout << "QuadMi<" << M << ">( "
+#if !defined(EMBREE_COMPACT_POLYS)
+                  << "v0 = " << quad.v0_ << ", v1 = " << quad.v1_ << ", v2 = " << quad.v2_ << ", v3 = " << quad.v3_ << ", "
+#endif
+                  << "geomID = " << quad.geomIDs << ", primID = " << quad.primIDs << " )";
+    }
+
+  protected:
+#if !defined(EMBREE_COMPACT_POLYS)
+    vuint<M> v0_;         // 4 byte offset of 1st vertex
+    vuint<M> v1_;         // 4 byte offset of 2nd vertex
+    vuint<M> v2_;         // 4 byte offset of 3rd vertex
+    vuint<M> v3_;         // 4 byte offset of 4th vertex
+#endif
+    vuint<M> geomIDs;    // geometry ID of mesh
+    vuint<M> primIDs;    // primitive ID of primitive inside mesh
+  };
+
+  namespace isa
+  {
+    
+  template<int M>
+    struct QuadMi : public embree::QuadMi<M>
+  {
+#if !defined(EMBREE_COMPACT_POLYS)
+    using embree::QuadMi<M>::v0_;
+    using embree::QuadMi<M>::v1_;
+    using embree::QuadMi<M>::v2_;
+    using embree::QuadMi<M>::v3_;
+#endif
+    using embree::QuadMi<M>::geomIDs;
+    using embree::QuadMi<M>::primIDs;
+    using embree::QuadMi<M>::geomID;
+    using embree::QuadMi<M>::primID;
+    using embree::QuadMi<M>::valid;
+    
+    template<int vid>
+    __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+      const QuadMesh::Quad& quad = mesh->quad(primID(index));
+      return (Vec3f) mesh->vertices[0][quad.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const float* vertices = scene->vertices[geomID(index)];
+      return (Vec3f&) vertices[v[index]];
+#endif
+    }
+
+    template<int vid, typename T>
+    __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+      const QuadMesh::Quad& quad = mesh->quad(primID(index));
+      const Vec3fa v0 = mesh->vertices[itime+0][quad.v[vid]];
+      const Vec3fa v1 = mesh->vertices[itime+1][quad.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+      const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0);
+      const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1);
+      const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+      const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+      const Vec3<T> p0(v0.x,v0.y,v0.z);
+      const Vec3<T> p1(v1.x,v1.y,v1.z);
+      return lerp(p0,p1,ftime);
+    }
+
+    template<int vid, int K, typename T>
+    __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const
+    {
+      Vec3<T> p0, p1;
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+
+      for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask))
+      {
+#if defined(EMBREE_COMPACT_POLYS)
+        const QuadMesh::Quad& quad = mesh->quad(primID(index));
+        const Vec3fa v0 = mesh->vertices[itime[i]+0][quad.v[vid]];
+        const Vec3fa v1 = mesh->vertices[itime[i]+1][quad.v[vid]];
+#else
+        const vuint<M>& v = getVertexOffset<vid>();
+        const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0);
+        const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1);
+        const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+        const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+        p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z;
+        p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z;
+      }
+      return (T(one)-ftime)*p0 + ftime*p1;
+    }
+
+    struct Quad {
+      vfloat4 v0,v1,v2,v3;
+    };
+
+#if defined(EMBREE_COMPACT_POLYS)
+    
+    __forceinline Quad loadQuad(const int i, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero, zero };
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+      const QuadMesh::Quad& quad = mesh->quad(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices0[quad.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices0[quad.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices0[quad.v[2]];
+      const vfloat4 v3 = (vfloat4) mesh->vertices0[quad.v[3]];
+      return { v0, v1, v2, v3 };
+    }
+
+    __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero, zero };
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+      const QuadMesh::Quad& quad = mesh->quad(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices[itime][quad.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices[itime][quad.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices[itime][quad.v[2]];
+      const vfloat4 v3 = (vfloat4) mesh->vertices[itime][quad.v[3]];
+      return { v0, v1, v2, v3 };
+    }
+    
+#else
+
+    __forceinline Quad loadQuad(const int i, const Scene* const scene) const 
+    {
+      const float* vertices = scene->vertices[geomID(i)];
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]);
+      return { v0, v1, v2, v3 };
+    }
+
+    __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+      const float* vertices = (const float*) mesh->vertexPtr(0,itime);
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]);
+      return { v0, v1, v2, v3 };
+    }
+    
+#endif
+
+    /* Gather the quads */
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              Vec3vf<M>& p3,
+                              const Scene *const scene) const;
+
+#if defined(__AVX512F__)
+    __forceinline void gather(Vec3vf16& p0,
+                              Vec3vf16& p1,
+                              Vec3vf16& p2,
+                              Vec3vf16& p3,
+                              const Scene *const scene) const;
+#endif
+
+    template<int K>
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019
+    __noinline
+#else
+    __forceinline
+#endif
+    void gather(const vbool<K>& valid,
+      Vec3vf<K>& p0,
+      Vec3vf<K>& p1,
+      Vec3vf<K>& p2,
+      Vec3vf<K>& p3,
+      const size_t index,
+      const Scene* const scene,
+      const vfloat<K>& time) const
+    {
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+
+      vfloat<K> ftime;
+      const vint<K> itime = mesh->timeSegment<K>(time, ftime);
+
+      const size_t first = bsf(movemask(valid));
+      if (likely(all(valid,itime[first] == itime)))
+      {
+        p0 = getVertex<0>(index, scene, itime[first], ftime);
+        p1 = getVertex<1>(index, scene, itime[first], ftime);
+        p2 = getVertex<2>(index, scene, itime[first], ftime);
+        p3 = getVertex<3>(index, scene, itime[first], ftime);
+      }
+      else
+      {
+        p0 = getVertex<0,K>(valid, index, scene, itime, ftime);
+        p1 = getVertex<1,K>(valid, index, scene, itime, ftime);
+        p2 = getVertex<2,K>(valid, index, scene, itime, ftime);
+        p3 = getVertex<3,K>(valid, index, scene, itime, ftime);
+      }
+    }
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              Vec3vf<M>& p3,
+                              const QuadMesh* mesh,
+                              const Scene *const scene,
+                              const int itime) const;
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              Vec3vf<M>& p3,
+                              const Scene *const scene,
+                              const float time) const;
+
+    /* Updates the primitive */
+    __forceinline BBox3fa update(QuadMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M; i++)
+      {
+        if (!valid(i)) break;
+        const unsigned primId = primID(i);
+        const QuadMesh::Quad& q = mesh->quad(primId);
+        const Vec3fa p0 = mesh->vertex(q.v[0]);
+        const Vec3fa p1 = mesh->vertex(q.v[1]);
+        const Vec3fa p2 = mesh->vertex(q.v[2]);
+        const Vec3fa p3 = mesh->vertex(q.v[3]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3)));
+      }
+      return bounds;
+    }
+
+  private:
+#if !defined(EMBREE_COMPACT_POLYS)
+    template<int N> const vuint<M>& getVertexOffset() const;
+#endif
+  };
+
+#if !defined(EMBREE_COMPACT_POLYS)
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<0>() const { return v0_; }
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<1>() const { return v1_; }
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<2>() const { return v2_; }
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<3>() const { return v3_; }
+#endif
+
+  template<>
+  __forceinline void QuadMi<4>::gather(Vec3vf4& p0,
+                                       Vec3vf4& p1,
+                                       Vec3vf4& p2,
+                                       Vec3vf4& p3,
+                                       const Scene *const scene) const
+  {
+    prefetchL1(((char*)this)+0*64);
+    prefetchL1(((char*)this)+1*64);
+    const Quad tri0 = loadQuad(0,scene);
+    const Quad tri1 = loadQuad(1,scene);
+    const Quad tri2 = loadQuad(2,scene);
+    const Quad tri3 = loadQuad(3,scene);
+    transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z);
+    transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+    transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+    transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z);
+  }
+
+  template<>
+  __forceinline void QuadMi<4>::gather(Vec3vf4& p0,
+                                       Vec3vf4& p1,
+                                       Vec3vf4& p2,
+                                       Vec3vf4& p3,
+                                       const QuadMesh* mesh,
+                                       const Scene *const scene,
+                                       const int itime) const
+  {
+    // FIXME: for trianglei there all geometries are identical, is this the case here too?
+    
+    const Quad tri0 = loadQuad(0,itime,scene);
+    const Quad tri1 = loadQuad(1,itime,scene);
+    const Quad tri2 = loadQuad(2,itime,scene);
+    const Quad tri3 = loadQuad(3,itime,scene);
+    transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z);
+    transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+    transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+    transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z);
+  }
+
+  template<>
+  __forceinline void QuadMi<4>::gather(Vec3vf4& p0,
+                                       Vec3vf4& p1,
+                                       Vec3vf4& p2,
+                                       Vec3vf4& p3,
+                                       const Scene *const scene,
+                                       const float time) const
+  {
+    const QuadMesh* mesh = scene->get<QuadMesh>(geomID(0)); // in mblur mode all geometries are identical
+
+    float ftime;
+    const int itime = mesh->timeSegment(time, ftime);
+
+    Vec3vf4 a0,a1,a2,a3; gather(a0,a1,a2,a3,mesh,scene,itime);
+    Vec3vf4 b0,b1,b2,b3; gather(b0,b1,b2,b3,mesh,scene,itime+1);
+    p0 = lerp(a0,b0,vfloat4(ftime));
+    p1 = lerp(a1,b1,vfloat4(ftime));
+    p2 = lerp(a2,b2,vfloat4(ftime));
+    p3 = lerp(a3,b3,vfloat4(ftime));
+  }
+  }
+
+  template<int M>
+  typename QuadMi<M>::Type QuadMi<M>::type;
+
+  typedef QuadMi<4> Quad4i;
+}
diff --git a/thirdparty/embree/kernels/geometry/quadi_intersector.h b/thirdparty/embree/kernels/geometry/quadi_intersector.h
new file mode 100644
index 0000000000..20a98c3406
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/quadi_intersector.h
@@ -0,0 +1,350 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadi.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMiIntersector1Moeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M triangles with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMiIntersectorKMoeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMiIntersector1Pluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M triangles with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMiIntersectorKPluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M motion blur quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMiMBIntersector1Moeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M motion blur quads with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMiMBIntersectorKMoeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+          pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+          if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M motion blur quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMiMBIntersector1Pluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M motion blur quads with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMiMBIntersectorKPluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+          pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+          if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/quadv.h b/thirdparty/embree/kernels/geometry/quadv.h
new file mode 100644
index 0000000000..2137356ff2
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/quadv.h
@@ -0,0 +1,165 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Stores the vertices of M quads in struct of array layout */
+  template <int M>
+  struct QuadMv
+  { 
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored quads */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+   
+  public:
+
+    /* Default constructor */
+    __forceinline QuadMv() {}
+
+    /* Construction from vertices and IDs */
+    __forceinline QuadMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(v0), v1(v1), v2(v2), v3(v3), geomIDs(geomIDs), primIDs(primIDs) {}
+    
+    /* Returns a mask that tells which quads are valid */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns true if the specified quad is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+    /* Returns the number of stored quads */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M> primID()       { return primIDs; }
+    __forceinline const vuint<M> primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the quads */
+    __forceinline BBox3fa bounds() const 
+    {
+      Vec3vf<M> lower = min(v0,v1,v2,v3);
+      Vec3vf<M> upper = max(v0,v1,v2,v3);
+      vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+                     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+    
+    /* Non temporal store */
+    __forceinline static void store_nt(QuadMv* dst, const QuadMv& src)
+    {
+      vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+      vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+      vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+      vfloat<M>::store_nt(&dst->v1.x,src.v1.x);
+      vfloat<M>::store_nt(&dst->v1.y,src.v1.y);
+      vfloat<M>::store_nt(&dst->v1.z,src.v1.z);
+      vfloat<M>::store_nt(&dst->v2.x,src.v2.x);
+      vfloat<M>::store_nt(&dst->v2.y,src.v2.y);
+      vfloat<M>::store_nt(&dst->v2.z,src.v2.z);
+      vfloat<M>::store_nt(&dst->v3.x,src.v3.x);
+      vfloat<M>::store_nt(&dst->v3.y,src.v3.y);
+      vfloat<M>::store_nt(&dst->v3.z,src.v3.z);
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill quad from quad list */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+	const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const QuadMesh* __restrict__ const mesh = scene->get<QuadMesh>(geomID);
+        const QuadMesh::Quad& quad = mesh->quad(primID);
+        const Vec3fa& p0 = mesh->vertex(quad.v[0]);
+        const Vec3fa& p1 = mesh->vertex(quad.v[1]);
+        const Vec3fa& p2 = mesh->vertex(quad.v[2]);
+        const Vec3fa& p3 = mesh->vertex(quad.v[3]);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+      }
+      QuadMv::store_nt(this,QuadMv(v0,v1,v2,v3,vgeomID,vprimID));
+    }
+
+    /* Updates the primitive */
+    __forceinline BBox3fa update(QuadMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+	
+      for (size_t i=0; i<M; i++)
+      {
+        if (primID(i) == -1) break;
+        const unsigned geomId = geomID(i);
+        const unsigned primId = primID(i);
+        const QuadMesh::Quad& quad = mesh->quad(primId);
+        const Vec3fa p0 = mesh->vertex(quad.v[0]);
+        const Vec3fa p1 = mesh->vertex(quad.v[1]);
+        const Vec3fa p2 = mesh->vertex(quad.v[2]);
+        const Vec3fa p3 = mesh->vertex(quad.v[3]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3)));
+        vgeomID [i] = geomId;
+        vprimID [i] = primId;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+      }
+      new (this) QuadMv(v0,v1,v2,v3,vgeomID,vprimID);
+      return bounds;
+    }
+   
+  public:
+    Vec3vf<M> v0;      // 1st vertex of the quads
+    Vec3vf<M> v1;      // 2nd vertex of the quads
+    Vec3vf<M> v2;      // 3rd vertex of the quads
+    Vec3vf<M> v3;      // 4rd vertex of the quads
+  private:
+    vuint<M> geomIDs; // geometry ID
+    vuint<M> primIDs; // primitive ID
+  };
+
+  template<int M>
+  typename QuadMv<M>::Type QuadMv<M>::type;
+
+  typedef QuadMv<4> Quad4v;
+}
diff --git a/thirdparty/embree/kernels/geometry/quadv_intersector.h b/thirdparty/embree/kernels/geometry/quadv_intersector.h
new file mode 100644
index 0000000000..9b28e05614
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/quadv_intersector.h
@@ -0,0 +1,181 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadv.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMvIntersector1Moeller
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+        
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+        
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M triangles with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMvIntersectorKMoeller
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i);
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i);
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMvIntersector1Pluecker
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+        
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+        
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M triangles with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMvIntersectorKPluecker
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i);
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i);
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+    };
+  }
+}
+
diff --git a/thirdparty/embree/kernels/geometry/roundline_intersector.h b/thirdparty/embree/kernels/geometry/roundline_intersector.h
new file mode 100644
index 0000000000..0e9393442b
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/roundline_intersector.h
@@ -0,0 +1,715 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+
+/*
+  
+  This file implements the intersection of a ray with a round linear
+  curve segment. We define the geometry of such a round linear curve
+  segment from point p0 with radius r0 to point p1 with radius r1
+  using the cone that touches spheres p0/r0 and p1/r1 tangentially
+  plus the sphere p1/r1. We denote the tangentially touching cone from
+  p0/r0 to p1/r1 with cone(p0,r0,p1,r1) and the cone plus the ending
+  sphere with cone_sphere(p0,r0,p1,r1).
+
+  For multiple connected round linear curve segments this construction
+  yield a proper shape when viewed from the outside. Using the
+  following CSG we can also handle the interiour in most common cases:
+
+     round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) =
+       cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr)
+
+  Thus by subtracting the neighboring cone geometries, we cut away
+  parts of the center cone_sphere surface which lie inside the
+  combined curve. This approach works as long as geometry of the
+  current cone_sphere penetrates into direct neighbor segments only,
+  and not into segments further away.
+  
+  To construct a cone that touches two spheres at p0 and p1 with r0
+  and r1, one has to increase the cone radius at r0 and r1 to obtain
+  larger radii w0 and w1, such that the infinite cone properly touches
+  the spheres.  From the paper "Ray Tracing Generalized Tube
+  Primitives: Method and Applications"
+  (https://www.researchgate.net/publication/334378683_Ray_Tracing_Generalized_Tube_Primitives_Method_and_Applications)
+  one can derive the following equations for these increased
+  radii:
+
+     sr = 1.0f / sqrt(1-sqr(dr)/sqr(p1-p0))
+     w0 = sr*r0
+     w1 = sr*r1
+
+  Further, we want the cone to start where it touches the sphere at p0
+  and to end where it touches sphere at p1.  Therefore, we need to
+  construct clipping locations y0 and y1 for the start and end of the
+  cone. These start and end clipping location of the cone can get
+  calculated as:
+
+     Y0 =               - r0 * (r1-r0) / length(p1-p0)
+     Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0)
+
+  Where the cone starts a distance Y0 and ends a distance Y1 away of
+  point p0 along the cone center. The distance between Y1-Y0 can get
+  calculated as:
+
+    dY = length(p1-p0) - (r1-r0)^2 / length(p1-p0)
+
+  In the code below, Y will always be scaled by length(p1-p0) to
+  obtain y and you will find the terms r0*(r1-r0) and
+  (p1-p0)^2-(r1-r0)^2.
+
+ */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct RoundLineIntersectorHitM
+      {
+        __forceinline RoundLineIntersectorHitM() {}
+        
+        __forceinline RoundLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+	
+        __forceinline void finalize() {}
+	
+        __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+        __forceinline float t  (const size_t i) const { return vt[i]; }
+        __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+        __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); }
+        __forceinline vfloat<M> t () const { return vt; }
+        __forceinline Vec3vf<M> Ng() const { return vNg; }
+       
+      public:
+        vfloat<M> vu;
+        vfloat<M> vv;
+        vfloat<M> vt;
+        Vec3vf<M> vNg;
+      };
+    
+    namespace __roundline_internal
+    {
+      template<int M>
+        struct ConeGeometry
+        {
+          ConeGeometry (const Vec4vf<M>& a, const Vec4vf<M>& b)
+          : p0(a.xyz()), p1(b.xyz()), dP(p1-p0), dPdP(dot(dP,dP)), r0(a.w), sqr_r0(sqr(r0)), r1(b.w), dr(r1-r0), drdr(dr*dr), r0dr (r0*dr), g(dPdP - drdr) {}
+          
+          /* 
+             
+             This function tests if a point is accepted by first cone
+             clipping plane.
+
+             First, we need to project the point onto the line p0->p1:
+             
+               Y = (p-p0)*(p1-p0)/length(p1-p0)
+             
+             This value y is the distance to the projection point from
+             p0. The clip distances are calculated as:
+             
+               Y0 =               - r0 * (r1-r0) / length(p1-p0)
+               Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0)
+             
+             Thus to test if the point p is accepted by the first
+             clipping plane we need to test Y > Y0 and to test if it
+             is accepted by the second clipping plane we need to test
+             Y < Y1.
+             
+             By multiplying the calculations with length(p1-p0) these
+             calculation can get simplied to:
+             
+               y = (p-p0)*(p1-p0)
+               y0 =           - r0 * (r1-r0)
+               y1 = (p1-p0)^2 - r1 * (r1-r0)
+
+             and the test y > y0 and y < y1.
+             
+          */
+          
+          __forceinline vbool<M> isClippedByPlane (const vbool<M>& valid_i, const Vec3vf<M>& p) const
+          {
+            const Vec3vf<M> p0p = p - p0;
+            const vfloat<M> y = dot(p0p,dP);
+            const vfloat<M> cap0 = -r0dr;
+            const vbool<M> inside_cone = y > cap0;
+            return valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf)) & inside_cone;
+          }
+          
+          /* 
+             
+             This function tests whether a point lies inside the capped cone
+             tangential to its ending spheres.
+
+             Therefore one has to check if the point is inside the
+             region defined by the cone clipping planes, which is
+             performed similar as in the previous function.
+             
+             To perform the inside cone test we need to project the
+             point onto the line p0->p1:
+             
+               dP = p1-p0
+               Y = (p-p0)*dP/length(dP)
+                           
+             This value Y is the distance to the projection point from
+             p0. To obtain a parameter value u going from 0 to 1 along
+             the line p0->p1 we calculate:
+             
+               U = Y/length(dP)
+             
+             The radii to use at points p0 and p1 are:
+             
+               w0 = sr * r0
+               w1 = sr * r1
+               dw = w1-w0
+             
+             Using these radii and u one can directly test if the point
+             lies inside the cone using the formula dP*dP < wy*wy with:
+             
+               wy = w0 + u*dw
+               py = p0 + u*dP - p
+                          
+             By multiplying the calculations with length(p1-p0) and
+             inserting the definition of w can obtain simpler equations:
+             
+               y = (p-p0)*dP
+               ry = r0 + y/dP^2 * dr
+               wy = sr*ry        
+               py = p0 + y/dP^2*dP - p
+               y0 =      - r0 * dr
+               y1 = dP^2 - r1 * dr
+             
+             Thus for the in-cone test we get:
+             
+                    py^2 < wy^2
+               <=>  py^2 < sr^2 * ry^2
+               <=>  py^2 * ( dP^2 - dr^2 ) < dP^2 * ry^2
+             
+             This can further get simplified to:
+             
+               (p0-p)^2 * (dP^2 - dr^2) - y^2 < dP^2 * r0^2 + 2.0f*r0*dr*y;            
+                      
+          */
+          
+          __forceinline vbool<M> isInsideCappedCone (const vbool<M>& valid_i, const Vec3vf<M>& p) const
+          {
+            const Vec3vf<M> p0p = p - p0;
+            const vfloat<M> y = dot(p0p,dP);
+            const vfloat<M> cap0 = -r0dr+vfloat<M>(ulp);
+            const vfloat<M> cap1 = -r1*dr + dPdP;
+            
+            vbool<M> inside_cone = valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf));
+            inside_cone &= y > cap0;  // start clipping plane
+            inside_cone &= y < cap1;  // end clipping plane 
+            inside_cone &= sqr(p0p)*g - sqr(y) < dPdP * sqr_r0 + 2.0f*r0dr*y; // in cone test
+            return inside_cone;
+          }
+          
+        protected:
+          Vec3vf<M> p0;
+          Vec3vf<M> p1;
+          Vec3vf<M> dP;
+          vfloat<M> dPdP;
+          vfloat<M> r0;
+          vfloat<M> sqr_r0;
+          vfloat<M> r1;
+          vfloat<M> dr;
+          vfloat<M> drdr;
+          vfloat<M> r0dr;
+          vfloat<M> g;
+        };
+      
+      template<int M>
+        struct ConeGeometryIntersector : public ConeGeometry<M>
+      {
+        using ConeGeometry<M>::p0;
+        using ConeGeometry<M>::p1;
+        using ConeGeometry<M>::dP;
+        using ConeGeometry<M>::dPdP;
+        using ConeGeometry<M>::r0;
+        using ConeGeometry<M>::sqr_r0;
+        using ConeGeometry<M>::r1;
+        using ConeGeometry<M>::dr;
+        using ConeGeometry<M>::r0dr;
+        using ConeGeometry<M>::g;
+        
+        ConeGeometryIntersector (const Vec3vf<M>& ray_org, const Vec3vf<M>& ray_dir, const vfloat<M>& dOdO, const vfloat<M>& rcp_dOdO, const Vec4vf<M>& a, const Vec4vf<M>& b)
+          : ConeGeometry<M>(a,b), org(ray_org), O(ray_org-p0), dO(ray_dir),  dOdO(dOdO), rcp_dOdO(rcp_dOdO), OdP(dot(dP,O)), dOdP(dot(dP,dO)),  yp(OdP + r0dr) {}
+        
+        /*
+          
+          This function intersects a ray with a cone that touches a
+          start sphere p0/r0 and end sphere p1/r1.
+          
+          To find this ray/cone intersections one could just
+          calculate radii w0 and w1 as described above and use a
+          standard ray/cone intersection routine with these
+          radii. However, it turns out that calculations can get
+          simplified when deriving a specialized ray/cone
+          intersection for this special case. We perform
+          calculations relative to the cone origin p0 and define:
+            
+            O  = ray_org - p0
+            dO = ray_dir
+            dP = p1-p0
+            dr = r1-r0
+            dw = w1-w0
+            
+          For some t we can compute the potential hit point h = O + t*dO and
+          project it onto the cone vector dP to obtain u = (h*dP)/(dP*dP). In
+          case of an intersection, the squared distance from the hit point
+          projected onto the cone center line to the hit point should be equal
+          to the squared cone radius at u:
+            
+            (u*dP - h)^2 = (w0 + u*dw)^2
+           
+          Inserting the definition of h, u, w0, and dw into this formula, then
+          factoring out all terms, and sorting by t^2, t^1, and t^0 terms
+          yields a quadratic equation to solve.
+            
+          Inserting u:
+            ( (h*dP)*dP/dP^2 - h )^2 = ( w0 + (h*dP)*dw/dP^2 )^2
+            
+          Multiplying by dP^4:
+            ( (h*dP)*dP - h*dP^2 )^2 = ( w0*dP^2 + (h*dP)*dw )^2
+            
+          Inserting w0 and dw:
+            ( (h*dP)*dP - h*dP^2 )^2 = ( r0*dP^2 + (h*dP)*dr )^2 / (1-dr^2/dP^2)
+            ( (h*dP)*dP - h*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (h*dP)*dr )^2
+            
+          Now one can insert the definition of h, factor out, and presort by t:
+            ( ((O + t*dO)*dP)*dP - (O + t*dO)*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + ((O + t*dO)*dP)*dr )^2
+            ( (O*dP)*dP-O*dP^2 + t*( (dO*dP)*dP - dO*dP^2 ) )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (O*dP)*dr + t*(dO*dP)*dr )^2
+            
+          Factoring out further and sorting by t^2, t^1 and t^0 yields:
+            
+            0 =   t^2 * [ ((dO*dP)*dP - dO-dP^2)^2 * (dP^2 - dr^2) - dP^2*(dO*dP)^2*dr^2 ]
+              + 2*t^1 * [ ((O*dP)*dP - O*dP^2) * ((dO*dP)*dP - dO*dP^2) * (dP^2 - dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)*(dO*dP)*dr ]
+              +   t^0 * [ ( (O*dP)*dP - O*dP^2)^2 * (dP^2-dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)^2 ]
+            
+          This can be simplified to:
+            
+             0 =   t^2 * [ (dP^2 - dr^2)*dO^2 - (dO*dP)^2 ]
+               + 2*t^1 * [ (dP^2 - dr^2)*(O*dO) - (dO*dP)*(O*dP + r0*dr) ]
+               +   t^0 * [ (dP^2 - dr^2)*O^2 - (O*dP)^2 - r0^2*dP^2 - 2.0f*r0*dr*(O*dP) ]
+            
+          Solving this quadratic equation yields the values for t at which the
+          ray intersects the cone.
+          
+        */
+        
+        __forceinline bool intersectCone(vbool<M>& valid, vfloat<M>& lower, vfloat<M>& upper)
+        {
+          /* return no hit by default */
+          lower = pos_inf;
+          upper = neg_inf;
+          
+          /* compute quadratic equation A*t^2 + B*t + C = 0 */
+          const vfloat<M> OO = dot(O,O);
+          const vfloat<M> OdO = dot(dO,O);
+          const vfloat<M> A = g * dOdO - sqr(dOdP);
+          const vfloat<M> B = 2.0f * (g*OdO - dOdP*yp);
+          const vfloat<M> C = g*OO - sqr(OdP) - sqr_r0*dPdP - 2.0f*r0dr*OdP;
+          
+          /* we miss the cone if determinant is smaller than zero */
+          const vfloat<M> D = B*B - 4.0f*A*C;
+          valid &= (D >= 0.0f & g > 0.0f);  // if g <= 0 then the cone is inside a sphere end
+          
+          /* When rays are parallel to the cone surface, then the
+           * ray may be inside or outside the cone. We just assume a
+           * miss in that case, which is fine as rays inside the
+           * cone would anyway hit the ending spheres in that
+           * case. */
+          valid &= abs(A) > min_rcp_input;
+          if (unlikely(none(valid))) {
+            return false;
+          }
+          
+          /* compute distance to front and back hit */
+          const vfloat<M> Q = sqrt(D);
+          const vfloat<M> rcp_2A = rcp(2.0f*A);
+          t_cone_front = (-B-Q)*rcp_2A;
+          y_cone_front = yp + t_cone_front*dOdP;
+          lower = select( (y_cone_front > -(float)ulp) & (y_cone_front <= g) & (g > 0.0f), t_cone_front, vfloat<M>(pos_inf));
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+          t_cone_back = (-B+Q)*rcp_2A;
+          y_cone_back  = yp + t_cone_back *dOdP;
+          upper = select( (y_cone_back  > -(float)ulp) & (y_cone_back  <= g) & (g > 0.0f), t_cone_back , vfloat<M>(neg_inf));
+#endif          
+          return true;
+        }
+        
+        /* 
+           This function intersects the ray with the end sphere at
+           p1. We already clip away hits that are inside the
+           neighboring cone segment.
+           
+        */
+        
+        __forceinline void intersectEndSphere(vbool<M>& valid, 
+                                              const ConeGeometry<M>& coneR, 
+                                              vfloat<M>& lower, vfloat<M>& upper)
+        {
+          /* calculate front and back hit with end sphere */
+          const Vec3vf<M> O1 = org - p1;
+          const vfloat<M> O1dO = dot(O1,dO);
+          const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r1));
+          const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) );
+          
+          /* clip away front hit if it is inside next cone segment */
+          t_sph1_front = (-O1dO - rhs1)*rcp_dOdO;
+          const Vec3vf<M> hit_front = org + t_sph1_front*dO;
+          vbool<M> valid_sph1_front = h2 >= 0.0f & yp + t_sph1_front*dOdP > g & !coneR.isClippedByPlane (valid, hit_front);
+          lower = select(valid_sph1_front, t_sph1_front, vfloat<M>(pos_inf));
+          
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          /* clip away back hit if it is inside next cone segment */
+          t_sph1_back  = (-O1dO + rhs1)*rcp_dOdO;
+          const Vec3vf<M> hit_back = org + t_sph1_back*dO;
+          vbool<M> valid_sph1_back  = h2 >= 0.0f & yp + t_sph1_back*dOdP > g & !coneR.isClippedByPlane (valid, hit_back);
+          upper = select(valid_sph1_back, t_sph1_back,  vfloat<M>(neg_inf));
+#else
+          upper = vfloat<M>(neg_inf);
+#endif
+        }
+
+        __forceinline void intersectBeginSphere(const vbool<M>& valid, 
+                                                vfloat<M>& lower, vfloat<M>& upper)
+        {
+          /* calculate front and back hit with end sphere */
+          const Vec3vf<M> O1 = org - p0;
+          const vfloat<M> O1dO = dot(O1,dO);
+          const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r0));
+          const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) );
+          
+          /* clip away front hit if it is inside next cone segment */
+          t_sph0_front = (-O1dO - rhs1)*rcp_dOdO;
+          vbool<M> valid_sph1_front = valid & h2 >= 0.0f & yp + t_sph0_front*dOdP < 0;
+          lower = select(valid_sph1_front, t_sph0_front, vfloat<M>(pos_inf));
+
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          /* clip away back hit if it is inside next cone segment */
+          t_sph0_back  = (-O1dO + rhs1)*rcp_dOdO;
+          vbool<M> valid_sph1_back  = valid & h2 >= 0.0f & yp + t_sph0_back*dOdP < 0;
+          upper = select(valid_sph1_back, t_sph0_back,  vfloat<M>(neg_inf));
+#else   
+          upper = vfloat<M>(neg_inf);
+#endif
+        }
+        
+        /* 
+           
+           This function calculates the geometry normal of some cone hit.
+           
+           For a given hit point h (relative to p0) with a cone
+           starting at p0 with radius w0 and ending at p1 with
+           radius w1 one normally calculates the geometry normal by
+           first calculating the parmetric u hit location along the
+           cone:
+           
+             u = dot(h,dP)/dP^2
+           
+           Using this value one can now directly calculate the
+           geometry normal by bending the connection vector (h-u*dP)
+           from hit to projected hit with some cone dependent value
+           dw/sqrt(dP^2) * normalize(dP):
+           
+             Ng = normalize(h-u*dP) - dw/length(dP) * normalize(dP)
+           
+           The length of the vector (h-u*dP) can also get calculated
+           by interpolating the radii as w0+u*dw which yields:
+           
+             Ng = (h-u*dP)/(w0+u*dw) - dw/dP^2 * dP
+           
+           Multiplying with (w0+u*dw) yield a scaled Ng':
+           
+             Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP
+           
+           Inserting the definition of w0 and dw and refactoring
+           yield a furhter scaled Ng'':
+           
+             Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP
+           
+           Now inserting the definition of u gives and multiplying
+           with the denominator yields:
+           
+             Ng''' = (dP^2-dr^2)*(dP^2*h-dot(h,dP)*dP) - (dP^2*r0+dot(h,dP)*dr)*dr*dP
+           
+           Factoring out, cancelling terms, dividing by dP^2, and
+           factoring again yields finally:
+           
+             Ng'''' = (dP^2-dr^2)*h - dP*(dot(h,dP) + r0*dr)
+           
+        */
+        
+        __forceinline Vec3vf<M> Ng_cone(const vbool<M>& front_hit) const
+        {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back);
+          const vfloat<M> t = select(front_hit, t_cone_front, t_cone_back);
+          const Vec3vf<M> h = O + t*dO;
+          return g*h-dP*y;
+#else
+          const Vec3vf<M> h = O + t_cone_front*dO;
+          return g*h-dP*y_cone_front;
+#endif
+        }
+        
+        /* compute geometry normal of sphere hit as the difference
+         * vector from hit point to sphere center */
+        
+        __forceinline Vec3vf<M> Ng_sphere1(const vbool<M>& front_hit) const
+        {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> t_sph1 = select(front_hit, t_sph1_front, t_sph1_back);
+          return org+t_sph1*dO-p1;
+#else 
+          return org+t_sph1_front*dO-p1;
+#endif
+        }
+
+        __forceinline Vec3vf<M> Ng_sphere0(const vbool<M>& front_hit) const
+        {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> t_sph0 = select(front_hit, t_sph0_front, t_sph0_back);
+          return org+t_sph0*dO-p0;
+#else
+          return org+t_sph0_front*dO-p0;
+#endif
+        }
+        
+        /* 
+           This function calculates the u coordinate of a
+           hit. Therefore we use the hit distance y (which is zero
+           at the first cone clipping plane) and divide by distance
+           g between the clipping planes.
+           
+        */
+        
+        __forceinline vfloat<M> u_cone(const vbool<M>& front_hit) const
+        {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back);
+          return clamp(y*rcp(g));
+#else
+          return clamp(y_cone_front*rcp(g));
+#endif
+        }
+        
+      private:
+        Vec3vf<M> org;
+        Vec3vf<M> O;
+        Vec3vf<M> dO;
+        vfloat<M> dOdO;
+        vfloat<M> rcp_dOdO;
+        vfloat<M> OdP;
+        vfloat<M> dOdP;
+        
+        /* for ray/cone intersection */
+      private:
+        vfloat<M> yp;
+        vfloat<M> y_cone_front;
+        vfloat<M> t_cone_front;
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+        vfloat<M> y_cone_back;
+        vfloat<M> t_cone_back;
+#endif
+        
+        /* for ray/sphere intersection */
+      private:
+        vfloat<M> t_sph1_front;
+        vfloat<M> t_sph0_front;
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+        vfloat<M> t_sph1_back;
+        vfloat<M> t_sph0_back;
+#endif
+      };
+      
+      
+      template<int M, typename Epilog, typename ray_tfar_func>
+        static __forceinline bool intersectConeSphere(const vbool<M>& valid_i,
+                                                      const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir, 
+                                                      const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar,
+                                                      const Vec4vf<M>& v0, const Vec4vf<M>& v1,
+                                                      const Vec4vf<M>& vL, const Vec4vf<M>& vR,
+                                                      const Epilog& epilog)
+      {         
+        vbool<M> valid = valid_i;
+        
+        /* move ray origin closer to make calculations numerically stable */
+        const vfloat<M> dOdO = sqr(ray_dir);
+        const vfloat<M> rcp_dOdO = rcp(dOdO);
+        const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz());
+        const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO;
+        const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir;
+        
+        /* intersect with cone from v0 to v1 */
+        vfloat<M> t_cone_lower, t_cone_upper;
+        ConeGeometryIntersector<M> cone (ray_org, ray_dir, dOdO, rcp_dOdO, v0, v1);
+        vbool<M> validCone = valid;
+        cone.intersectCone(validCone, t_cone_lower, t_cone_upper);
+
+        valid &= (validCone | (cone.g <= 0.0f));  // if cone is entirely in sphere end - check sphere
+        if (unlikely(none(valid)))
+          return false;
+        
+        /* cone hits inside the neighboring capped cones are inside the geometry and thus ignored */
+        const ConeGeometry<M> coneL (v0, vL);
+        const ConeGeometry<M> coneR (v1, vR);
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+        const Vec3vf<M> hit_lower = ray_org + t_cone_lower*ray_dir;
+        const Vec3vf<M> hit_upper = ray_org + t_cone_upper*ray_dir;
+        t_cone_lower = select (!coneL.isInsideCappedCone (validCone, hit_lower) & !coneR.isInsideCappedCone (validCone, hit_lower), t_cone_lower, vfloat<M>(pos_inf));
+        t_cone_upper = select (!coneL.isInsideCappedCone (validCone, hit_upper) & !coneR.isInsideCappedCone (validCone, hit_upper), t_cone_upper, vfloat<M>(neg_inf));
+#endif
+
+        /* intersect ending sphere */
+        vfloat<M> t_sph1_lower, t_sph1_upper;
+        vfloat<M> t_sph0_lower = vfloat<M>(pos_inf);
+        vfloat<M> t_sph0_upper = vfloat<M>(neg_inf);
+        cone.intersectEndSphere(valid, coneR, t_sph1_lower, t_sph1_upper);
+
+        const vbool<M> isBeginPoint = valid & (vL[0] == vfloat<M>(pos_inf));
+        if (unlikely(any(isBeginPoint))) {
+          cone.intersectBeginSphere (isBeginPoint, t_sph0_lower, t_sph0_upper);
+        }
+        
+        /* CSG union of cone and end sphere */
+        vfloat<M> t_sph_lower = min(t_sph0_lower, t_sph1_lower);
+        vfloat<M> t_cone_sphere_lower = min(t_cone_lower, t_sph_lower);
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+        vfloat<M> t_sph_upper = max(t_sph0_upper, t_sph1_upper);
+        vfloat<M> t_cone_sphere_upper = max(t_cone_upper, t_sph_upper);
+        
+        /* filter out hits that are not in tnear/tfar range */
+        const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf);
+        const vbool<M> valid_upper = valid & ray_tnear <= dt+t_cone_sphere_upper & dt+t_cone_sphere_upper <= ray_tfar() & t_cone_sphere_upper != vfloat<M>(neg_inf);
+        
+        /* check if there is a first hit */
+        const vbool<M> valid_first = valid_lower | valid_upper;
+        if (unlikely(none(valid_first)))
+          return false;
+        
+        /* construct first hit */
+        const vfloat<M> t_first = select(valid_lower, t_cone_sphere_lower, t_cone_sphere_upper);
+        const vbool<M> cone_hit_first = t_first == t_cone_lower | t_first == t_cone_upper;
+        const vbool<M> sph0_hit_first = t_first == t_sph0_lower | t_first == t_sph0_upper;
+        const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower)));
+        const vfloat<M> u_first  = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+        /* invoke intersection filter for first hit */
+        RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first);
+        const bool is_hit_first = epilog(valid_first, hit);
+        
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_cone_sphere_upper;
+        const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_cone_sphere_upper <= ray_tfar());
+        if (unlikely(none(valid_second)))
+          return is_hit_first;
+        
+        /* invoke intersection filter for second hit */
+        const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper;
+        const vbool<M> sph0_hit_second = t_second == t_sph0_lower | t_second == t_sph0_upper;
+        const Vec3vf<M> Ng_second = select(cone_hit_second, cone.Ng_cone(false), select (sph0_hit_second, cone.Ng_sphere0(false), cone.Ng_sphere1(false)));
+        const vfloat<M> u_second  = select(cone_hit_second, cone.u_cone(false), select (sph0_hit_second, vfloat<M>(zero), vfloat<M>(one)));
+
+        hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+        
+        return is_hit_first | is_hit_second;
+#else
+        /* filter out hits that are not in tnear/tfar range */
+        const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf);
+        
+        /* check if there is a valid hit */
+        if (unlikely(none(valid_lower)))
+          return false;
+        
+        /* construct first hit */
+        const vbool<M> cone_hit_first = t_cone_sphere_lower == t_cone_lower | t_cone_sphere_lower == t_cone_upper;
+        const vbool<M> sph0_hit_first = t_cone_sphere_lower == t_sph0_lower | t_cone_sphere_lower == t_sph0_upper;
+        const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower)));
+        const vfloat<M> u_first  = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+        /* invoke intersection filter for first hit */
+        RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_cone_sphere_lower,Ng_first);
+        const bool is_hit_first = epilog(valid_lower, hit);
+        
+        return is_hit_first;
+#endif
+      }
+      
+    } // end namespace __roundline_internal
+    
+    template<int M>
+      struct RoundLinearCurveIntersector1
+      {
+        typedef CurvePrecalculations1 Precalculations;
+
+        template<typename Ray>
+        struct ray_tfar {
+          Ray& ray;
+          __forceinline ray_tfar(Ray& ray) : ray(ray) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar; };
+        };
+	
+        template<typename Ray, typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            Ray& ray,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Vec4vf<M>& vLi, const Vec4vf<M>& vRi,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+          const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+          const vfloat<M> ray_tnear(ray.tnear());
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i);
+          const Vec4vf<M> vL = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vLi);
+          const Vec4vf<M> vR = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vRi);
+          return  __roundline_internal::intersectConeSphere<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar<Ray>(ray),v0,v1,vL,vR,epilog);
+        }
+      };
+    
+    template<int M, int K>
+      struct RoundLinearCurveIntersectorK
+      {
+        typedef CurvePrecalculationsK<K> Precalculations;
+        
+        struct ray_tfar {
+          RayK<K>& ray;
+          size_t k;
+          __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar[k]; };
+        };
+        
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            RayK<K>& ray, size_t k,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Vec4vf<M>& vLi, const Vec4vf<M>& vRi,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+          const vfloat<M> ray_tnear = ray.tnear()[k];
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i);
+          const Vec4vf<M> vL = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vLi);
+          const Vec4vf<M> vR = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vRi);
+          return __roundline_internal::intersectConeSphere<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,vL,vR,epilog);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/roundlinei_intersector.h b/thirdparty/embree/kernels/geometry/roundlinei_intersector.h
new file mode 100644
index 0000000000..29061d6475
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/roundlinei_intersector.h
@@ -0,0 +1,123 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "roundline_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, bool filter>
+    struct RoundLinearCurveMiIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+        const vbool<M> valid = line.valid();
+        RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+        const vbool<M> valid = line.valid();
+        return RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, bool filter>
+    struct RoundLinearCurveMiMBIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time());
+        const vbool<M> valid = line.valid();
+        RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time());
+        const vbool<M> valid = line.valid();
+        return RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct RoundLinearCurveMiIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+        const vbool<M> valid = line.valid();
+        RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+        const vbool<M> valid = line.valid();
+        return RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct RoundLinearCurveMiMBIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]);
+        const vbool<M> valid = line.valid();
+        RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]);
+        const vbool<M> valid = line.valid();
+        return RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/sphere_intersector.h b/thirdparty/embree/kernels/geometry/sphere_intersector.h
new file mode 100644
index 0000000000..2670f9762d
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/sphere_intersector.h
@@ -0,0 +1,183 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_points.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct SphereIntersectorHitM
+    {
+      __forceinline SphereIntersectorHitM() {}
+
+      __forceinline SphereIntersectorHitM(const vfloat<M>& t, const Vec3vf<M>& Ng)
+        : vt(t), vNg(Ng) {}
+
+      __forceinline void finalize() {}
+
+      __forceinline Vec2f uv(const size_t i) const {
+        return Vec2f(0.0f, 0.0f);
+      }
+      __forceinline float t(const size_t i) const {
+        return vt[i];
+      }
+      __forceinline Vec3fa Ng(const size_t i) const {
+        return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]);
+      }
+
+     public:
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+
+    template<int M>
+    struct SphereIntersector1
+    {
+      typedef CurvePrecalculations1 Precalculations;
+
+      template<typename Epilog>
+      static __forceinline bool intersect(
+          const vbool<M>& valid_i, Ray& ray,
+          const Precalculations& pre, const Vec4vf<M>& v0, const Epilog& epilog)
+      {
+        vbool<M> valid = valid_i;
+
+        const vfloat<M> rd2    = rcp(dot(ray.dir, ray.dir));
+        const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        const Vec3vf<M> c0     = center - ray_org;
+        const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+        const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
+        const vfloat<M> l2     = dot(perp, perp);
+        const vfloat<M> r2     = radius * radius;
+        valid &= (l2 <= r2);
+        if (unlikely(none(valid)))
+          return false;
+
+        const vfloat<M> td      = sqrt((r2 - l2) * rd2);
+        const vfloat<M> t_front = projC0 - td;
+        const vfloat<M> t_back  = projC0 + td;
+
+        const vbool<M> valid_front = valid & (ray.tnear() <= t_front) & (t_front <= ray.tfar);
+        const vbool<M> valid_back  = valid & (ray.tnear() <= t_back ) & (t_back  <= ray.tfar);
+
+        /* check if there is a first hit */
+        const vbool<M> valid_first = valid_front | valid_back;
+        if (unlikely(none(valid_first)))
+          return false;
+
+        /* construct first hit */
+        const vfloat<M> td_front = -td;
+        const vfloat<M> td_back  = +td;
+        const vfloat<M> t_first  = select(valid_front, t_front, t_back);
+        const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp;
+        SphereIntersectorHitM<M> hit(t_first, Ng_first);
+
+        /* invoke intersection filter for first hit */
+        const bool is_hit_first = epilog(valid_first, hit);
+                
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_back;
+        const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar);
+        if (unlikely(none(valid_second)))
+          return is_hit_first;
+
+        /* invoke intersection filter for second hit */
+        const Vec3vf<M> Ng_second = td_back * ray_dir - perp;
+        hit = SphereIntersectorHitM<M> (t_second, Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+        
+        return is_hit_first | is_hit_second;
+      }
+
+      template<typename Epilog>
+      static __forceinline bool intersect(
+        const vbool<M>& valid_i, Ray& ray, IntersectContext* context, const Points* geom,
+        const Precalculations& pre, const Vec4vf<M>& v0i, const Epilog& epilog)
+      {
+        const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+        return intersect(valid_i,ray,pre,v0,epilog);
+      }
+    };
+
+    template<int M, int K>
+    struct SphereIntersectorK
+    {
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i,
+                                          RayK<K>& ray, size_t k,
+                                          IntersectContext* context,
+                                          const Points* geom,
+                                          const Precalculations& pre,
+                                          const Vec4vf<M>& v0i,
+                                          const Epilog& epilog)
+      {
+        vbool<M> valid = valid_i;
+
+        const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+        const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+        const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir));
+
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        const Vec3vf<M> c0     = center - ray_org;
+        const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+        const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
+        const vfloat<M> l2     = dot(perp, perp);
+        const vfloat<M> r2     = radius * radius;
+        valid &= (l2 <= r2);
+        if (unlikely(none(valid)))
+          return false;
+
+        const vfloat<M> td      = sqrt((r2 - l2) * rd2);
+        const vfloat<M> t_front = projC0 - td;
+        const vfloat<M> t_back  = projC0 + td;
+
+        const vbool<M> valid_front = valid & (ray.tnear()[k] <= t_front) & (t_front <= ray.tfar[k]);
+        const vbool<M> valid_back  = valid & (ray.tnear()[k] <= t_back ) & (t_back  <= ray.tfar[k]);
+
+        /* check if there is a first hit */
+        const vbool<M> valid_first = valid_front | valid_back;
+        if (unlikely(none(valid_first)))
+          return false;
+
+        /* construct first hit */
+        const vfloat<M> td_front = -td;
+        const vfloat<M> td_back  = +td;
+        const vfloat<M> t_first  = select(valid_front, t_front, t_back);
+        const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp;
+        SphereIntersectorHitM<M> hit(t_first, Ng_first);
+
+        /* invoke intersection filter for first hit */
+        const bool is_hit_first = epilog(valid_first, hit);
+                
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_back;
+        const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar[k]);
+        if (unlikely(none(valid_second)))
+          return is_hit_first;
+
+        /* invoke intersection filter for second hit */
+        const Vec3vf<M> Ng_second = td_back * ray_dir - perp;
+        hit = SphereIntersectorHitM<M> (t_second, Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+        
+        return is_hit_first | is_hit_second;
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree/kernels/geometry/spherei_intersector.h b/thirdparty/embree/kernels/geometry/spherei_intersector.h
new file mode 100644
index 0000000000..7a0b428117
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/spherei_intersector.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "intersector_epilog.h"
+#include "pointi.h"
+#include "sphere_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, bool filter>
+    struct SphereMiIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre,
+                                          RayHit& ray,
+                                          IntersectContext* context,
+                                          const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> v0; sphere.gather(v0, geom);
+        const vbool<M> valid = sphere.valid();
+        SphereIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre,
+                                         Ray& ray,
+                                         IntersectContext* context,
+                                         const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> v0; sphere.gather(v0, geom);
+        const vbool<M> valid = sphere.valid();
+        return SphereIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query,
+                                           PointQueryContext* context,
+                                           const Primitive& sphere)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere);
+      }
+    };
+
+    template<int M, bool filter>
+    struct SphereMiMBIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre,
+                                          RayHit& ray,
+                                          IntersectContext* context,
+                                          const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> v0; sphere.gather(v0, geom, ray.time());
+        const vbool<M> valid = sphere.valid();
+        SphereIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre,
+                                         Ray& ray,
+                                         IntersectContext* context,
+                                         const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> v0; sphere.gather(v0, geom, ray.time());
+        const vbool<M> valid = sphere.valid();
+        return SphereIntersector1<M>::intersect(
+          valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID()));
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query,
+                                           PointQueryContext* context,
+                                           const Primitive& sphere)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct SphereMiIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> v0; sphere.gather(v0, geom);
+        const vbool<M> valid = sphere.valid();
+        SphereIntersectorK<M, K>::intersect(
+          valid, ray, k, context, geom, pre, v0,
+          Intersect1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+      }
+
+      static __forceinline bool occluded(
+          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> v0; sphere.gather(v0, geom);
+        const vbool<M> valid = sphere.valid();
+        return SphereIntersectorK<M, K>::intersect(
+          valid, ray, k, context, geom, pre, v0,
+          Occluded1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct SphereMiMBIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()[k]);
+        const vbool<M> valid = sphere.valid();
+        SphereIntersectorK<M, K>::intersect(
+          valid, ray, k, context, geom, pre, v0,
+          Intersect1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+      }
+
+      static __forceinline bool occluded(
+          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* geom = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()[k]);
+        const vbool<M> valid = sphere.valid();
+        return SphereIntersectorK<M, K>::intersect(
+          valid, ray, k, context, geom, pre, v0,
+          Occluded1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree/kernels/geometry/subdivpatch1.h b/thirdparty/embree/kernels/geometry/subdivpatch1.h
new file mode 100644
index 0000000000..ae0d4e2616
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/subdivpatch1.h
@@ -0,0 +1,38 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../geometry/primitive.h"
+#include "../subdiv/subdivpatch1base.h"
+
+namespace embree
+{
+
+  struct __aligned(64) SubdivPatch1 : public SubdivPatch1Base
+  {
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    
+    static Type type;
+
+  public:
+
+    /*! constructor for cached subdiv patch */
+    SubdivPatch1 (const unsigned int gID,
+                        const unsigned int pID,
+                        const unsigned int subPatch,
+                        const SubdivMesh *const mesh,
+                        const size_t time,
+                        const Vec2f uv[4],
+                        const float edge_level[4],
+                        const int subdiv[4],
+                        const int simd_width) 
+      : SubdivPatch1Base(gID,pID,subPatch,mesh,time,uv,edge_level,subdiv,simd_width) {}
+  };
+}
diff --git a/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h b/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h
new file mode 100644
index 0000000000..b4b15a1210
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h
@@ -0,0 +1,237 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subdivpatch1.h"
+#include "grid_soa.h"
+#include "grid_soa_intersector1.h"
+#include "grid_soa_intersector_packet.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename T>
+      class SubdivPatch1Precalculations : public T
+    { 
+    public:
+      __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr)
+        : T(ray,ptr) {}
+    };
+
+    template<int K, typename T>
+      class SubdivPatch1PrecalculationsK : public T
+    { 
+    public:
+      __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray)
+        : T(valid,ray) {}
+    };
+
+    class SubdivPatch1Intersector1
+    {
+    public:
+      typedef GridSOA Primitive;
+      typedef SubdivPatch1Precalculations<GridSOAIntersector1::Precalculations> Precalculations;
+
+      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        lazy_node = prim->root(0);
+        pre.grid = (Primitive*)prim;
+        return false;
+      }
+
+      /*! Intersect a ray with the primitive. */
+      template<int N, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) 
+      {
+        if (likely(ty == 0)) GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+
+      template<int N, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
+        intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+      
+      /*! Test if the ray is occluded by the primitive */
+      template<int N, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+
+      template<int N, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
+        return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+      
+      template<int N>
+        static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) 
+      {
+          // TODO: PointQuery implement
+          assert(false && "not implemented");
+          return false;
+      }
+
+      template<int N>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+        return pointQuery(This,query,context,prim,ty,tquery,lazy_node);
+      }
+    };
+
+    class SubdivPatch1MBIntersector1
+    {
+    public:
+      typedef SubdivPatch1 Primitive;
+      typedef GridSOAMBIntersector1::Precalculations Precalculations;
+      
+      static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node)
+      {
+        Primitive* prim = (Primitive*) prim_i;
+        GridSOA* grid = nullptr;
+        grid = (GridSOA*) prim->root_ref.get();
+        pre.itime = getTimeSegment(ray.time(), float(grid->time_steps-1), pre.ftime);
+        lazy_node = grid->root(pre.itime);
+        pre.grid = grid;
+        return false;
+      }
+
+      /*! Intersect a ray with the primitive. */
+      template<int N, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) 
+      {
+        if (likely(ty == 0)) GridSOAMBIntersector1::intersect(pre,ray,context,prim,lazy_node);
+        else                 processLazyNode(pre,ray,context,prim,lazy_node);
+      }
+
+      template<int N, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
+        intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+      
+      /*! Test if the ray is occluded by the primitive */
+      template<int N, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAMBIntersector1::occluded(pre,ray,context,prim,lazy_node);
+        else                 return processLazyNode(pre,ray,context,prim,lazy_node);
+      }
+
+      template<int N, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
+        return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+      
+      template<int N>
+        static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) 
+      {
+          // TODO: PointQuery implement
+          assert(false && "not implemented");
+          return false;
+      }
+
+      template<int N, bool robust>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+        return pointQuery(This,query,context,prim,ty,tquery,lazy_node);
+      }
+    };
+
+    template <int K>
+      struct SubdivPatch1IntersectorK
+    {
+      typedef GridSOA Primitive;
+      typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
+      
+      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        lazy_node = prim->root(0);
+        pre.grid = (Primitive*)prim;
+        return false;
+      }
+      
+      template<bool robust>        
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+      
+      template<bool robust>        
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+      
+      template<int N, bool robust>              
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+      
+      template<int N, bool robust>              
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+    };
+
+    typedef SubdivPatch1IntersectorK<4>  SubdivPatch1Intersector4;
+    typedef SubdivPatch1IntersectorK<8>  SubdivPatch1Intersector8;
+    typedef SubdivPatch1IntersectorK<16> SubdivPatch1Intersector16;
+
+    template <int K>
+      struct SubdivPatch1MBIntersectorK
+    {
+      typedef SubdivPatch1 Primitive;
+      //typedef GridSOAMBIntersectorK<K>::Precalculations Precalculations;
+      typedef SubdivPatch1PrecalculationsK<K,typename GridSOAMBIntersectorK<K>::Precalculations> Precalculations;
+      
+      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node)
+      {
+        Primitive* prim = (Primitive*) prim_i;
+        GridSOA* grid = (GridSOA*) prim->root_ref.get();
+        lazy_node = grid->troot;
+        pre.grid = grid;
+        return false;
+      }
+
+      template<bool robust>
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+
+      template<bool robust>
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+      
+      template<int N, bool robust>      
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+        else                 processLazyNode(pre,context,prim,lazy_node);
+      }
+      
+      template<int N, bool robust>      
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+        else                 return processLazyNode(pre,context,prim,lazy_node);
+      }
+    };
+
+    typedef SubdivPatch1MBIntersectorK<4>  SubdivPatch1MBIntersector4;
+    typedef SubdivPatch1MBIntersectorK<8>  SubdivPatch1MBIntersector8;
+    typedef SubdivPatch1MBIntersectorK<16> SubdivPatch1MBIntersector16;
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/subgrid.h b/thirdparty/embree/kernels/geometry/subgrid.h
new file mode 100644
index 0000000000..ce54421cab
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/subgrid.h
@@ -0,0 +1,517 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_grid_mesh.h"
+#include "../bvh/bvh.h"
+
+namespace embree
+{
+    /* Stores M quads from an indexed face set */
+      struct SubGrid
+      {
+        /* Virtual interface to query information about the quad type */
+        struct Type : public PrimitiveType
+        {
+          const char* name() const;
+          size_t sizeActive(const char* This) const;
+          size_t sizeTotal(const char* This) const;
+          size_t getBytes(const char* This) const;
+        };
+        static Type type;
+
+      public:
+
+        /* primitive supports multiple time segments */
+        static const bool singleTimeSegment = false;
+
+        /* Returns maximum number of stored quads */
+        static __forceinline size_t max_size() { return 1; }
+
+        /* Returns required number of primitive blocks for N primitives */
+        static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+      public:
+
+        /* Default constructor */
+        __forceinline SubGrid() {  }
+
+        /* Construction from vertices and IDs */
+        __forceinline SubGrid(const unsigned int x,
+                              const unsigned int y,
+                              const unsigned int geomID,
+                              const unsigned int primID)
+          : _x(x), _y(y), _geomID(geomID), _primID(primID)
+        {
+        }
+
+        __forceinline bool invalid3x3X() const { return (unsigned int)_x & (1<<15); }
+        __forceinline bool invalid3x3Y() const { return (unsigned int)_y & (1<<15); }
+
+        /* Gather the quads */
+        __forceinline void gather(Vec3vf4& p0,
+                                  Vec3vf4& p1,
+                                  Vec3vf4& p2,
+                                  Vec3vf4& p3,
+                                  const GridMesh* const mesh,
+                                  const GridMesh::Grid &g) const
+        {
+          /* first quad always valid */
+          const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t vtxID01 = vtxID00 + 1;
+          const vfloat4 vtx00  = vfloat4::loadu(mesh->vertexPtr(vtxID00));
+          const vfloat4 vtx01  = vfloat4::loadu(mesh->vertexPtr(vtxID01));
+          const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+          const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+          const vfloat4 vtx10  = vfloat4::loadu(mesh->vertexPtr(vtxID10));
+          const vfloat4 vtx11  = vfloat4::loadu(mesh->vertexPtr(vtxID11));
+
+          /* deltaX => vtx02, vtx12 */
+          const size_t deltaX  = invalid3x3X() ? 0 : 1;
+          const size_t vtxID02 = vtxID01 + deltaX;       
+          const vfloat4 vtx02  = vfloat4::loadu(mesh->vertexPtr(vtxID02));
+          const size_t vtxID12 = vtxID11 + deltaX;       
+          const vfloat4 vtx12  = vfloat4::loadu(mesh->vertexPtr(vtxID12));
+
+          /* deltaY => vtx20, vtx21 */
+          const size_t deltaY  = invalid3x3Y() ? 0 : g.lineVtxOffset;
+          const size_t vtxID20 = vtxID10 + deltaY;
+          const size_t vtxID21 = vtxID11 + deltaY;
+          const vfloat4 vtx20  = vfloat4::loadu(mesh->vertexPtr(vtxID20));
+          const vfloat4 vtx21  = vfloat4::loadu(mesh->vertexPtr(vtxID21));
+
+          /* deltaX/deltaY => vtx22 */
+          const size_t vtxID22 = vtxID11 + deltaX + deltaY;       
+          const vfloat4 vtx22  = vfloat4::loadu(mesh->vertexPtr(vtxID22));
+
+          transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z);
+          transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z);
+          transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z);
+          transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z);                    
+        }
+
+        template<typename T>
+        __forceinline vfloat4 getVertexMB(const GridMesh* const mesh, const size_t offset, const size_t itime, const float ftime) const
+        {
+          const T v0 = T::loadu(mesh->vertexPtr(offset,itime+0));
+          const T v1 = T::loadu(mesh->vertexPtr(offset,itime+1));
+          return lerp(v0,v1,ftime);
+        }
+
+        /* Gather the quads */
+        __forceinline void gatherMB(Vec3vf4& p0,
+                                    Vec3vf4& p1,
+                                    Vec3vf4& p2,
+                                    Vec3vf4& p3,
+                                    const GridMesh* const mesh,
+                                    const GridMesh::Grid &g,
+                                    const size_t itime, 
+                                    const float ftime) const
+        {
+          /* first quad always valid */
+          const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t vtxID01 = vtxID00 + 1;
+          const vfloat4 vtx00  = getVertexMB<vfloat4>(mesh,vtxID00,itime,ftime);
+          const vfloat4 vtx01  = getVertexMB<vfloat4>(mesh,vtxID01,itime,ftime);
+          const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+          const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+          const vfloat4 vtx10  = getVertexMB<vfloat4>(mesh,vtxID10,itime,ftime);
+          const vfloat4 vtx11  = getVertexMB<vfloat4>(mesh,vtxID11,itime,ftime);
+
+          /* deltaX => vtx02, vtx12 */
+          const size_t deltaX  = invalid3x3X() ? 0 : 1;
+          const size_t vtxID02 = vtxID01 + deltaX;       
+          const vfloat4 vtx02  = getVertexMB<vfloat4>(mesh,vtxID02,itime,ftime);
+          const size_t vtxID12 = vtxID11 + deltaX;       
+          const vfloat4 vtx12  = getVertexMB<vfloat4>(mesh,vtxID12,itime,ftime);
+
+          /* deltaY => vtx20, vtx21 */
+          const size_t deltaY  = invalid3x3Y() ? 0 : g.lineVtxOffset;
+          const size_t vtxID20 = vtxID10 + deltaY;
+          const size_t vtxID21 = vtxID11 + deltaY;
+          const vfloat4 vtx20  = getVertexMB<vfloat4>(mesh,vtxID20,itime,ftime);
+          const vfloat4 vtx21  = getVertexMB<vfloat4>(mesh,vtxID21,itime,ftime);
+
+          /* deltaX/deltaY => vtx22 */
+          const size_t vtxID22 = vtxID11 + deltaX + deltaY;       
+          const vfloat4 vtx22  = getVertexMB<vfloat4>(mesh,vtxID22,itime,ftime);
+
+          transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z);
+          transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z);
+          transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z);
+          transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z);                    
+        }
+
+
+
+        /* Gather the quads */
+        __forceinline void gather(Vec3vf4& p0,
+                                  Vec3vf4& p1,
+                                  Vec3vf4& p2,
+                                  Vec3vf4& p3,
+                                  const Scene *const scene) const
+        {
+          const GridMesh* const mesh = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g    = mesh->grid(primID());
+          gather(p0,p1,p2,p3,mesh,g);
+        }
+
+        /* Gather the quads in the motion blur case */
+        __forceinline void gatherMB(Vec3vf4& p0,
+                                    Vec3vf4& p1,
+                                    Vec3vf4& p2,
+                                    Vec3vf4& p3,
+                                    const Scene *const scene,
+                                    const size_t itime, 
+                                    const float ftime) const
+        {
+          const GridMesh* const mesh = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g    = mesh->grid(primID());
+          gatherMB(p0,p1,p2,p3,mesh,g,itime,ftime);
+        }
+
+        /* Gather the quads */
+        __forceinline void gather(Vec3fa vtx[16], const Scene *const scene) const
+        {
+          const GridMesh* mesh     = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g  = mesh->grid(primID());
+
+          /* first quad always valid */
+          const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t vtxID01 = vtxID00 + 1;
+          const Vec3fa vtx00  = Vec3fa::loadu(mesh->vertexPtr(vtxID00));
+          const Vec3fa vtx01  = Vec3fa::loadu(mesh->vertexPtr(vtxID01));
+          const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+          const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+          const Vec3fa vtx10  = Vec3fa::loadu(mesh->vertexPtr(vtxID10));
+          const Vec3fa vtx11  = Vec3fa::loadu(mesh->vertexPtr(vtxID11));
+
+          /* deltaX => vtx02, vtx12 */
+          const size_t deltaX  = invalid3x3X() ? 0 : 1;
+          const size_t vtxID02 = vtxID01 + deltaX;       
+          const Vec3fa vtx02  = Vec3fa::loadu(mesh->vertexPtr(vtxID02));
+          const size_t vtxID12 = vtxID11 + deltaX;       
+          const Vec3fa vtx12  = Vec3fa::loadu(mesh->vertexPtr(vtxID12));
+
+          /* deltaY => vtx20, vtx21 */
+          const size_t deltaY  = invalid3x3Y() ? 0 : g.lineVtxOffset;
+          const size_t vtxID20 = vtxID10 + deltaY;
+          const size_t vtxID21 = vtxID11 + deltaY;
+          const Vec3fa vtx20  = Vec3fa::loadu(mesh->vertexPtr(vtxID20));
+          const Vec3fa vtx21  = Vec3fa::loadu(mesh->vertexPtr(vtxID21));
+
+          /* deltaX/deltaY => vtx22 */
+          const size_t vtxID22 = vtxID11 + deltaX + deltaY;       
+          const Vec3fa vtx22  = Vec3fa::loadu(mesh->vertexPtr(vtxID22));
+
+          vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10;
+          vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11;
+          vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20;
+          vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21;
+        }
+
+        /* Gather the quads */
+        __forceinline void gatherMB(vfloat4 vtx[16], const Scene *const scene, const size_t itime, const float ftime) const
+        {
+          const GridMesh* mesh     = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g  = mesh->grid(primID());
+
+          /* first quad always valid */
+          const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t vtxID01 = vtxID00 + 1;
+          const vfloat4 vtx00  = getVertexMB<vfloat4>(mesh,vtxID00,itime,ftime);
+          const vfloat4 vtx01  = getVertexMB<vfloat4>(mesh,vtxID01,itime,ftime);
+          const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+          const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+          const vfloat4 vtx10  = getVertexMB<vfloat4>(mesh,vtxID10,itime,ftime);
+          const vfloat4 vtx11  = getVertexMB<vfloat4>(mesh,vtxID11,itime,ftime);
+
+          /* deltaX => vtx02, vtx12 */
+          const size_t deltaX  = invalid3x3X() ? 0 : 1;
+          const size_t vtxID02 = vtxID01 + deltaX;       
+          const vfloat4 vtx02  = getVertexMB<vfloat4>(mesh,vtxID02,itime,ftime);
+          const size_t vtxID12 = vtxID11 + deltaX;       
+          const vfloat4 vtx12  = getVertexMB<vfloat4>(mesh,vtxID12,itime,ftime);
+
+          /* deltaY => vtx20, vtx21 */
+          const size_t deltaY  = invalid3x3Y() ? 0 : g.lineVtxOffset;
+          const size_t vtxID20 = vtxID10 + deltaY;
+          const size_t vtxID21 = vtxID11 + deltaY;
+          const vfloat4 vtx20  = getVertexMB<vfloat4>(mesh,vtxID20,itime,ftime);
+          const vfloat4 vtx21  = getVertexMB<vfloat4>(mesh,vtxID21,itime,ftime);
+
+          /* deltaX/deltaY => vtx22 */
+          const size_t vtxID22 = vtxID11 + deltaX + deltaY;       
+          const vfloat4 vtx22  = getVertexMB<vfloat4>(mesh,vtxID22,itime,ftime);
+
+          vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10;
+          vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11;
+          vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20;
+          vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21;
+        }        
+          
+
+        /* Calculate the bounds of the subgrid */
+        __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+        {
+          BBox3fa bounds = empty;
+          FATAL("not implemented yet");
+          return bounds;
+        }
+
+        /* Calculate the linear bounds of the primitive */
+        __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime)
+        {
+          return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+        }
+
+        __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps)
+        {
+          LBBox3fa allBounds = empty;
+          FATAL("not implemented yet");
+          return allBounds;
+        }
+
+        __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range)
+        {
+          LBBox3fa allBounds = empty;
+          FATAL("not implemented yet");
+          return allBounds;
+        }
+
+
+        friend embree_ostream operator<<(embree_ostream cout, const SubGrid& sg) {
+          return cout << "SubGrid " << " ( x " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << " )";
+        }
+
+        __forceinline unsigned int geomID() const { return _geomID; }
+        __forceinline unsigned int primID() const { return _primID; }
+        __forceinline unsigned int x() const { return (unsigned int)_x & 0x7fff; }
+        __forceinline unsigned int y() const { return (unsigned int)_y & 0x7fff; }
+
+      private:
+        unsigned short _x;
+        unsigned short _y;
+        unsigned int _geomID;    // geometry ID of mesh
+        unsigned int _primID;    // primitive ID of primitive inside mesh
+      };
+
+      struct SubGridID {
+        unsigned short x;
+        unsigned short y;
+        unsigned int primID;
+        
+        __forceinline SubGridID() {}
+        __forceinline SubGridID(const unsigned int x, const unsigned int y, const unsigned int primID) :
+        x(x), y(y), primID(primID) {}        
+      };
+
+      /* QuantizedBaseNode as large subgrid leaf */
+      template<int N>
+      struct SubGridQBVHN
+      {
+        /* Virtual interface to query information about the quad type */
+        struct Type : public PrimitiveType
+        {
+          const char* name() const;
+          size_t sizeActive(const char* This) const;
+          size_t sizeTotal(const char* This) const;
+          size_t getBytes(const char* This) const;
+        };
+        static Type type;
+
+      public:
+
+        __forceinline size_t size() const
+        {
+          for (size_t i=0;i<N;i++)
+            if (primID(i) == -1) return i;
+          return N;
+        }
+
+      __forceinline void clear() {
+        for (size_t i=0;i<N;i++)
+          subgridIDs[i] = SubGridID(0,0,(unsigned int)-1);
+        qnode.clear();
+      }
+
+        /* Default constructor */
+        __forceinline SubGridQBVHN() {  }
+
+        /* Construction from vertices and IDs */
+        __forceinline SubGridQBVHN(const unsigned int x[N],
+                                   const unsigned int y[N],
+                                   const unsigned int primID[N],
+                                   const BBox3fa * const subGridBounds,
+                                   const unsigned int geomID,
+                                   const unsigned int items)
+        {
+          clear();
+          _geomID = geomID;
+
+          __aligned(64) typename BVHN<N>::AABBNode node;
+          node.clear();          
+          for (size_t i=0;i<items;i++)
+          {
+            subgridIDs[i] = SubGridID(x[i],y[i],primID[i]);
+            node.setBounds(i,subGridBounds[i]);
+          }
+          qnode.init_dim(node);
+        }
+
+        __forceinline unsigned int geomID() const { return _geomID; }
+        __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; }
+        __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; }
+        __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; }
+
+        __forceinline SubGrid subgrid(const size_t i) const {
+          assert(i < N);
+          assert(primID(i) != -1);
+          return SubGrid(x(i),y(i),geomID(),primID(i));
+        }
+
+      public:
+        SubGridID subgridIDs[N];
+
+        typename BVHN<N>::QuantizedBaseNode qnode;
+
+        unsigned int _geomID;    // geometry ID of mesh
+
+
+        friend embree_ostream operator<<(embree_ostream cout, const SubGridQBVHN& sg) {
+          cout << "SubGridQBVHN " << embree_endl;
+          for (size_t i=0;i<N;i++)
+            cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl;
+          cout << "geomID " << sg._geomID << embree_endl;
+          cout << "lowerX " << sg.qnode.dequantizeLowerX() << embree_endl;
+          cout << "upperX " << sg.qnode.dequantizeUpperX() << embree_endl;
+          cout << "lowerY " << sg.qnode.dequantizeLowerY() << embree_endl;
+          cout << "upperY " << sg.qnode.dequantizeUpperY() << embree_endl;
+          cout << "lowerZ " << sg.qnode.dequantizeLowerZ() << embree_endl;
+          cout << "upperZ " << sg.qnode.dequantizeUpperZ() << embree_endl;
+          return cout;
+        }
+
+      };
+
+      template<int N>
+        typename SubGridQBVHN<N>::Type SubGridQBVHN<N>::type;
+
+      typedef SubGridQBVHN<4> SubGridQBVH4;
+      typedef SubGridQBVHN<8> SubGridQBVH8;
+
+
+      /* QuantizedBaseNode as large subgrid leaf */
+      template<int N>
+      struct SubGridMBQBVHN
+      {
+        /* Virtual interface to query information about the quad type */
+        struct Type : public PrimitiveType
+        {
+          const char* name() const;
+          size_t sizeActive(const char* This) const;
+          size_t sizeTotal(const char* This) const;
+          size_t getBytes(const char* This) const;
+        };
+        static Type type;
+
+      public:
+
+        __forceinline size_t size() const
+        {
+          for (size_t i=0;i<N;i++)
+            if (primID(i) == -1) return i;
+          return N;
+        }
+
+      __forceinline void clear() {
+        for (size_t i=0;i<N;i++)
+          subgridIDs[i] = SubGridID(0,0,(unsigned int)-1);
+        qnode.clear();
+      }
+
+        /* Default constructor */
+        __forceinline SubGridMBQBVHN() {  }
+
+        /* Construction from vertices and IDs */
+        __forceinline SubGridMBQBVHN(const unsigned int x[N],
+                                     const unsigned int y[N],
+                                     const unsigned int primID[N],
+                                     const BBox3fa * const subGridBounds0,
+                                     const BBox3fa * const subGridBounds1,
+                                     const unsigned int geomID,
+                                     const float toffset,
+                                     const float tscale,
+                                     const unsigned int items)
+        {
+          clear();
+          _geomID = geomID;
+          time_offset = toffset;
+          time_scale  = tscale;
+
+          __aligned(64) typename BVHN<N>::AABBNode node0,node1;
+          node0.clear();          
+          node1.clear();          
+          for (size_t i=0;i<items;i++)
+          {
+            subgridIDs[i] = SubGridID(x[i],y[i],primID[i]);
+            node0.setBounds(i,subGridBounds0[i]);
+            node1.setBounds(i,subGridBounds1[i]);
+          }
+          qnode.node0.init_dim(node0);
+          qnode.node1.init_dim(node1);
+        }
+
+        __forceinline unsigned int geomID() const { return _geomID; }
+        __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; }
+        __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; }
+        __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; }
+
+        __forceinline SubGrid subgrid(const size_t i) const {
+          assert(i < N);
+          assert(primID(i) != -1);
+          return SubGrid(x(i),y(i),geomID(),primID(i));
+        }
+
+        __forceinline float adjustTime(const float t) const { return time_scale * (t-time_offset); }
+
+        template<int K>
+        __forceinline vfloat<K> adjustTime(const vfloat<K> &t) const { return time_scale * (t-time_offset); }
+
+      public:
+        SubGridID subgridIDs[N];
+
+        typename BVHN<N>::QuantizedBaseNodeMB qnode;
+
+        float time_offset;
+        float time_scale;
+        unsigned int _geomID;    // geometry ID of mesh
+
+
+        friend embree_ostream operator<<(embree_ostream cout, const SubGridMBQBVHN& sg) {
+          cout << "SubGridMBQBVHN " << embree_endl;
+          for (size_t i=0;i<N;i++)
+            cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl;
+          cout << "geomID      " << sg._geomID << embree_endl;
+          cout << "time_offset " << sg.time_offset << embree_endl;
+          cout << "time_scale  " << sg.time_scale << embree_endl;         
+          cout << "lowerX " << sg.qnode.node0.dequantizeLowerX() << embree_endl;
+          cout << "upperX " << sg.qnode.node0.dequantizeUpperX() << embree_endl;
+          cout << "lowerY " << sg.qnode.node0.dequantizeLowerY() << embree_endl;
+          cout << "upperY " << sg.qnode.node0.dequantizeUpperY() << embree_endl;
+          cout << "lowerZ " << sg.qnode.node0.dequantizeLowerZ() << embree_endl;
+          cout << "upperZ " << sg.qnode.node0.dequantizeUpperZ() << embree_endl;
+          cout << "lowerX " << sg.qnode.node1.dequantizeLowerX() << embree_endl;
+          cout << "upperX " << sg.qnode.node1.dequantizeUpperX() << embree_endl;
+          cout << "lowerY " << sg.qnode.node1.dequantizeLowerY() << embree_endl;
+          cout << "upperY " << sg.qnode.node1.dequantizeUpperY() << embree_endl;
+          cout << "lowerZ " << sg.qnode.node1.dequantizeLowerZ() << embree_endl;
+          cout << "upperZ " << sg.qnode.node1.dequantizeUpperZ() << embree_endl;
+          return cout;
+        }
+
+      };
+
+}
diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_intersector.h
new file mode 100644
index 0000000000..ad5fee2e4e
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/subgrid_intersector.h
@@ -0,0 +1,517 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "subgrid_intersector_moeller.h"
+#include "subgrid_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+
+    // =======================================================================================
+    // =================================== SubGridIntersectors ===============================
+    // =======================================================================================
+
+
+    template<int N, bool filter>
+    struct SubGridIntersector1Moeller
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersector1MoellerTrumbore<4,filter> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+      {
+        STAT3(point_query.trav_prims,1,1,1);
+        AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID());
+        assert(accel);
+        context->geomID = subgrid.geomID();
+        context->primID = subgrid.primID();
+        return accel->pointQuery(query, context);
+      }
+
+      template<bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+#if defined(__AVX__)
+          STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+            if (unlikely(dist[ID] > ray.tfar)) continue;
+            intersect(pre,ray,context,prim[i].subgrid(ID));
+          }
+        }
+      }
+      template<bool robust>        
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+            if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+              return true;
+          }
+        }
+        return false;
+      }
+    
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        bool changed = false;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask;
+          if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+            mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          } else {
+            mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          }
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            changed |= pointQuery(query, context, prim[i].subgrid(ID));
+          }
+        }
+        return changed;
+      }
+    };
+
+    template<int N, bool filter>
+    struct SubGridIntersector1Pluecker
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+      {
+        STAT3(point_query.trav_prims,1,1,1);
+        AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID());
+        context->geomID = subgrid.geomID();
+        context->primID = subgrid.primID();
+        return accel->pointQuery(query, context);
+      }
+
+      template<bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+#if defined(__AVX__)
+          STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+            if (unlikely(dist[ID] > ray.tfar)) continue;
+            intersect(pre,ray,context,prim[i].subgrid(ID));
+          }
+        }
+      }
+
+      template<bool robust>        
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+            if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+              return true;
+          }
+        }
+        return false;
+      }
+      
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        bool changed = false;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask;
+          if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+            mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          } else {
+            mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          }
+#if defined(__AVX__)
+          STAT3(point_query.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            changed |= pointQuery(query, context, prim[i].subgrid(ID));
+          }
+        }
+        return changed;
+      }
+    };
+
+    template<int N, int K, bool filter>
+    struct SubGridIntersectorKMoeller
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> Precalculations;
+
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i));
+        }
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        vbool<K> valid0 = valid_i;
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+        template<bool robust>
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+              intersect(valid,pre,ray,context,prim[j].subgrid(i));
+            }
+          }
+        }
+
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          vbool<K> valid0 = valid;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+              valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+              if (none(valid0)) break;
+            }
+          }
+          return !valid0;
+        }
+        
+        template<bool robust>        
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<N> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (unlikely(dist[ID] > ray.tfar[k])) continue;
+              intersect(pre,ray,k,context,prim[i].subgrid(ID));
+            }
+          }
+        }
+        
+        template<bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<N> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+                return true;
+            }
+          }
+          return false;
+        }
+    };
+
+
+    template<int N, int K, bool filter>
+    struct SubGridIntersectorKPluecker
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
+
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i));
+        }
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        vbool<K> valid0 = valid_i;
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          //if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
+          if (pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
+	    
+            break;
+        }
+        return !valid0;
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+      
+        template<bool robust>
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+              intersect(valid,pre,ray,context,prim[j].subgrid(i));
+            }
+          }
+        }
+
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          vbool<K> valid0 = valid;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+              valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+              if (none(valid0)) break;
+            }
+          }
+          return !valid0;
+        }
+        
+        template<bool robust>        
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<N> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (unlikely(dist[ID] > ray.tfar[k])) continue;
+              intersect(pre,ray,k,context,prim[i].subgrid(ID));
+            }
+          }
+        }
+        
+        template<bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<N> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+                return true;
+            }
+          }
+          return false;
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h b/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h
new file mode 100644
index 0000000000..64937d34fe
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h
@@ -0,0 +1,382 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "quad_intersector_moeller.h"
+
+namespace embree
+{
+  namespace isa
+  {
+
+    /* ----------------------------- */
+    /* -- single ray intersectors -- */
+    /* ----------------------------- */
+
+    template<int M>
+    __forceinline void interpolateUV(MoellerTrumboreHitM<M,UVIdentity<M>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint<M> &stepX, const vint<M> &stepY) 
+    {
+      /* correct U,V interpolation across the entire grid */
+      const vint<M> sx((int)subgrid.x());
+      const vint<M> sy((int)subgrid.y());
+      const vint<M> sxM(sx + stepX); 
+      const vint<M> syM(sy + stepY); 
+      const float inv_resX = rcp((float)((int)g.resX-1));
+      const float inv_resY = rcp((float)((int)g.resY-1));          
+      hit.U = (hit.U + (vfloat<M>)sxM * hit.absDen) * inv_resX;
+      hit.V = (hit.V + (vfloat<M>)syM * hit.absDen) * inv_resY;
+    }
+    
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1MoellerTrumbore;
+
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1MoellerTrumbore
+      {
+        __forceinline SubGridQuadMIntersector1MoellerTrumbore() {}
+
+        __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+
+        __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                     const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          UVIdentity<M> mapUV;
+          MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV);
+          MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+          Intersect1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID());
+
+          /* intersect first triangle */
+          if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) 
+          {
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            epilog(hit.valid,hit);
+          }
+
+          /* intersect second triangle */
+          if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) 
+          {
+            hit.U = hit.absDen - hit.U;
+            hit.V = hit.absDen - hit.V;
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            epilog(hit.valid,hit);
+          }
+        }
+      
+        __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          UVIdentity<M> mapUV;
+          MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV);
+          MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+          Occluded1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID());
+          
+          /* intersect first triangle */
+          if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) 
+          {
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            if (epilog(hit.valid,hit))
+              return true;
+          }
+
+          /* intersect second triangle */
+          if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) 
+          {
+            hit.U = hit.absDen - hit.U;
+            hit.V = hit.absDen - hit.V;
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            if (epilog(hit.valid,hit))
+              return true;
+          }
+          return false;
+        }
+      };
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<bool filter>
+      struct SubGridQuadMIntersector1MoellerTrumbore<4,filter>
+    {
+      __forceinline SubGridQuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+      
+      template<typename Epilog>
+        __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));        
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        UVIdentity<8> mapUV;
+        MoellerTrumboreHitM<8,UVIdentity<8>> hit(mapUV);
+        MoellerTrumboreIntersector1<8> intersector(ray,nullptr);
+        const vbool8 flags(0,0,0,0,1,1,1,1);
+        if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,mapUV,hit)))
+        {
+	  /* correct U,V interpolation across the entire grid */
+	  const vfloat8 U = select(flags,hit.absDen - hit.V,hit.U);	  
+	  const vfloat8 V = select(flags,hit.absDen - hit.U,hit.V);
+	  hit.U = U;
+	  hit.V = V;
+	  hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); 	  
+          interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1));
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+
+#endif
+
+    // ============================================================================================================================
+    // ============================================================================================================================
+    // ============================================================================================================================
+
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+    template<int K>
+    __forceinline void interpolateUV(const vbool<K>& valid, MoellerTrumboreHitK<K,UVIdentity<K>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const unsigned int i) 
+    {
+      /* correct U,V interpolation across the entire grid */
+      const unsigned int sx = subgrid.x() + (unsigned int)(i % 2);
+      const unsigned int sy = subgrid.y() + (unsigned int)(i >>1);
+      const float inv_resX = rcp((float)(int)(g.resX-1));
+      const float inv_resY = rcp((float)(int)(g.resY-1));      
+      hit.U = select(valid,(hit.U + vfloat<K>((float)sx) * hit.absDen) * inv_resX,hit.U);
+      hit.V = select(valid,(hit.V + vfloat<K>((float)sy) * hit.absDen) * inv_resY,hit.V);
+    }
+        
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKMoellerTrumboreBase
+      {
+        __forceinline SubGridQuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {}
+
+        template<typename Epilog>
+        __forceinline bool intersectK(const vbool<K>& valid, 
+                                      RayK<K>& ray,
+                                      const Vec3vf<K>& v0,
+                                      const Vec3vf<K>& v1,
+                                      const Vec3vf<K>& v2,
+                                      const Vec3vf<K>& v3,
+                                      const GridMesh::Grid &g, 
+                                      const SubGrid &subgrid,
+                                      const unsigned int i,
+                                      const Epilog& epilog) const
+        {
+	  UVIdentity<K> mapUV;
+	  MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV);
+	  MoellerTrumboreIntersectorK<M,K> intersector;
+
+          const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit);
+	  if (any(valid0))
+	    {
+	      interpolateUV(valid0,hit,g,subgrid,i);
+	      epilog(valid0,hit);
+	    }
+          const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit);
+	  if (any(valid1))
+	    {
+	      hit.U = hit.absDen - hit.U;
+	      hit.V = hit.absDen - hit.V;	      
+	      interpolateUV(valid1,hit,g,subgrid,i);
+	      epilog(valid1,hit);
+	    }
+	  return any(valid0|valid1);	  
+        }
+
+       template<typename Epilog>
+        __forceinline bool occludedK(const vbool<K>& valid, 
+				     RayK<K>& ray,
+				     const Vec3vf<K>& v0,
+				     const Vec3vf<K>& v1,
+				     const Vec3vf<K>& v2,
+				     const Vec3vf<K>& v3,
+				     const GridMesh::Grid &g, 
+				     const SubGrid &subgrid,
+				     const unsigned int i,
+				     const Epilog& epilog) const
+        {
+	  UVIdentity<K> mapUV;
+	  MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV);
+	  MoellerTrumboreIntersectorK<M,K> intersector;
+
+	  vbool<K> valid_final = valid;
+          const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit);
+	  if (any(valid0))
+	    {
+	      interpolateUV(valid0,hit,g,subgrid,i);
+	      epilog(valid0,hit);
+	      valid_final &= !valid0;
+	    }
+	  if (none(valid_final)) return true;	      	  
+          const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit);
+	  if (any(valid1))
+	    {
+	      hit.U = hit.absDen - hit.U;
+	      hit.V = hit.absDen - hit.V;	      
+	      interpolateUV(valid1,hit,g,subgrid,i);
+	      epilog(valid1,hit);
+	      valid_final &= !valid1;	      
+	    }
+	  return none(valid_final);
+        }
+
+        static __forceinline bool intersect1(RayK<K>& ray,
+                                             size_t k,
+                                             const Vec3vf<M>& v0,
+                                             const Vec3vf<M>& v1,
+                                             const Vec3vf<M>& v2,
+                                             MoellerTrumboreHitM<M,UVIdentity<M>> &hit)
+        {
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v2-v0;
+	  MoellerTrumboreIntersectorK<8,K> intersector;
+	  UVIdentity<M> mapUV;
+	  return intersector.intersectEdge(ray,k,v0,e1,e2,mapUV,hit);
+        }
+	
+      };
+
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKMoellerTrumbore : public SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
+
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+	UVIdentity<M> mapUV;
+	MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV);
+        Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+	MoellerTrumboreIntersectorK<M,K> intersector;
+	/* intersect first triangle */
+	if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) 
+          {
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            epilog(hit.valid,hit);
+          }
+
+	/* intersect second triangle */
+	if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) 
+          {
+	    hit.U = hit.absDen - hit.U;
+	    hit.V = hit.absDen - hit.V;
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            epilog(hit.valid,hit);
+          }
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+	UVIdentity<M> mapUV;
+        MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV);
+        Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());	
+	MoellerTrumboreIntersectorK<M,K> intersector;
+	/* intersect first triangle */
+	if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) 
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          if (epilog(hit.valid,hit)) return true;
+        }
+
+	/* intersect second triangle */
+	if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) 
+        {
+          hit.U = hit.absDen - hit.U;
+          hit.V = hit.absDen - hit.V;	  
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          if (epilog(hit.valid,hit)) return true;
+        }
+        return false;
+      }
+    };
+
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<int K, bool filter>
+      struct SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> : public SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+      
+      template<typename Epilog>
+        __forceinline bool intersect1(RayK<K>& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                      const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        const vbool8 flags(0,0,0,0,1,1,1,1);
+
+        UVIdentity<8> mapUV;
+        MoellerTrumboreHitM<8,UVIdentity<8>> hit(mapUV);
+        if (SubGridQuadMIntersectorKMoellerTrumboreBase<8,K,filter>::intersect1(ray,k,vtx0,vtx1,vtx2,hit))
+        {
+	  const vfloat8 U = select(flags,hit.absDen - hit.V,hit.U);	  
+	  const vfloat8 V = select(flags,hit.absDen - hit.U,hit.V);
+	  hit.U = U;
+	  hit.V = V;
+	  hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); 	  	  
+	  interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1));
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+
+#endif
+
+
+
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h
new file mode 100644
index 0000000000..5ded56e1f7
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h
@@ -0,0 +1,367 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+
+    template<int M>
+    __forceinline void interpolateUV(PlueckerHitM<M,UVIdentity<M>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint<M> &stepX, const vint<M> &stepY) 
+    {
+      /* correct U,V interpolation across the entire grid */
+      const vint<M> sx((int)subgrid.x());
+      const vint<M> sy((int)subgrid.y());
+      const vint<M> sxM(sx + stepX);
+      const vint<M> syM(sy + stepY);
+      const float inv_resX = rcp((float)((int)g.resX-1));
+      const float inv_resY = rcp((float)((int)g.resY-1));          
+      hit.U = (hit.U + vfloat<M>(sxM) * hit.UVW) * inv_resX;
+      hit.V = (hit.V + vfloat<M>(syM) * hit.UVW) * inv_resY;
+    }
+    
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1Pluecker;
+
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1Pluecker
+      {
+        __forceinline SubGridQuadMIntersector1Pluecker() {}
+
+        __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+
+        __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                     const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          UVIdentity<M> mapUV;
+          PlueckerHitM<M,UVIdentity<M>> hit(mapUV);
+          PlueckerIntersector1<M> intersector(ray,nullptr);
+	  
+          Intersect1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID());
+
+          /* intersect first triangle */
+	  if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) 
+          {
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            epilog(hit.valid,hit);
+          }
+
+          /* intersect second triangle */
+	  if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) 
+          {
+	    hit.U = hit.UVW - hit.U;
+	    hit.V = hit.UVW - hit.V;
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            epilog(hit.valid,hit);
+          }
+        }
+      
+        __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          UVIdentity<M> mapUV;
+          PlueckerHitM<M,UVIdentity<M>> hit(mapUV);
+          PlueckerIntersector1<M> intersector(ray,nullptr);
+          Occluded1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID());
+
+          /* intersect first triangle */
+	  if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) 
+          {
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            if (epilog(hit.valid,hit))
+	      return true;
+          }
+
+          /* intersect second triangle */
+	  if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) 
+          {
+	    hit.U = hit.UVW - hit.U;
+	    hit.V = hit.UVW - hit.V;
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            if (epilog(hit.valid,hit))
+	      return true;
+          }
+          return false;
+        }
+      };
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<bool filter>
+      struct SubGridQuadMIntersector1Pluecker<4,filter>
+    {
+      __forceinline SubGridQuadMIntersector1Pluecker() {}
+
+      __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+      
+      template<typename Epilog>
+        __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));        
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+
+        UVIdentity<8> mapUV;
+        PlueckerHitM<8,UVIdentity<8>> hit(mapUV);
+        PlueckerIntersector1<8> intersector(ray,nullptr);
+        const vbool8 flags(0,0,0,0,1,1,1,1);
+        if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,mapUV,hit)))
+        {
+	  /* correct U,V interpolation across the entire grid */
+	  const vfloat8 U = select(flags,hit.UVW - hit.V,hit.U);	  
+	  const vfloat8 V = select(flags,hit.UVW - hit.U,hit.V);
+	  hit.U = U;
+	  hit.V = V;	  
+	  hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); 
+          interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1));
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+      
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+
+#endif
+
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+    template<int K>
+    __forceinline void interpolateUV(const vbool<K>& valid, PlueckerHitK<K,UVIdentity<K>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const unsigned int i) 
+    {
+      /* correct U,V interpolation across the entire grid */
+      const unsigned int sx = subgrid.x() + (unsigned int)(i % 2);
+      const unsigned int sy = subgrid.y() + (unsigned int)(i >>1);
+      const float inv_resX = rcp((float)(int)(g.resX-1));
+      const float inv_resY = rcp((float)(int)(g.resY-1));      
+      hit.U = select(valid,(hit.U + vfloat<K>((float)sx) * hit.UVW) * inv_resX,hit.U);
+      hit.V = select(valid,(hit.V + vfloat<K>((float)sy) * hit.UVW) * inv_resY,hit.V);
+    }
+    
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKPlueckerBase
+      {
+        __forceinline SubGridQuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {}
+
+        template<typename Epilog>
+        __forceinline bool intersectK(const vbool<K>& valid, 
+                                      RayK<K>& ray,
+                                      const Vec3vf<K>& v0,
+                                      const Vec3vf<K>& v1,
+                                      const Vec3vf<K>& v2,
+                                      const Vec3vf<K>& v3,
+                                      const GridMesh::Grid &g, 
+                                      const SubGrid &subgrid,
+                                      const unsigned int i,
+                                      const Epilog& epilog) const
+        {
+	  UVIdentity<K> mapUV;
+	  PlueckerHitK<K,UVIdentity<K>> hit(mapUV);
+	  PlueckerIntersectorK<M,K> intersector;
+
+          const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit);
+	  if (any(valid0))
+	    {
+	      interpolateUV(valid0,hit,g,subgrid,i);
+	      epilog(valid0,hit);
+	    }
+          const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit);
+	  if (any(valid1))
+	    {
+	      hit.U = hit.UVW - hit.U;
+	      hit.V = hit.UVW - hit.V;	      
+	      interpolateUV(valid1,hit,g,subgrid,i);
+	      epilog(valid1,hit);
+	    }
+	  return any(valid0|valid1);	  
+        }
+
+       template<typename Epilog>
+        __forceinline bool occludedK(const vbool<K>& valid, 
+				     RayK<K>& ray,
+				     const Vec3vf<K>& v0,
+				     const Vec3vf<K>& v1,
+				     const Vec3vf<K>& v2,
+				     const Vec3vf<K>& v3,
+				     const GridMesh::Grid &g, 
+				     const SubGrid &subgrid,
+				     const unsigned int i,
+				     const Epilog& epilog) const
+        {
+	  UVIdentity<K> mapUV;
+	  PlueckerHitK<K,UVIdentity<K>> hit(mapUV);
+	  PlueckerIntersectorK<M,K> intersector;
+
+	  vbool<K> valid_final = valid;
+          const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit);
+	  if (any(valid0))
+	    {
+	      interpolateUV(valid0,hit,g,subgrid,i);
+	      epilog(valid0,hit);
+	      valid_final &= !valid0;
+	    }
+	  if (none(valid_final)) return true;	      	  
+          const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit);
+	  if (any(valid1))
+	    {
+	      hit.U = hit.UVW - hit.U;
+	      hit.V = hit.UVW - hit.V;	      
+	      interpolateUV(valid1,hit,g,subgrid,i);
+	      epilog(valid1,hit);
+	      valid_final &= !valid1;	      
+	    }
+	  return none(valid_final);
+        }
+
+	
+      };
+
+
+    
+
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKPluecker : public SubGridQuadMIntersectorKPlueckerBase<M,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
+
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+	UVIdentity<M> mapUV;
+	PlueckerHitM<M,UVIdentity<M>> hit(mapUV);
+        Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+	PlueckerIntersectorK<M,K> intersector;
+	
+	/* intersect first triangle */
+	if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) 
+          {
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            epilog(hit.valid,hit);
+          }
+
+	/* intersect second triangle */
+	if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) 
+          {
+	    hit.U = hit.UVW - hit.U;
+	    hit.V = hit.UVW - hit.V;
+            interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+            epilog(hit.valid,hit);
+          }
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+	UVIdentity<M> mapUV;
+	PlueckerHitM<M,UVIdentity<M>> hit(mapUV);
+        Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());	
+	PlueckerIntersectorK<M,K> intersector;
+	
+	/* intersect first triangle */
+	if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) 
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          if (epilog(hit.valid,hit)) return true;
+        }
+
+	/* intersect second triangle */
+	if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) 
+        {
+	  hit.U = hit.UVW - hit.U;
+	  hit.V = hit.UVW - hit.V;	  
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          if (epilog(hit.valid,hit)) return true;
+        }
+        return false;
+      }
+    };
+
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<int K, bool filter>
+      struct SubGridQuadMIntersectorKPluecker<4,K,filter> : public SubGridQuadMIntersectorKPlueckerBase<4,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {}
+      
+      template<typename Epilog>
+        __forceinline bool intersect1(RayK<K>& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                      const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+	UVIdentity<8> mapUV;
+	PlueckerHitM<8,UVIdentity<8>> hit(mapUV);
+	PlueckerIntersectorK<8,K> intersector;
+	const vbool8 flags(0,0,0,0,1,1,1,1);
+        if (unlikely(intersector.intersect(ray,k,vtx0,vtx1,vtx2,mapUV,hit)))	
+        {
+          /* correct U,V interpolation across the entire grid */
+	  const vfloat8 U = select(flags,hit.UVW - hit.V,hit.U);	  
+	  const vfloat8 V = select(flags,hit.UVW - hit.U,hit.V);
+	  hit.U = U;
+	  hit.V = V;	  
+	  hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); 
+          interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1));
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+      }
+      
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+#endif
+
+    
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h
new file mode 100644
index 0000000000..473d656e24
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h
@@ -0,0 +1,236 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, bool filter>
+    struct SubGridMBIntersector1Pluecker
+    {
+      typedef SubGridMBQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        float ftime;
+        const int itime = mesh->timeSegment(ray.time(), ftime);
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime);
+        pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        float ftime;
+        const int itime = mesh->timeSegment(ray.time(), ftime);
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime);
+        return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, subgrid);
+      }
+
+      template<bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          const float time = prim[i].adjustTime(ray.time());
+
+          assert(time <= 1.0f);
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+#if defined(__AVX__)
+          STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            if (unlikely(dist[ID] > ray.tfar)) continue;
+            intersect(pre,ray,context,prim[i].subgrid(ID));
+          }
+        }
+      }
+
+      template<bool robust>        
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+        for (size_t i=0;i<num;i++)
+        {
+          const float time = prim[i].adjustTime(ray.time());
+          assert(time <= 1.0f);
+          vfloat<N> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+              return true;
+          }
+        }
+        return false;
+      }
+      
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        assert(false && "not implemented");
+        return false;
+      }
+    };
+
+
+    template<int N, int K, bool filter>
+    struct SubGridMBIntersectorKPluecker
+    {
+      typedef SubGridMBQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
+
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        size_t m_valid = movemask(valid_i);
+        while(m_valid)
+        {
+          size_t ID = bscf(m_valid);
+          intersect(pre,ray,ID,context,subgrid);
+        }
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        vbool<K> valid0 = valid_i;
+        size_t m_valid = movemask(valid_i);
+        while(m_valid)
+        {
+          size_t ID = bscf(m_valid);
+          if (occluded(pre,ray,ID,context,subgrid))
+            clear(valid0,ID);
+        }
+        return !valid0;
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+ 
+        vfloat<K> ftime;
+        const vint<K> itime = mesh->timeSegment<K>(ray.time(), ftime);
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        vfloat<K> ftime;
+        const vint<K> itime = mesh->timeSegment<K>(ray.time(), ftime);
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+
+        template<bool robust>
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            const vfloat<K> time = prim[j].template adjustTime<K>(ray.time());
+
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue;
+              intersect(valid,pre,ray,context,prim[j].subgrid(i));
+            }
+          }
+        }
+
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+
+          vbool<K> valid0 = valid;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            const vfloat<K> time = prim[j].template adjustTime<K>(ray.time());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue;
+              valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+              if (none(valid0)) break;
+            }
+          }
+          return !valid0;
+        }
+        
+        template<bool robust>        
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<N> dist;
+            const float time = prim[i].adjustTime(ray.time()[k]);
+            assert(time <= 1.0f);
+
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              if (unlikely(dist[ID] > ray.tfar[k])) continue;
+              intersect(pre,ray,k,context,prim[i].subgrid(ID));
+            }
+          }
+        }
+        
+        template<bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
+          
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<N> dist;
+            const float time = prim[i].adjustTime(ray.time()[k]);
+            assert(time <= 1.0f);
+
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+                return true;
+            }
+          }
+          return false;
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/triangle.h b/thirdparty/embree/kernels/geometry/triangle.h
new file mode 100644
index 0000000000..24b758ae48
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/triangle.h
@@ -0,0 +1,162 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Precalculated representation for M triangles. Stores for each
+     triangle a base vertex, two edges, and the geometry normal to
+     speed up intersection calculations */
+  template<int M>
+  struct TriangleM
+  {
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+    
+  public:
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+  public:
+
+    /* Default constructor */
+    __forceinline TriangleM() {}
+
+    /* Construction from vertices and IDs */
+    __forceinline TriangleM(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(v0), e1(v0-v1), e2(v2-v0), geomIDs(geomIDs), primIDs(primIDs) {}
+
+    /* Returns a mask that tells which triangles are valid */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns true if the specified triangle is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+    
+    /* Returns the number of stored triangles */
+    __forceinline size_t size() const { return bsf(~movemask(valid()));  }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs;  }
+    __forceinline const vuint<M>& geomID() const { return geomIDs;  }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the triangle */
+    __forceinline BBox3fa bounds() const 
+    {
+      Vec3vf<M> p0 = v0;
+      Vec3vf<M> p1 = v0-e1;
+      Vec3vf<M> p2 = v0+e2;
+      Vec3vf<M> lower = min(p0,p1,p2);
+      Vec3vf<M> upper = max(p0,p1,p2);
+      vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+                     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+
+    /* Non temporal store */
+    __forceinline static void store_nt(TriangleM* dst, const TriangleM& src)
+    {
+      vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+      vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+      vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+      vfloat<M>::store_nt(&dst->e1.x,src.e1.x);
+      vfloat<M>::store_nt(&dst->e1.y,src.e1.y);
+      vfloat<M>::store_nt(&dst->e1.z,src.e1.z);
+      vfloat<M>::store_nt(&dst->e2.x,src.e2.x);
+      vfloat<M>::store_nt(&dst->e2.y,src.e2.y);
+      vfloat<M>::store_nt(&dst->e2.z,src.e2.z);
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill triangle from triangle list */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+	const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID));
+    }
+
+    /* Updates the primitive */
+    __forceinline BBox3fa update(TriangleMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+
+	  for (size_t i=0; i<M; i++)
+      {
+        if (unlikely(geomID(i) == -1)) break;
+        const unsigned geomId = geomID(i);
+        const unsigned primId = primID(i);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+        const Vec3fa p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa p2 = mesh->vertex(tri.v[2]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+        vgeomID [i] = geomId;
+        vprimID [i] = primId;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID));
+      return bounds;
+    }
+
+  public:
+    Vec3vf<M> v0;      // base vertex of the triangles
+    Vec3vf<M> e1;      // 1st edge of the triangles (v0-v1)
+    Vec3vf<M> e2;      // 2nd edge of the triangles (v2-v0)
+  private:
+    vuint<M> geomIDs; // geometry IDs
+    vuint<M> primIDs; // primitive IDs
+  };
+
+  template<int M>
+  typename TriangleM<M>::Type TriangleM<M>::type;
+
+  typedef TriangleM<4> Triangle4;
+}
diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector.h b/thirdparty/embree/kernels/geometry/triangle_intersector.h
new file mode 100644
index 0000000000..2cdff78ec8
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/triangle_intersector.h
@@ -0,0 +1,96 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "triangle_intersector_moeller.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M triangles with 1 ray */
+    template<int M, bool filter>
+    struct TriangleMIntersector1Moeller
+    {
+      typedef TriangleM<M> Primitive;
+      typedef MoellerTrumboreIntersector1<M> Precalculations;
+
+      /*! Intersect a ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+      
+    };
+
+    /*! Intersects M triangles with K rays. */
+    template<int M, int K, bool filter>
+    struct TriangleMIntersectorKMoeller
+    {
+      typedef TriangleM<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT_USER(0,TriangleM<M>::max_size());
+        for (size_t i=0; i<TriangleM<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i);
+          const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i);
+          pre.intersectEdgeK(valid_i,ray,p0,e1,e2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<TriangleM<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i);
+          const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i);
+          pre.intersectEdgeK(valid0,ray,p0,e1,e2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h b/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h
new file mode 100644
index 0000000000..0a42d8f08b
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h
@@ -0,0 +1,525 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+/*! This intersector implements a modified version of the Moeller
+ *  Trumbore intersector from the paper "Fast, Minimum Storage
+ *  Ray-Triangle Intersection". In contrast to the paper we
+ *  precalculate some factors and factor the calculations differently
+ *  to allow precalculating the cross product e1 x e2. The resulting
+ *  algorithm is similar to the fastest one of the paper "Optimizing
+ *  Ray-Triangle Intersection via Automated Search". */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, typename UVMapper>
+    struct MoellerTrumboreHitM
+    {
+      __forceinline MoellerTrumboreHitM(const UVMapper& mapUV) : mapUV(mapUV) {}
+
+      __forceinline MoellerTrumboreHitM(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const vfloat<M>& absDen, const Vec3vf<M>& Ng, const UVMapper& mapUV)
+        : U(U), V(V), T(T), absDen(absDen), mapUV(mapUV), valid(valid), vNg(Ng) {}
+      
+      __forceinline void finalize() 
+      {
+        const vfloat<M> rcpAbsDen = rcp(absDen);
+        vt = T * rcpAbsDen;
+        vu = U * rcpAbsDen;
+        vv = V * rcpAbsDen;
+        mapUV(vu,vv,vNg);
+      }
+
+      __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); }
+      __forceinline vfloat<M> t () const { return vt; }
+      __forceinline Vec3vf<M> Ng() const { return vNg; }
+     
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+      
+    public:
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> T;
+      vfloat<M> absDen;
+      UVMapper mapUV;
+      
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+    
+    template<int M, bool early_out = true>
+    struct MoellerTrumboreIntersector1
+    {
+      __forceinline MoellerTrumboreIntersector1() {}
+
+      __forceinline MoellerTrumboreIntersector1(const Ray& ray, const void* ptr) {}
+
+      template<typename UVMapper>
+      __forceinline bool intersect(const vbool<M>& valid0,
+                                   Ray& ray,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_e1,
+                                   const Vec3vf<M>& tri_e2,
+                                   const Vec3vf<M>& tri_Ng,
+                                   const UVMapper& mapUV,
+                                   MoellerTrumboreHitM<M,UVMapper>& hit) const
+      {
+        /* calculate denominator */
+        vbool<M> valid = valid0;
+        const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+        const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+        const Vec3vf<M> R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests */
+        const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+        const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+        
+        /* perform backface culling */        
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        valid &= (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(early_out && none(valid))) return false;
+
+        /* perform depth test */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()) < T) & (T <= absDen*vfloat<M>(ray.tfar));
+        if (likely(early_out && none(valid))) return false;
+           
+        /* update hit information */
+        new (&hit) MoellerTrumboreHitM<M,UVMapper>(valid,U,V,T,absDen,tri_Ng,mapUV);
+
+        return true;
+      }
+
+      template<typename UVMapper>
+      __forceinline bool intersectEdge(const vbool<M>& valid,
+                                       Ray& ray,
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       const UVMapper& mapUV,
+                                       MoellerTrumboreHitM<M,UVMapper>& hit) const
+      {
+        const Vec3<vfloat<M>> tri_Ng = cross(tri_e2,tri_e1);
+        return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,mapUV,hit);
+      }
+
+      template<typename UVMapper>
+      __forceinline bool intersectEdge(Ray& ray,
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       const UVMapper& mapUV,
+                                       MoellerTrumboreHitM<M,UVMapper>& hit) const
+      {
+        vbool<M> valid = true;
+        const Vec3<vfloat<M>> tri_Ng = cross(tri_e2,tri_e1);
+        return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,mapUV,hit);
+      }
+
+      template<typename UVMapper>
+      __forceinline bool intersect(Ray& ray,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const UVMapper& mapUV,
+                                   MoellerTrumboreHitM<M,UVMapper>& hit) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,v0,e1,e2,mapUV,hit);
+      }
+
+      template<typename UVMapper>
+      __forceinline bool intersect(const vbool<M>& valid,
+                                   Ray& ray,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const UVMapper& mapUV,
+                                   MoellerTrumboreHitM<M,UVMapper>& hit) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(valid,ray,v0,e1,e2,mapUV,hit);
+      }
+
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersectEdge(Ray& ray,
+                                       const Vec3vf<M>& v0,
+                                       const Vec3vf<M>& e1,
+                                       const Vec3vf<M>& e2,
+                                       const UVMapper& mapUV,
+                                       const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M,UVMapper> hit(mapUV);
+        if (likely(intersectEdge(ray,v0,e1,e2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename UVMapper, typename Epilog>
+        __forceinline bool intersect(Ray& ray,
+                                     const Vec3vf<M>& v0,
+                                     const Vec3vf<M>& v1,
+                                     const Vec3vf<M>& v2,
+                                     const UVMapper& mapUV,
+                                     const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M,UVMapper> hit(mapUV);
+        if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename Epilog>
+        __forceinline bool intersect(Ray& ray,
+                                     const Vec3vf<M>& v0,
+                                     const Vec3vf<M>& v1,
+                                     const Vec3vf<M>& v2,
+                                     const Epilog& epilog) const
+      {
+        auto mapUV = UVIdentity<M>();
+        MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV);
+        if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersect(const vbool<M>& valid,
+                                   Ray& ray,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const UVMapper& mapUV,
+                                   const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M,UVMapper> hit(mapUV);
+        if (likely(intersect(valid,ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+    };
+    
+    template<int K, typename UVMapper>
+    struct MoellerTrumboreHitK
+    {
+      __forceinline MoellerTrumboreHitK(const UVMapper& mapUV) : mapUV(mapUV) {}
+      __forceinline MoellerTrumboreHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng, const UVMapper& mapUV)
+        : U(U), V(V), T(T), absDen(absDen), Ng(Ng), mapUV(mapUV) {}
+      
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vfloat<K> rcpAbsDen = rcp(absDen);
+        const vfloat<K> t = T * rcpAbsDen;
+        vfloat<K> u = U * rcpAbsDen;
+        vfloat<K> v = V * rcpAbsDen;
+        Vec3vf<K> vNg = Ng;
+        mapUV(u,v,vNg);
+        return std::make_tuple(u,v,t,vNg);
+      }
+      
+      vfloat<K> U;
+      vfloat<K> V;
+      const vfloat<K> T;
+      const vfloat<K> absDen;
+      const Vec3vf<K> Ng;
+      const UVMapper& mapUV;
+    };
+    
+    template<int M, int K>
+    struct MoellerTrumboreIntersectorK
+    {
+      __forceinline MoellerTrumboreIntersectorK() {}
+      __forceinline MoellerTrumboreIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {}
+      
+      /*! Intersects K rays with one of M triangles. */
+      template<typename UVMapper>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        //RayK<K>& ray,
+                                        const Vec3vf<K>& ray_org,
+                                        const Vec3vf<K>& ray_dir,
+                                        const vfloat<K>& ray_tnear,
+                                        const vfloat<K>& ray_tfar,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_e1,
+                                        const Vec3vf<K>& tri_e2,
+                                        const Vec3vf<K>& tri_Ng,
+                                        const UVMapper& mapUV,
+                                        MoellerTrumboreHitK<K,UVMapper> &hit) const
+      { 
+        /* calculate denominator */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> C = tri_v0 - ray_org;
+        const Vec3vf<K> R = cross(C,ray_dir);
+        const vfloat<K> den = dot(tri_Ng,ray_dir);
+        const vfloat<K> absDen = abs(den);
+        const vfloat<K> sgnDen = signmsk(den);
+        
+        /* test against edge p2 p0 */
+        const vfloat<K> U = dot(tri_e2,R) ^ sgnDen;
+        valid &= U >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p0 p1 */
+        const vfloat<K> V = dot(tri_e1,R) ^ sgnDen;
+        valid &= V >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p1 p2 */
+        const vfloat<K> W = absDen-U-V;
+        valid &= W >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test */
+        const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+        valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar);
+        if (unlikely(none(valid))) return false;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= den < vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#else
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#endif
+        
+        /* calculate hit information */
+        new (&hit) MoellerTrumboreHitK<K,UVMapper>(U,V,T,absDen,tri_Ng,mapUV);
+        return valid;
+      }
+
+      /*! Intersects K rays with one of M triangles. */
+      template<typename UVMapper>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const UVMapper& mapUV,
+                                        MoellerTrumboreHitK<K,UVMapper> &hit) const
+      {
+        const Vec3vf<K> e1 = tri_v0-tri_v1;
+        const Vec3vf<K> e2 = tri_v2-tri_v0;
+        const Vec3vf<K> Ng = cross(e2,e1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit);
+      }
+      
+      
+      /*! Intersects K rays with one of M triangles. */
+      template<typename UVMapper, typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const UVMapper& mapUV,
+                                        const Epilog& epilog) const
+      {
+        MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV);		
+        const Vec3vf<K> e1 = tri_v0-tri_v1;
+        const Vec3vf<K> e2 = tri_v2-tri_v0;
+        const Vec3vf<K> Ng = cross(e2,e1);
+        const vbool<K> valid = intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit);
+	return epilog(valid,hit);
+      }
+
+
+      
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const Epilog& epilog) const
+      {
+	UVIdentity<K> mapUV;	
+        MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV);			
+        const Vec3vf<K> e1 = tri_v0-tri_v1;
+        const Vec3vf<K> e2 = tri_v2-tri_v0;
+        const Vec3vf<K> Ng = cross(e2,e1);
+        const vbool<K> valid = intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit);
+	return epilog(valid,hit);
+      }
+
+      /*! Intersects K rays with one of M triangles. */
+      template<typename UVMapper, typename Epilog>
+      __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, 
+                                            RayK<K>& ray,
+                                            const Vec3vf<K>& tri_v0, 
+                                            const Vec3vf<K>& tri_e1, 
+                                            const Vec3vf<K>& tri_e2,
+                                            const UVMapper& mapUV,
+                                            const Epilog& epilog) const
+      {
+        MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV);			
+        const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1);
+        const vbool<K> valid = intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,mapUV,hit);
+	return epilog(valid,hit);
+      }
+      
+      /*! Intersect k'th ray from ray packet of size K with M triangles. */
+      template<typename UVMapper>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       const UVMapper& mapUV,
+                                       MoellerTrumboreHitM<M,UVMapper>& hit) const
+      {
+        /* calculate denominator */
+        typedef Vec3vf<M> Vec3vfM;
+        const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1);
+
+        const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vfM C = Vec3vfM(tri_v0) - O;
+        const Vec3vfM R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vfM(tri_Ng),D);
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests */
+        const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen;
+        const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+        if (likely(none(valid))) return false;
+        
+        /* calculate hit information */
+        new (&hit) MoellerTrumboreHitM<M,UVMapper>(valid,U,V,T,absDen,tri_Ng,mapUV);
+        return true;
+      }
+
+       template<typename UVMapper>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2,
+                                       const UVMapper& mapUV,
+                                       MoellerTrumboreHitM<M,UVMapper>& hit) const
+      {
+        if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,mapUV,hit))) 
+        {
+          hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]);
+          hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper;
+          return any(hit.valid);
+        }
+        return false;
+      }
+
+      template<typename UVMapper>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const Vec3vf<M>& v0, 
+                                   const Vec3vf<M>& v1, 
+                                   const Vec3vf<M>& v2,
+                                   const UVMapper& mapUV,
+                                   MoellerTrumboreHitM<M,UVMapper>& hit) const      
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,k,v0,e1,e2,mapUV,hit);
+      }
+      
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2,
+                                       const UVMapper& mapUV,
+                                       const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M,UVMapper> hit(mapUV);
+        if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,                           
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2,
+                                       const UVMapper& mapUV,
+                                       const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M,UVMapper> hit(mapUV);
+        if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+      
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const Vec3vf<M>& v0, 
+                                   const Vec3vf<M>& v1, 
+                                   const Vec3vf<M>& v2,
+                                   const UVMapper& mapUV,
+                                   const Epilog& epilog) const      
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,k,v0,e1,e2,mapUV,epilog);
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const Vec3vf<M>& v0, 
+                                   const Vec3vf<M>& v1, 
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog) const      
+      {
+        return intersect(ray,k,v0,v1,v2,UVIdentity<M>(),epilog);
+      }
+
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const BBox<vfloat<M>>& time_range,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const UVMapper& mapUV,
+                                   const Epilog& epilog) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,k,time_range,v0,e1,e2,mapUV,epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h
new file mode 100644
index 0000000000..8fbefcea88
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h
@@ -0,0 +1,407 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "trianglev.h"
+#include "trianglev_mb.h"
+#include "intersector_epilog.h"
+
+/*! Modified Pluecker ray/triangle intersector. The test first shifts
+ *  the ray origin into the origin of the coordinate system and then
+ *  uses Pluecker coordinates for the intersection. Due to the shift,
+ *  the Pluecker coordinate calculation simplifies and the tests get
+ *  numerically stable. The edge equations are watertight along the
+ *  edge for neighboring triangles. */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, typename UVMapper>
+    struct PlueckerHitM
+    {
+      __forceinline PlueckerHitM(const UVMapper& mapUV) : mapUV(mapUV) {}
+      
+      __forceinline PlueckerHitM(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& UVW, const vfloat<M>& t, const Vec3vf<M>& Ng, const UVMapper& mapUV)
+        :  U(U), V(V), UVW(UVW), mapUV(mapUV), valid(valid), vt(t), vNg(Ng) {}
+      
+      __forceinline void finalize() 
+      {
+        const vbool<M> invalid = abs(UVW) < min_rcp_input;
+        const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW));
+        vu = min(U * rcpUVW,1.0f);
+        vv = min(V * rcpUVW,1.0f);	
+        mapUV(vu,vv,vNg);
+      }
+
+      __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); }
+      __forceinline vfloat<M> t () const { return vt; }
+      __forceinline Vec3vf<M> Ng() const { return vNg; }
+    
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+      
+    public:
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> UVW;
+      const UVMapper& mapUV;
+      
+    public:
+      vbool<M> valid;      
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+
+    template<int M, bool early_out = true>
+    struct PlueckerIntersector1
+    {
+      __forceinline PlueckerIntersector1() {}
+
+      __forceinline PlueckerIntersector1(const Ray& ray, const void* ptr) {}
+
+      template<typename UVMapper>
+      __forceinline bool intersect(const vbool<M>& valid0,
+                                   Ray& ray,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_v1,
+                                   const Vec3vf<M>& tri_v2,
+                                   const UVMapper& mapUV,
+				   PlueckerHitM<M,UVMapper>& hit) const
+      {
+        vbool<M> valid = valid0;
+        
+        /* calculate vertices relative to ray origin */
+        const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+	const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> v0 = tri_v0-O;
+        const Vec3vf<M> v1 = tri_v1-O;
+        const Vec3vf<M> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<M> e0 = v2-v0;
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v1-v2;
+
+        /* perform edge tests */
+        const vfloat<M> U = dot(cross(e0,v2+v0),D);
+        const vfloat<M> V = dot(cross(e1,v0+v1),D);
+        const vfloat<M> W = dot(cross(e2,v1+v2),D);
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= max(U,V,W) <= eps;
+#else
+        valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+        if (unlikely(early_out && none(valid))) return false;
+
+        /* calculate geometry normal and denominator */
+        const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<M> den = twice(dot(Ng,D));
+        
+        /* perform depth test */
+        const vfloat<M> T = twice(dot(v0,Ng));
+        const vfloat<M> t = rcp(den)*T;
+        valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar);
+        valid &= den != vfloat<M>(zero);
+        if (unlikely(early_out && none(valid))) return false;
+
+        /* update hit information */
+        new (&hit) PlueckerHitM<M,UVMapper>(valid,U,V,UVW,t,Ng,mapUV);
+        return true;
+      }
+
+      template<typename UVMapper>
+      __forceinline bool intersectEdge(const vbool<M>& valid,
+				       Ray& ray,
+				       const Vec3vf<M>& tri_v0,
+				       const Vec3vf<M>& tri_v1,
+				       const Vec3vf<M>& tri_v2,
+				       const UVMapper& mapUV,
+				       PlueckerHitM<M,UVMapper>& hit) const
+      {
+        return intersect(valid,ray,tri_v0,tri_v1,tri_v2,mapUV,hit);
+      }
+
+      template<typename UVMapper>
+      __forceinline bool intersectEdge(Ray& ray,
+				       const Vec3vf<M>& tri_v0,
+				       const Vec3vf<M>& tri_v1,
+				       const Vec3vf<M>& tri_v2,
+				       const UVMapper& mapUV,				       
+				       PlueckerHitM<M,UVMapper>& hit) const
+      {
+	vbool<M> valid = true;
+        return intersect(valid,ray,tri_v0,tri_v1,tri_v2,mapUV,hit);
+      }
+
+      template<typename UVMapper>
+      __forceinline bool intersect(Ray& ray,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_v1,
+                                   const Vec3vf<M>& tri_v2,
+                                   const UVMapper& mapUV,				   
+                                   PlueckerHitM<M,UVMapper>& hit) const
+      {
+        return intersectEdge(ray,tri_v0,tri_v1,tri_v2,mapUV,hit);
+      }
+
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersectEdge(Ray& ray,
+                                       const Vec3vf<M>& v0,
+                                       const Vec3vf<M>& e1,
+                                       const Vec3vf<M>& e2,
+                                       const UVMapper& mapUV,
+                                       const Epilog& epilog) const
+      {
+        PlueckerHitM<M,UVMapper> hit(mapUV);
+        if (likely(intersectEdge(ray,v0,e1,e2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename UVMapper, typename Epilog>
+        __forceinline bool intersect(Ray& ray,
+                                     const Vec3vf<M>& v0,
+                                     const Vec3vf<M>& v1,
+                                     const Vec3vf<M>& v2,
+                                     const UVMapper& mapUV,
+                                     const Epilog& epilog) const
+      {
+        PlueckerHitM<M,UVMapper> hit(mapUV);
+        if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename Epilog>
+        __forceinline bool intersect(Ray& ray,
+                                     const Vec3vf<M>& v0,
+                                     const Vec3vf<M>& v1,
+                                     const Vec3vf<M>& v2,
+                                     const Epilog& epilog) const
+      {
+        auto mapUV = UVIdentity<M>();
+        PlueckerHitM<M,UVIdentity<M>> hit(mapUV);
+        if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersect(const vbool<M>& valid,
+                                   Ray& ray,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const UVMapper& mapUV,
+                                   const Epilog& epilog) const
+      {
+        PlueckerHitM<M,UVMapper> hit(mapUV);
+        if (likely(intersect(valid,ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+      
+    };
+
+    template<int K, typename UVMapper>
+    struct PlueckerHitK
+    {
+      __forceinline PlueckerHitK(const UVMapper& mapUV) : mapUV(mapUV) {}
+      
+      __forceinline PlueckerHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& UVW, const vfloat<K>& t, const Vec3vf<K>& Ng, const UVMapper& mapUV)
+        :  U(U), V(V), UVW(UVW), t(t), Ng(Ng), mapUV(mapUV) {}
+      
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vbool<K> invalid = abs(UVW) < min_rcp_input;
+        const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW));
+        vfloat<K> u = min(U * rcpUVW,1.0f);
+        vfloat<K> v = min(V * rcpUVW,1.0f);
+        Vec3vf<K> vNg = Ng;
+        mapUV(u,v,vNg);
+        return std::make_tuple(u,v,t,vNg);
+      }
+      vfloat<K> U;
+      vfloat<K> V;
+      const vfloat<K> UVW;
+      const vfloat<K> t;
+      const Vec3vf<K> Ng;
+      const UVMapper& mapUV;
+    };
+    
+    template<int M, int K>
+    struct PlueckerIntersectorK
+    {
+      __forceinline PlueckerIntersectorK() {}      
+      __forceinline PlueckerIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {}
+
+      /*! Intersects K rays with one of M triangles. */
+      template<typename UVMapper>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+				    RayK<K>& ray,
+				    const Vec3vf<K>& tri_v0,
+				    const Vec3vf<K>& tri_v1,
+				    const Vec3vf<K>& tri_v2,
+				    const UVMapper& mapUV,
+				    PlueckerHitK<K,UVMapper> &hit) const
+      {
+        /* calculate vertices relative to ray origin */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> O = ray.org;
+        const Vec3vf<K> D = ray.dir;
+        const Vec3vf<K> v0 = tri_v0-O;
+        const Vec3vf<K> v1 = tri_v1-O;
+        const Vec3vf<K> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<K> e0 = v2-v0;
+        const Vec3vf<K> e1 = v0-v1;
+        const Vec3vf<K> e2 = v1-v2;
+
+        /* perform edge tests */
+        const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+        const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+        const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+        const vfloat<K> UVW = U+V+W;
+        const vfloat<K> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= max(U,V,W) <= eps;
+#else
+        valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+        if (unlikely(none(valid))) return valid;
+
+         /* calculate geometry normal and denominator */
+        const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+        /* perform depth test */
+        const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+        const vfloat<K> t = rcp(den)*T;
+        valid &= ray.tnear() <= t & t <= ray.tfar;
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return valid;
+        
+        /* calculate hit information */
+        new (&hit) PlueckerHitK<K,UVMapper>(U,V,UVW,t,Ng,mapUV);
+        return valid;
+      }
+
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const Epilog& epilog) const
+      {
+	UVIdentity<K> mapUV;	
+        PlueckerHitK<K,UVIdentity<K>> hit(mapUV);		
+        const vbool<K> valid = intersectK(valid0,ray,tri_v0,tri_v1,tri_v2,mapUV,hit);
+	return epilog(valid,hit);
+      }
+
+      template<typename UVMapper, typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+					const UVMapper& mapUV,
+                                        const Epilog& epilog) const
+      {
+        PlueckerHitK<K,UVMapper> hit(mapUV);		
+        const vbool<K> valid = intersectK(valid0,ray,tri_v0,tri_v1,tri_v2,mapUV,hit);
+	return epilog(valid,hit);
+      }
+      
+      /*! Intersect k'th ray from ray packet of size K with M triangles. */
+      template<typename UVMapper>
+      __forceinline bool intersect(RayK<K>& ray, size_t k,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_v1,
+                                   const Vec3vf<M>& tri_v2,
+                                   const UVMapper& mapUV,
+				   PlueckerHitM<M,UVMapper> &hit) const
+      {
+        /* calculate vertices relative to ray origin */
+        const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vf<M> v0 = tri_v0-O;
+        const Vec3vf<M> v1 = tri_v1-O;
+        const Vec3vf<M> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<M> e0 = v2-v0;
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v1-v2;
+
+	
+        /* perform edge tests */
+        const vfloat<M> U = dot(cross(e0,v2+v0),D);
+        const vfloat<M> V = dot(cross(e1,v0+v1),D);
+        const vfloat<M> W = dot(cross(e2,v1+v2),D);
+	
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = max(U,V,W) <= eps;
+#else
+        vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+        if (unlikely(none(valid))) return false;
+
+        /* calculate geometry normal and denominator */
+        const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<M> den = twice(dot(Ng,D));
+        
+        /* perform depth test */
+        const vfloat<M> T = twice(dot(v0,Ng));
+        const vfloat<M> t = rcp(den)*T;
+        valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]);
+        if (unlikely(none(valid))) return false;
+
+        /* avoid division by 0 */
+        valid &= den != vfloat<M>(zero);
+        if (unlikely(none(valid))) return false;
+
+        /* update hit information */
+        new (&hit) PlueckerHitM<M,UVMapper>(valid,U,V,UVW,t,Ng,mapUV);
+        return true;
+      }
+
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray, size_t k,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_v1,
+                                   const Vec3vf<M>& tri_v2,
+                                   const UVMapper& mapUV,				   
+                                   const Epilog& epilog) const
+      {
+        PlueckerHitM<M,UVMapper> hit(mapUV);	
+        if (intersect(ray,k,tri_v0,tri_v1,tri_v2,mapUV,hit))
+	  return epilog(hit.valid,hit);
+	return false;
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray, size_t k,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_v1,
+                                   const Vec3vf<M>& tri_v2,
+                                   const Epilog& epilog) const
+      {
+	UVIdentity<M> mapUV;	
+        PlueckerHitM<M,UVIdentity<M>> hit(mapUV);	
+        if (intersect(ray,k,tri_v0,tri_v1,tri_v2,mapUV,hit))
+	  return epilog(hit.valid,hit);
+	return false;
+      }
+      
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h b/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h
new file mode 100644
index 0000000000..f05dcc4537
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h
@@ -0,0 +1,418 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+/*! This intersector implements a modified version of the Woop's ray-triangle intersection test */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct WoopHitM
+    {
+      __forceinline WoopHitM() {}
+
+      __forceinline WoopHitM(const vbool<M>& valid, 
+                             const vfloat<M>& U, 
+                             const vfloat<M>& V, 
+                             const vfloat<M>& T, 
+                             const vfloat<M>& inv_det,                              
+                             const Vec3vf<M>& Ng)
+        : U(U), V(V), T(T), inv_det(inv_det), valid(valid), vNg(Ng) {}
+      
+      __forceinline void finalize() 
+      {
+        vt = T;
+        vu = U*inv_det;
+        vv = V*inv_det;
+      }
+
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+      
+    private:
+      const vfloat<M> U;
+      const vfloat<M> V;
+      const vfloat<M> T;
+      const vfloat<M> inv_det;
+      
+    public:
+      const vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+
+    template<int M>
+    struct WoopPrecalculations1
+    {
+      unsigned int kx,ky,kz;
+      Vec3vf<M> org;
+      Vec3fa S;
+      __forceinline WoopPrecalculations1() {}
+
+      __forceinline WoopPrecalculations1(const Ray& ray, const void* ptr)
+      {
+        kz = maxDim(abs(ray.dir));
+        kx = (kz+1) % 3;
+        ky = (kx+1) % 3;
+        const float inv_dir_kz = rcp(ray.dir[kz]);
+        if (ray.dir[kz]) std::swap(kx,ky);
+        S.x = ray.dir[kx] * inv_dir_kz;
+        S.y = ray.dir[ky] * inv_dir_kz;
+        S.z = inv_dir_kz;
+        org = Vec3vf<M>(ray.org[kx],ray.org[ky],ray.org[kz]);
+      }
+    };
+
+    
+    template<int M>
+    struct WoopIntersector1
+    {
+
+        typedef WoopPrecalculations1<M> Precalculations;
+
+      __forceinline WoopIntersector1() {}
+
+      __forceinline WoopIntersector1(const Ray& ray, const void* ptr) {}
+
+      static __forceinline bool intersect(const vbool<M>& valid0,
+                                          Ray& ray,
+                                          const Precalculations& pre,
+                                          const Vec3vf<M>& tri_v0,
+                                          const Vec3vf<M>& tri_v1,
+                                          const Vec3vf<M>& tri_v2,
+                                          WoopHitM<M>& hit)
+      {       
+        vbool<M> valid = valid0;
+
+        /* vertices relative to ray origin */
+        const Vec3vf<M> org = Vec3vf<M>(pre.org.x,pre.org.y,pre.org.z);
+        const Vec3vf<M> A = Vec3vf<M>(tri_v0[pre.kx],tri_v0[pre.ky],tri_v0[pre.kz]) - org;
+        const Vec3vf<M> B = Vec3vf<M>(tri_v1[pre.kx],tri_v1[pre.ky],tri_v1[pre.kz]) - org;
+        const Vec3vf<M> C = Vec3vf<M>(tri_v2[pre.kx],tri_v2[pre.ky],tri_v2[pre.kz]) - org;
+
+        /* shear and scale vertices */
+        const vfloat<M> Ax = nmadd(A.z,pre.S.x,A.x);
+        const vfloat<M> Ay = nmadd(A.z,pre.S.y,A.y);
+        const vfloat<M> Bx = nmadd(B.z,pre.S.x,B.x);
+        const vfloat<M> By = nmadd(B.z,pre.S.y,B.y);
+        const vfloat<M> Cx = nmadd(C.z,pre.S.x,C.x);
+        const vfloat<M> Cy = nmadd(C.z,pre.S.y,C.y);
+
+        /* scaled barycentric */
+        const vfloat<M> U0 = Cx*By;
+        const vfloat<M> U1 = Cy*Bx;
+        const vfloat<M> V0 = Ax*Cy;
+        const vfloat<M> V1 = Ay*Cx;
+        const vfloat<M> W0 = Bx*Ay;
+        const vfloat<M> W1 = By*Ax;
+#if !defined(__AVX512F__)
+        valid &= (U0 >= U1) & (V0 >= V1) & (W0 >= W1) |
+          (U0 <= U1) & (V0 <= V1) & (W0 <= W1);
+#else
+        valid &= ge(ge(U0 >= U1,V0,V1),W0,W1) | le(le(U0 <= U1,V0,V1),W0,W1);
+#endif
+
+        if (likely(none(valid))) return false;
+        const vfloat<M> U = U0-U1;
+        const vfloat<M> V = V0-V1;
+        const vfloat<M> W = W0-W1;
+
+        const vfloat<M> det = U+V+W;
+
+        valid &= det != 0.0f;
+        const vfloat<M> inv_det = rcp(det);
+
+        const vfloat<M> Az = pre.S.z * A.z;
+        const vfloat<M> Bz = pre.S.z * B.z;
+        const vfloat<M> Cz = pre.S.z * C.z;
+        const vfloat<M> T  = madd(U,Az,madd(V,Bz,W*Cz)); 
+        const vfloat<M> t  = T * inv_det;
+        /* perform depth test */
+        valid &= (vfloat<M>(ray.tnear()) < t) & (t <= vfloat<M>(ray.tfar));
+        if (likely(none(valid))) return false;
+        
+        const Vec3vf<M> tri_Ng = cross(tri_v2-tri_v0,tri_v0-tri_v1);
+
+        /* update hit information */
+        new (&hit) WoopHitM<M>(valid,U,V,t,inv_det,tri_Ng);
+        return true;
+      }
+      
+      static __forceinline bool intersect(Ray& ray,
+                                   const Precalculations& pre,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   WoopHitM<M>& hit)
+      {
+        vbool<M> valid = true;
+        return intersect(valid,ray,pre,v0,v1,v2,hit);
+      }
+
+
+      template<typename Epilog>
+      static __forceinline bool intersect(Ray& ray,
+                                     const Precalculations& pre,
+                                     const Vec3vf<M>& v0,
+                                     const Vec3vf<M>& v1,
+                                     const Vec3vf<M>& v2,
+                                     const Epilog& epilog)
+      {
+        WoopHitM<M> hit;
+        if (likely(intersect(ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid,
+                                   Ray& ray,
+                                   const Precalculations& pre,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog)
+      {
+        WoopHitM<M> hit;
+        if (likely(intersect(valid,ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+    };
+    
+#if 0
+    template<int K>
+    struct WoopHitK
+    {
+      __forceinline WoopHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng)
+        : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {}
+      
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vfloat<K> rcpAbsDen = rcp(absDen);
+        const vfloat<K> t = T * rcpAbsDen;
+        const vfloat<K> u = U * rcpAbsDen;
+        const vfloat<K> v = V * rcpAbsDen;
+        return std::make_tuple(u,v,t,Ng);
+      }
+      
+    private:
+      const vfloat<K> U;
+      const vfloat<K> V;
+      const vfloat<K> T;
+      const vfloat<K> absDen;
+      const Vec3vf<K> Ng;
+    };
+    
+    template<int M, int K>
+    struct WoopIntersectorK
+    {
+      __forceinline WoopIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {}
+      
+      /*! Intersects K rays with one of M triangles. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        //RayK<K>& ray,
+                                        const Vec3vf<K>& ray_org,
+                                        const Vec3vf<K>& ray_dir,
+                                        const vfloat<K>& ray_tnear,
+                                        const vfloat<K>& ray_tfar,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_e1,
+                                        const Vec3vf<K>& tri_e2,
+                                        const Vec3vf<K>& tri_Ng,
+                                        const Epilog& epilog) const
+      { 
+        /* calculate denominator */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> C = tri_v0 - ray_org;
+        const Vec3vf<K> R = cross(C,ray_dir);
+        const vfloat<K> den = dot(tri_Ng,ray_dir);
+        const vfloat<K> absDen = abs(den);
+        const vfloat<K> sgnDen = signmsk(den);
+        
+        /* test against edge p2 p0 */
+        const vfloat<K> U = dot(tri_e2,R) ^ sgnDen;
+        valid &= U >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p0 p1 */
+        const vfloat<K> V = dot(tri_e1,R) ^ sgnDen;
+        valid &= V >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p1 p2 */
+        const vfloat<K> W = absDen-U-V;
+        valid &= W >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test */
+        const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+        valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar);
+        if (unlikely(none(valid))) return false;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= den < vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#else
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#endif
+        
+        /* calculate hit information */
+        WoopHitK<K> hit(U,V,T,absDen,tri_Ng);
+        return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one of M triangles. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const Epilog& epilog) const
+      {
+        const Vec3vf<K> e1 = tri_v0-tri_v1;
+        const Vec3vf<K> e2 = tri_v2-tri_v0;
+        const Vec3vf<K> Ng = cross(e2,e1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog);
+      }
+
+      /*! Intersects K rays with one of M triangles. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, 
+                                            RayK<K>& ray,
+                                            const Vec3vf<K>& tri_v0, 
+                                            const Vec3vf<K>& tri_e1, 
+                                            const Vec3vf<K>& tri_e2, 
+                                            const Epilog& epilog) const
+      {
+        const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog);
+      }
+      
+      /*! Intersect k'th ray from ray packet of size K with M triangles. */
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       WoopHitM<M>& hit) const
+      {
+        /* calculate denominator */
+        typedef Vec3vf<M> Vec3vfM;
+        const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1);
+
+        const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vfM C = Vec3vfM(tri_v0) - O;
+        const Vec3vfM R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vfM(tri_Ng),D);
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests */
+        const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen;
+        const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+        if (likely(none(valid))) return false;
+        
+        /* calculate hit information */
+        new (&hit) WoopHitM<M>(valid,U,V,T,absDen,tri_Ng);
+        return true;
+      }
+
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       WoopHitM<M>& hit) const
+      {
+        if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,hit))) 
+        {
+          hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]);
+          hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper;
+          return any(hit.valid);
+        }
+        return false;
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        WoopHitM<M> hit;
+        if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,                           
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        WoopHitM<M> hit;
+        if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+      
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const Vec3vf<M>& v0, 
+                                   const Vec3vf<M>& v1, 
+                                   const Vec3vf<M>& v2, 
+                                   const Epilog& epilog) const      
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,k,v0,e1,e2,epilog);
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const BBox<vfloat<M>>& time_range,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,k,time_range,v0,e1,e2,epilog);
+      }
+    };
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/triangle_triangle_intersector.h b/thirdparty/embree/kernels/geometry/triangle_triangle_intersector.h
new file mode 100644
index 0000000000..50106bcc16
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/triangle_triangle_intersector.h
@@ -0,0 +1,132 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primitive.h"
+
+namespace embree
+{ 
+  namespace isa
+  {
+    struct TriangleTriangleIntersector
+    {
+      __forceinline static float T(float pa0, float pa1, float da0, float da1) {
+        return pa0 + (pa1-pa0)*da0/(da0-da1);
+      }
+      
+      __forceinline static bool point_line_side(const Vec2f& p, const Vec2f& a0, const Vec2f& a1) {
+        return det(p-a0,a0-a1) >= 0.0f;
+      }
+      
+      __forceinline static bool point_inside_triangle(const Vec2f& p, const Vec2f& a, const Vec2f& b, const Vec2f& c) 
+      {
+        const bool pab = point_line_side(p,a,b); 
+        const bool pbc = point_line_side(p,b,c);
+        const bool pca = point_line_side(p,c,a);
+        return pab == pbc && pab == pca;
+      }
+      
+      __forceinline static bool intersect_line_line(const Vec2f& a0, const Vec2f& a1, const Vec2f& b0, const Vec2f& b1)
+      {
+        const bool different_sides0 = point_line_side(b0,a0,a1) != point_line_side(b1,a0,a1);
+        const bool different_sides1 = point_line_side(a0,b0,b1) != point_line_side(a1,b0,b1);
+        return different_sides0 && different_sides1;
+      }
+      
+      __forceinline static bool intersect_triangle_triangle (const Vec2f& a0, const Vec2f& a1, const Vec2f& a2, 
+                                                             const Vec2f& b0, const Vec2f& b1, const Vec2f& b2)
+      {
+        const bool a01_b01 = intersect_line_line(a0,a1,b0,b1); 
+        if (a01_b01) return true;
+        const bool a01_b12 = intersect_line_line(a0,a1,b1,b2);
+        if (a01_b12) return true;
+        const bool a01_b20 = intersect_line_line(a0,a1,b2,b0);
+        if (a01_b20) return true;
+        const bool a12_b01 = intersect_line_line(a1,a2,b0,b1);
+        if (a12_b01) return true;
+        const bool a12_b12 = intersect_line_line(a1,a2,b1,b2);
+        if (a12_b12) return true;
+        const bool a12_b20 = intersect_line_line(a1,a2,b2,b0);
+        if (a12_b20) return true;
+        const bool a20_b01 = intersect_line_line(a2,a0,b0,b1);
+        if (a20_b01) return true;
+        const bool a20_b12 = intersect_line_line(a2,a0,b1,b2);
+        if (a20_b12) return true;
+        const bool a20_b20 = intersect_line_line(a2,a0,b2,b0);
+        if (a20_b20) return true;
+        
+        bool a_in_b = point_inside_triangle(a0,b0,b1,b2) && point_inside_triangle(a1,b0,b1,b2) && point_inside_triangle(a2,b0,b1,b2);
+        if (a_in_b) return true;
+        
+        bool b_in_a = point_inside_triangle(b0,a0,a1,a2) && point_inside_triangle(b1,a0,a1,a2) && point_inside_triangle(b2,a0,a1,a2);
+        if (b_in_a) return true;
+        
+        return false;
+      }
+      
+      static bool intersect_triangle_triangle (const Vec3fa& a0, const Vec3fa& a1, const Vec3fa& a2,
+                                               const Vec3fa& b0, const Vec3fa& b1, const Vec3fa& b2)
+      {
+        const float eps = 1E-5f;
+        
+        /* calculate triangle planes */
+        const Vec3fa Na = cross(a1-a0,a2-a0);
+        const float  Ca = dot(Na,a0);
+        const Vec3fa Nb = cross(b1-b0,b2-b0);
+        const float  Cb = dot(Nb,b0);
+        
+        /* project triangle A onto plane B */
+        const float da0 = dot(Nb,a0)-Cb;
+        const float da1 = dot(Nb,a1)-Cb;
+        const float da2 = dot(Nb,a2)-Cb;
+        if (max(da0,da1,da2) < -eps) return false;
+        if (min(da0,da1,da2) > +eps) return false;
+        //CSTAT(bvh_collide_prim_intersections4++);
+        
+        /* project triangle B onto plane A */
+        const float db0 = dot(Na,b0)-Ca;
+        const float db1 = dot(Na,b1)-Ca;
+        const float db2 = dot(Na,b2)-Ca;
+        if (max(db0,db1,db2) < -eps) return false;
+        if (min(db0,db1,db2) > +eps) return false;
+        //CSTAT(bvh_collide_prim_intersections5++);
+        
+        if (unlikely((std::fabs(da0) < eps && std::fabs(da1) < eps && std::fabs(da2) < eps) ||
+                     (std::fabs(db0) < eps && std::fabs(db1) < eps && std::fabs(db2) < eps)))
+        {
+          const size_t dz = maxDim(Na);
+          const size_t dx = (dz+1)%3;
+          const size_t dy = (dx+1)%3;
+          const Vec2f A0(a0[dx],a0[dy]);
+          const Vec2f A1(a1[dx],a1[dy]);
+          const Vec2f A2(a2[dx],a2[dy]);
+          const Vec2f B0(b0[dx],b0[dy]);
+          const Vec2f B1(b1[dx],b1[dy]);
+          const Vec2f B2(b2[dx],b2[dy]);
+          return intersect_triangle_triangle(A0,A1,A2,B0,B1,B2);
+        }
+        
+        const Vec3fa D = cross(Na,Nb);
+        const float pa0 = dot(D,a0);
+        const float pa1 = dot(D,a1);
+        const float pa2 = dot(D,a2);
+        const float pb0 = dot(D,b0);
+        const float pb1 = dot(D,b1);
+        const float pb2 = dot(D,b2);
+        
+        BBox1f ba = empty;
+        if (min(da0,da1) <= 0.0f && max(da0,da1) >= 0.0f && abs(da0-da1) > 0.0f) ba.extend(T(pa0,pa1,da0,da1));
+        if (min(da1,da2) <= 0.0f && max(da1,da2) >= 0.0f && abs(da1-da2) > 0.0f) ba.extend(T(pa1,pa2,da1,da2));
+        if (min(da2,da0) <= 0.0f && max(da2,da0) >= 0.0f && abs(da2-da0) > 0.0f) ba.extend(T(pa2,pa0,da2,da0));
+        
+        BBox1f bb = empty;
+        if (min(db0,db1) <= 0.0f && max(db0,db1) >= 0.0f && abs(db0-db1) > 0.0f) bb.extend(T(pb0,pb1,db0,db1));
+        if (min(db1,db2) <= 0.0f && max(db1,db2) >= 0.0f && abs(db1-db2) > 0.0f) bb.extend(T(pb1,pb2,db1,db2));
+        if (min(db2,db0) <= 0.0f && max(db2,db0) >= 0.0f && abs(db2-db0) > 0.0f) bb.extend(T(pb2,pb0,db2,db0));
+        
+        return conjoint(ba,bb);
+      }
+    };
+  }
+}
+
+  
diff --git a/thirdparty/embree/kernels/geometry/trianglei.h b/thirdparty/embree/kernels/geometry/trianglei.h
new file mode 100644
index 0000000000..6aad48a5ef
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/trianglei.h
@@ -0,0 +1,442 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  /* Stores M triangles from an indexed face set */
+  template <int M>
+  struct TriangleMi
+  {
+    /* Virtual interface to query information about the triangle type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+  public:
+
+    /* Default constructor */
+    __forceinline TriangleMi() {  }
+
+    /* Construction from vertices and IDs */
+    __forceinline TriangleMi(const vuint<M>& v0,
+                             const vuint<M>& v1,
+                             const vuint<M>& v2,
+                             const vuint<M>& geomIDs,
+                             const vuint<M>& primIDs)
+#if defined(EMBREE_COMPACT_POLYS)
+      : geomIDs(geomIDs), primIDs(primIDs) {}
+#else
+    : v0_(v0), v1_(v1), v2_(v2), geomIDs(geomIDs), primIDs(primIDs) {}
+#endif
+
+    /* Returns a mask that tells which triangles are valid */
+    __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+    /* Returns if the specified triangle is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+    /* Returns the number of stored triangles */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline vuint<M> geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline vuint<M> primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the triangles */
+    __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) {
+        const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i));
+        bounds.extend(mesh->bounds(primID(i),itime));
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive */
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime) {
+      return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps)
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i));
+        allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return allBounds;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range)
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i));
+        allBounds.extend(mesh->linearBounds(primID(i), time_range));
+      }
+      return allBounds;
+    }
+    
+    /* Non-temporal store */
+    __forceinline static void store_nt(TriangleMi* dst, const TriangleMi& src)
+    {
+#if !defined(EMBREE_COMPACT_POLYS)
+      vuint<M>::store_nt(&dst->v0_,src.v0_);
+      vuint<M>::store_nt(&dst->v1_,src.v1_);
+      vuint<M>::store_nt(&dst->v2_,src.v2_);
+#endif
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill triangle from triangle list */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> v0 = zero, v1 = zero, v2 = zero;
+      vuint<M> geomID = -1, primID = -1;
+      const PrimRefT* prim = &prims[begin];
+
+      for (size_t i=0; i<M; i++)
+      {
+        if (begin<end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+#if !defined(EMBREE_COMPACT_POLYS)
+          const TriangleMesh* mesh = scene->get<TriangleMesh>(prim->geomID());
+          const TriangleMesh::Triangle& tri = mesh->triangle(prim->primID());
+          unsigned int int_stride = mesh->vertices0.getStride()/4;
+          v0[i] = tri.v[0] * int_stride;
+          v1[i] = tri.v[1] * int_stride;
+          v2[i] = tri.v[2] * int_stride;
+#endif
+          begin++;
+        } else {
+          assert(i);
+          if (likely(i > 0)) {
+            geomID[i] = geomID[0];
+            primID[i] = -1;
+            v0[i] = v0[0];
+            v1[i] = v0[0];
+            v2[i] = v0[0];
+          }
+        }
+        if (begin<end) prim = &prims[begin];
+      }
+      new (this) TriangleMi(v0,v1,v2,geomID,primID); // FIXME: use non temporal store
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, itime);
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, time_range);
+    }
+
+    /* Updates the primitive */
+    __forceinline BBox3fa update(TriangleMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M; i++)
+      {
+        if (primID(i) == -1) break;
+        const unsigned int primId = primID(i);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+        const Vec3fa p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa p2 = mesh->vertex(tri.v[2]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+      }
+      return bounds;
+    }
+
+  protected:
+#if !defined(EMBREE_COMPACT_POLYS)
+    vuint<M> v0_;         // 4 byte offset of 1st vertex
+    vuint<M> v1_;         // 4 byte offset of 2nd vertex
+    vuint<M> v2_;         // 4 byte offset of 3rd vertex
+#endif
+    vuint<M> geomIDs;    // geometry ID of mesh
+    vuint<M> primIDs;    // primitive ID of primitive inside mesh
+  };
+
+  namespace isa
+  {
+    
+  template<int M>
+    struct TriangleMi : public embree::TriangleMi<M>
+  {
+#if !defined(EMBREE_COMPACT_POLYS)
+    using embree::TriangleMi<M>::v0_;
+    using embree::TriangleMi<M>::v1_;
+    using embree::TriangleMi<M>::v2_;
+#endif
+    using embree::TriangleMi<M>::geomIDs;
+    using embree::TriangleMi<M>::primIDs;
+    using embree::TriangleMi<M>::geomID;
+    using embree::TriangleMi<M>::primID;
+    using embree::TriangleMi<M>::valid;
+        
+    /* loads a single vertex */
+    template<int vid>
+    __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+      return (Vec3f) mesh->vertices[0][tri.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const float* vertices = scene->vertices[geomID(index)];
+      return (Vec3f&) vertices[v[index]];
+#endif
+    }
+
+    template<int vid, typename T>
+    __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+      const Vec3fa v0 = mesh->vertices[itime+0][tri.v[vid]];
+      const Vec3fa v1 = mesh->vertices[itime+1][tri.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0);
+      const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1);
+      const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+      const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+      const Vec3<T> p0(v0.x,v0.y,v0.z);
+      const Vec3<T> p1(v1.x,v1.y,v1.z);
+      return lerp(p0,p1,ftime);
+    }
+
+    template<int vid, int K, typename T>
+    __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const
+    {
+      Vec3<T> p0, p1;
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      
+      for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask))
+      {
+#if defined(EMBREE_COMPACT_POLYS)
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+        const Vec3fa v0 = mesh->vertices[itime[i]+0][tri.v[vid]];
+        const Vec3fa v1 = mesh->vertices[itime[i]+1][tri.v[vid]];
+#else
+        const vuint<M>& v = getVertexOffset<vid>();
+        const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0);
+        const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1);
+        const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+        const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+        p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z;
+        p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z;
+      }
+      return (T(one)-ftime)*p0 + ftime*p1;
+    }
+
+    struct Triangle {
+      vfloat4 v0,v1,v2;
+    };
+    
+#if defined(EMBREE_COMPACT_POLYS)
+    
+    __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero };
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID);
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices0[tri.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices0[tri.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices0[tri.v[2]];
+      return { v0, v1, v2 };
+    }
+
+    __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const 
+    {
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero };
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices[itime][tri.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices[itime][tri.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices[itime][tri.v[2]];
+      return { v0, v1, v2 };
+    }
+    
+#else
+
+    __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const 
+    {
+      const float* vertices = scene->vertices[geomID(i)];
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      return { v0, v1, v2 };
+    }
+
+    __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const 
+    {
+      const float* vertices = (const float*) mesh->vertexPtr(0,itime);
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      return { v0, v1, v2 };
+    }
+    
+#endif
+
+    /* Gather the triangles */
+    __forceinline void gather(Vec3vf<M>& p0, Vec3vf<M>& p1, Vec3vf<M>& p2, const Scene* const scene) const;
+
+    template<int K>
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019
+    __noinline
+#else
+    __forceinline
+#endif
+    void gather(const vbool<K>& valid,
+                Vec3vf<K>& p0,
+                Vec3vf<K>& p1,
+                Vec3vf<K>& p2,
+                const size_t index,
+                const Scene* const scene,
+                const vfloat<K>& time) const
+    {
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+
+      vfloat<K> ftime;
+      const vint<K> itime = mesh->timeSegment<K>(time, ftime);
+
+      const size_t first = bsf(movemask(valid));
+      if (likely(all(valid,itime[first] == itime)))
+      {
+        p0 = getVertex<0>(index, scene, itime[first], ftime);
+        p1 = getVertex<1>(index, scene, itime[first], ftime);
+        p2 = getVertex<2>(index, scene, itime[first], ftime);
+      } else {
+        p0 = getVertex<0,K>(valid, index, scene, itime, ftime);
+        p1 = getVertex<1,K>(valid, index, scene, itime, ftime);
+        p2 = getVertex<2,K>(valid, index, scene, itime, ftime);
+      }
+    }
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              const TriangleMesh* mesh,
+                              const Scene *const scene,
+                              const int itime) const;
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              const Scene *const scene,
+                              const float time) const;
+
+
+#if !defined(EMBREE_COMPACT_POLYS)
+    template<int N> const vuint<M>& getVertexOffset() const;
+#endif
+  };
+
+#if !defined(EMBREE_COMPACT_POLYS)
+  template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<0>() const { return v0_; }
+  template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<1>() const { return v1_; }
+  template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<2>() const { return v2_; }
+#endif
+  
+  template<>
+  __forceinline void TriangleMi<4>::gather(Vec3vf4& p0,
+                                           Vec3vf4& p1,
+                                           Vec3vf4& p2,
+                                           const Scene* const scene) const
+  {
+    const Triangle tri0 = loadTriangle(0,scene);
+    const Triangle tri1 = loadTriangle(1,scene);
+    const Triangle tri2 = loadTriangle(2,scene);
+    const Triangle tri3 = loadTriangle(3,scene);
+    transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z);
+    transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+    transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+  }
+
+  template<>
+  __forceinline void TriangleMi<4>::gather(Vec3vf4& p0,
+                                           Vec3vf4& p1,
+                                           Vec3vf4& p2,
+                                           const TriangleMesh* mesh,
+                                           const Scene *const scene,
+                                           const int itime) const
+  {
+    const Triangle tri0 = loadTriangle(0,itime,mesh);
+    const Triangle tri1 = loadTriangle(1,itime,mesh);
+    const Triangle tri2 = loadTriangle(2,itime,mesh);
+    const Triangle tri3 = loadTriangle(3,itime,mesh);
+    transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z);
+    transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+    transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+  }
+
+  template<>
+  __forceinline void TriangleMi<4>::gather(Vec3vf4& p0,
+                                           Vec3vf4& p1,
+                                           Vec3vf4& p2,
+                                           const Scene *const scene,
+                                           const float time) const
+  {
+    const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(0)); // in mblur mode all geometries are identical
+
+    float ftime;
+    const int itime = mesh->timeSegment(time, ftime);
+
+    Vec3vf4 a0,a1,a2; gather(a0,a1,a2,mesh,scene,itime);
+    Vec3vf4 b0,b1,b2; gather(b0,b1,b2,mesh,scene,itime+1);
+    p0 = lerp(a0,b0,vfloat4(ftime));
+    p1 = lerp(a1,b1,vfloat4(ftime));
+    p2 = lerp(a2,b2,vfloat4(ftime));
+  }
+  }
+
+  template<int M>
+  typename TriangleMi<M>::Type TriangleMi<M>::type;
+
+  typedef TriangleMi<4> Triangle4i;
+}
diff --git a/thirdparty/embree/kernels/geometry/trianglei_intersector.h b/thirdparty/embree/kernels/geometry/trianglei_intersector.h
new file mode 100644
index 0000000000..f7deb9e72d
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/trianglei_intersector.h
@@ -0,0 +1,336 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "trianglei.h"
+#include "triangle_intersector_moeller.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M triangles with 1 ray */
+    template<int M, bool filter>
+    struct TriangleMiIntersector1Moeller
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef MoellerTrumboreIntersector1<M> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M triangles with K rays */
+    template<int M, int K, bool filter>
+    struct TriangleMiIntersectorKMoeller
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
+
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        const Scene* scene = context->scene;
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i;
+        const Scene* scene = context->scene;
+
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+      
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Intersects M triangles with 1 ray */
+    template<int M, bool filter>
+    struct TriangleMiIntersector1Pluecker
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef PlueckerIntersector1<M> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M triangles with K rays */
+    template<int M, int K, bool filter>
+    struct TriangleMiIntersectorKPluecker
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef PlueckerIntersectorK<M,K> Precalculations;
+
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        const Scene* scene = context->scene;
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i;
+        const Scene* scene = context->scene;
+
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Intersects M motion blur triangles with 1 ray */
+    template<int M, bool filter>
+    struct TriangleMiMBIntersector1Moeller
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef MoellerTrumboreIntersector1<M> Precalculations;
+
+      /*! Intersect a ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
+        pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
+        return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M motion blur triangles with K rays. */
+    template<int M, int K, bool filter>
+    struct TriangleMiMBIntersectorKMoeller
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time());
+          pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time());
+          pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
+        pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
+        return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Intersects M motion blur triangles with 1 ray */
+    template<int M, bool filter>
+    struct TriangleMiMBIntersector1Pluecker
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef PlueckerIntersector1<M> Precalculations;
+
+      /*! Intersect a ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
+        pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
+        return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M motion blur triangles with K rays. */
+    template<int M, int K, bool filter>
+    struct TriangleMiMBIntersectorKPluecker
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef PlueckerIntersectorK<M,K> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time());
+          pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time());
+          pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
+        pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
+        return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/trianglev.h b/thirdparty/embree/kernels/geometry/trianglev.h
new file mode 100644
index 0000000000..cd94756b9e
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/trianglev.h
@@ -0,0 +1,157 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Stores the vertices of M triangles in struct of array layout */
+  template <int M>
+  struct TriangleMv
+  { 
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+   
+  public:
+
+    /* Default constructor */
+    __forceinline TriangleMv() {}
+
+    /* Construction from vertices and IDs */
+    __forceinline TriangleMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(v0), v1(v1), v2(v2), geomIDs(geomIDs), primIDs(primIDs) {}
+    
+    /* Returns a mask that tells which triangles are valid */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns true if the specified triangle is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+    /* Returns the number of stored triangles */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the triangles */
+    __forceinline BBox3fa bounds() const 
+    {
+      Vec3vf<M> lower = min(v0,v1,v2);
+      Vec3vf<M> upper = max(v0,v1,v2);
+      vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+                     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+    
+    /* Non temporal store */
+    __forceinline static void store_nt(TriangleMv* dst, const TriangleMv& src)
+    {
+      vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+      vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+      vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+      vfloat<M>::store_nt(&dst->v1.x,src.v1.x);
+      vfloat<M>::store_nt(&dst->v1.y,src.v1.y);
+      vfloat<M>::store_nt(&dst->v1.z,src.v1.z);
+      vfloat<M>::store_nt(&dst->v2.x,src.v2.x);
+      vfloat<M>::store_nt(&dst->v2.y,src.v2.y);
+      vfloat<M>::store_nt(&dst->v2.z,src.v2.z);
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill triangle from triangle list */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+	const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      TriangleMv::store_nt(this,TriangleMv(v0,v1,v2,vgeomID,vprimID));
+    }
+
+    /* Updates the primitive */
+    __forceinline BBox3fa update(TriangleMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+      
+      for (size_t i=0; i<M; i++)
+      {
+        if (primID(i) == -1) break;
+        const unsigned geomId = geomID(i);
+        const unsigned primId = primID(i);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+        const Vec3fa p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa p2 = mesh->vertex(tri.v[2]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+        vgeomID [i] = geomId;
+        vprimID [i] = primId;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      new (this) TriangleMv(v0,v1,v2,vgeomID,vprimID);
+      return bounds;
+    }
+   
+  public:
+    Vec3vf<M> v0;      // 1st vertex of the triangles
+    Vec3vf<M> v1;      // 2nd vertex of the triangles
+    Vec3vf<M> v2;      // 3rd vertex of the triangles
+  private:
+    vuint<M> geomIDs; // geometry ID
+    vuint<M> primIDs; // primitive ID
+  };
+
+  template<int M>
+  typename TriangleMv<M>::Type TriangleMv<M>::type;
+
+  typedef TriangleMv<4> Triangle4v;
+}
diff --git a/thirdparty/embree/kernels/geometry/trianglev_intersector.h b/thirdparty/embree/kernels/geometry/trianglev_intersector.h
new file mode 100644
index 0000000000..3abb7f8e32
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/trianglev_intersector.h
@@ -0,0 +1,206 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "triangle_intersector_pluecker.h"
+#include "triangle_intersector_moeller.h"
+#include "triangle_intersector_woop.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M triangles with 1 ray */
+    template<int M, bool filter>
+    struct TriangleMvIntersector1Moeller
+    {
+      typedef TriangleMv<M> Primitive;
+      typedef MoellerTrumboreIntersector1<M> Precalculations;
+
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+
+    template<int M, bool filter>
+    struct TriangleMvIntersector1Woop
+    {
+      typedef TriangleMv<M> Primitive;
+      typedef WoopIntersector1<M> intersec;
+      typedef WoopPrecalculations1<M> Precalculations;
+
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+
+    /*! Intersects M triangles with K rays */
+    template<int M, int K, bool filter>
+    struct TriangleMvIntersectorKMoeller
+    {
+      typedef TriangleMv<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M
+      }
+    };
+
+    /*! Intersects M triangles with 1 ray */
+    template<int M, bool filter>
+    struct TriangleMvIntersector1Pluecker
+    {
+      typedef TriangleMv<M> Primitive;
+      typedef PlueckerIntersector1<M> Precalculations;
+
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M triangles with K rays */
+    template<int M, int K, bool filter>
+    struct TriangleMvIntersectorKPluecker
+    {
+      typedef TriangleMv<M> Primitive;
+      typedef PlueckerIntersectorK<M,K> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/trianglev_mb.h b/thirdparty/embree/kernels/geometry/trianglev_mb.h
new file mode 100644
index 0000000000..b550a29fd5
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/trianglev_mb.h
@@ -0,0 +1,201 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Stores the vertices of M triangles in struct of array layout */
+  template<int M>
+  struct TriangleMvMB
+  {
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+
+    static Type type;
+
+  public:
+
+    /* primitive supports single time segments */
+    static const bool singleTimeSegment = true;
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+   
+  public:
+
+    /* Default constructor */
+    __forceinline TriangleMvMB() {}
+
+    /* Construction from vertices and IDs */
+    __forceinline TriangleMvMB(const Vec3vf<M>& a0, const Vec3vf<M>& a1,
+                               const Vec3vf<M>& b0, const Vec3vf<M>& b1,
+                               const Vec3vf<M>& c0, const Vec3vf<M>& c1,
+                               const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(a0), v1(b0), v2(c0), dv0(a1-a0), dv1(b1-b0), dv2(c1-c0), geomIDs(geomIDs), primIDs(primIDs) {}
+
+    /* Returns a mask that tells which triangles are valid */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns if the specified triangle is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+    /* Returns the number of stored triangles */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the triangles at t0 */
+    __forceinline BBox3fa bounds0() const 
+    {
+      Vec3vf<M> lower = min(v0,v1,v2);
+      Vec3vf<M> upper = max(v0,v1,v2);
+      const vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+		     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+
+    /* Calculate the bounds of the triangles at t1 */
+    __forceinline BBox3fa bounds1() const 
+    {
+      const Vec3vf<M> p0 = v0+dv0;
+      const Vec3vf<M> p1 = v1+dv1;
+      const Vec3vf<M> p2 = v2+dv2;
+      Vec3vf<M> lower = min(p0,p1,p2);
+      Vec3vf<M> upper = max(p0,p1,p2);
+      const vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+		     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+
+    /* Calculate the linear bounds of the primitive */
+    __forceinline LBBox3fa linearBounds() const {
+      return LBBox3fa(bounds0(),bounds1());
+    }
+
+    /* Fill triangle from triangle list */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero;
+      Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero;
+
+      BBox3fa bounds0 = empty;
+      BBox3fa bounds1 = empty;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+	const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        const Vec3fa& a0 = mesh->vertex(tri.v[0],itime+0); bounds0.extend(a0);
+        const Vec3fa& a1 = mesh->vertex(tri.v[0],itime+1); bounds1.extend(a1);
+        const Vec3fa& b0 = mesh->vertex(tri.v[1],itime+0); bounds0.extend(b0);
+        const Vec3fa& b1 = mesh->vertex(tri.v[1],itime+1); bounds1.extend(b1);
+        const Vec3fa& c0 = mesh->vertex(tri.v[2],itime+0); bounds0.extend(c0);
+        const Vec3fa& c1 = mesh->vertex(tri.v[2],itime+1); bounds1.extend(c1);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        va0.x[i] = a0.x; va0.y[i] = a0.y; va0.z[i] = a0.z;
+	va1.x[i] = a1.x; va1.y[i] = a1.y; va1.z[i] = a1.z;
+	vb0.x[i] = b0.x; vb0.y[i] = b0.y; vb0.z[i] = b0.z;
+	vb1.x[i] = b1.x; vb1.y[i] = b1.y; vb1.z[i] = b1.z;
+	vc0.x[i] = c0.x; vc0.y[i] = c0.y; vc0.z[i] = c0.z;
+	vc1.x[i] = c1.x; vc1.y[i] = c1.y; vc1.z[i] = c1.z;
+      }
+      new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID);
+      return LBBox3fa(bounds0,bounds1);
+    }
+
+    /* Fill triangle from triangle list */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero;
+      Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero;
+
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRefMB& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* const mesh = scene->get<TriangleMesh>(geomID);
+        const range<int> itime_range = mesh->timeSegmentRange(time_range);
+        assert(itime_range.size() == 1);
+        const int ilower = itime_range.begin();
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        allBounds.extend(mesh->linearBounds(primID, time_range));
+        const Vec3fa& a0 = mesh->vertex(tri.v[0],ilower+0);
+        const Vec3fa& a1 = mesh->vertex(tri.v[0],ilower+1);
+        const Vec3fa& b0 = mesh->vertex(tri.v[1],ilower+0);
+        const Vec3fa& b1 = mesh->vertex(tri.v[1],ilower+1);
+        const Vec3fa& c0 = mesh->vertex(tri.v[2],ilower+0);
+        const Vec3fa& c1 = mesh->vertex(tri.v[2],ilower+1);
+        const BBox1f time_range_v(mesh->timeStep(ilower+0),mesh->timeStep(ilower+1));
+        auto a01 = globalLinear(std::make_pair(a0,a1),time_range_v);
+        auto b01 = globalLinear(std::make_pair(b0,b1),time_range_v);
+        auto c01 = globalLinear(std::make_pair(c0,c1),time_range_v);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        va0.x[i] = a01.first .x; va0.y[i] = a01.first .y; va0.z[i] = a01.first .z;
+	va1.x[i] = a01.second.x; va1.y[i] = a01.second.y; va1.z[i] = a01.second.z;
+	vb0.x[i] = b01.first .x; vb0.y[i] = b01.first .y; vb0.z[i] = b01.first .z;
+	vb1.x[i] = b01.second.x; vb1.y[i] = b01.second.y; vb1.z[i] = b01.second.z;
+	vc0.x[i] = c01.first .x; vc0.y[i] = c01.first .y; vc0.z[i] = c01.first .z;
+	vc1.x[i] = c01.second.x; vc1.y[i] = c01.second.y; vc1.z[i] = c01.second.z;
+      }
+      new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID);
+      return allBounds;
+    }
+
+  public:
+    Vec3vf<M> v0;      // 1st vertex of the triangles
+    Vec3vf<M> v1;      // 2nd vertex of the triangles
+    Vec3vf<M> v2;      // 3rd vertex of the triangles
+    Vec3vf<M> dv0;     // difference vector between time steps t0 and t1 for first vertex
+    Vec3vf<M> dv1;     // difference vector between time steps t0 and t1 for second vertex
+    Vec3vf<M> dv2;     // difference vector between time steps t0 and t1 for third vertex
+  private:
+    vuint<M> geomIDs; // geometry ID
+    vuint<M> primIDs; // primitive ID
+  };
+
+  template<int M>
+  typename TriangleMvMB<M>::Type TriangleMvMB<M>::type;
+
+  typedef TriangleMvMB<4> Triangle4vMB;
+}
diff --git a/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h b/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h
new file mode 100644
index 0000000000..38cd52e85d
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h
@@ -0,0 +1,211 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M motion blur triangles with 1 ray */
+    template<int M, bool filter>
+    struct TriangleMvMBIntersector1Moeller
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef MoellerTrumboreIntersector1<M> Precalculations;
+
+      /*! Intersect a ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<M> time(ray.time());
+        const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0));
+        const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1));
+        const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2));
+        pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<M> time(ray.time());
+        const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0));
+        const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1));
+        const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2));
+        return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+    
+    /*! Intersects M motion blur triangles with K rays. */
+    template<int M, int K, bool filter>
+    struct TriangleMvMBIntersectorKMoeller
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> time(ray.time());
+          const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> time(ray.time());
+          const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<M> time(ray.time()[k]);
+        const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0));
+        const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1));
+        const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2));
+        pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<M> time(ray.time()[k]);
+        const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0));
+        const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1));
+        const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2));
+        return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Intersects M motion blur triangles with 1 ray */
+    template<int M, bool filter>
+    struct TriangleMvMBIntersector1Pluecker
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef PlueckerIntersector1<M> Precalculations;
+
+      /*! Intersect a ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<M> time(ray.time());
+        const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0));
+        const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1));
+        const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2));
+        pre.intersect(ray,v0,v1,v2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<M> time(ray.time());
+        const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0));
+        const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1));
+        const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2));
+        return pre.intersect(ray,v0,v1,v2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+    
+    /*! Intersects M motion blur triangles with K rays. */
+    template<int M, int K, bool filter>
+    struct TriangleMvMBIntersectorKPluecker
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef PlueckerIntersectorK<M,K> Precalculations;
+
+      /*! Intersects K rays with M triangles. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> time(ray.time());
+          const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M triangles. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> time(ray.time());
+          const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+
+      /*! Intersect a ray with M triangles and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<M> time(ray.time()[k]);
+        const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0));
+        const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1));
+        const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2));
+        pre.intersect(ray,k,v0,v1,v2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Test if the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<M> time(ray.time()[k]);
+        const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0));
+        const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1));
+        const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2));
+        return pre.intersect(ray,k,v0,v1,v2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h
new file mode 100644
index 0000000000..10f315cee7
--- /dev/null
+++ b/thirdparty/embree/kernels/hash.h
@@ -0,0 +1,5 @@
+
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_HASH "7c53133eb21424f7f0ae1e25bf357e358feaf6ab"
diff --git a/thirdparty/embree/kernels/subdiv/bezier_curve.h b/thirdparty/embree/kernels/subdiv/bezier_curve.h
new file mode 100644
index 0000000000..a5adad5cc9
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/bezier_curve.h
@@ -0,0 +1,671 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+//#include "../common/scene_curves.h"
+#include "../common/context.h"
+
+namespace embree
+{
+  class BezierBasis
+  {
+  public:
+
+    template<typename T>
+      static __forceinline Vec4<T> eval(const T& u) 
+    {
+      const T t1 = u;
+      const T t0 = 1.0f-t1;
+      const T B0 = t0 * t0 * t0;
+      const T B1 = 3.0f * t1 * (t0 * t0);
+      const T B2 = 3.0f * (t1 * t1) * t0;
+      const T B3 = t1 * t1 * t1;
+      return Vec4<T>(B0,B1,B2,B3);
+    }
+    
+    template<typename T>
+      static __forceinline Vec4<T>  derivative(const T& u)
+    {
+      const T t1 = u;
+      const T t0 = 1.0f-t1;
+      const T B0 = -(t0*t0);
+      const T B1 = madd(-2.0f,t0*t1,t0*t0);
+      const T B2 = msub(+2.0f,t0*t1,t1*t1);
+      const T B3 = +(t1*t1);
+      return T(3.0f)*Vec4<T>(B0,B1,B2,B3);
+    }
+
+    template<typename T>
+      static __forceinline Vec4<T>  derivative2(const T& u)
+    {
+      const T t1 = u;
+      const T t0 = 1.0f-t1;
+      const T B0 = t0;
+      const T B1 = madd(-2.0f,t0,t1);
+      const T B2 = madd(-2.0f,t1,t0);
+      const T B3 = t1;
+      return T(6.0f)*Vec4<T>(B0,B1,B2,B3);
+    }
+  };
+  
+  struct PrecomputedBezierBasis
+  {
+    enum { N = 16 };
+  public:
+    PrecomputedBezierBasis() {}
+    PrecomputedBezierBasis(int shift);
+
+    /* basis for bezier evaluation */
+  public:
+    float c0[N+1][N+1];
+    float c1[N+1][N+1];
+    float c2[N+1][N+1];
+    float c3[N+1][N+1];
+    
+    /* basis for bezier derivative evaluation */
+  public:
+    float d0[N+1][N+1];
+    float d1[N+1][N+1];
+    float d2[N+1][N+1];
+    float d3[N+1][N+1];
+  };
+  extern PrecomputedBezierBasis bezier_basis0;
+  extern PrecomputedBezierBasis bezier_basis1;
+
+  
+  template<typename V>
+    struct LinearBezierCurve
+    {
+      V v0,v1;
+      
+      __forceinline LinearBezierCurve () {}
+      
+      __forceinline LinearBezierCurve (const LinearBezierCurve& other)
+        : v0(other.v0), v1(other.v1) {}
+      
+      __forceinline LinearBezierCurve& operator= (const LinearBezierCurve& other) {
+        v0 = other.v0; v1 = other.v1; return *this;
+      }
+        
+        __forceinline LinearBezierCurve (const V& v0, const V& v1)
+          : v0(v0), v1(v1) {}
+      
+      __forceinline V begin() const { return v0; }
+      __forceinline V end  () const { return v1; }
+      
+      bool hasRoot() const;
+      
+      friend embree_ostream operator<<(embree_ostream cout, const LinearBezierCurve& a) {
+        return cout << "LinearBezierCurve (" << a.v0 << ", " << a.v1 << ")";
+      }
+    };
+  
+  template<> __forceinline bool LinearBezierCurve<Interval1f>::hasRoot() const {
+    return numRoots(v0,v1);
+  }
+  
+  template<typename V>
+    struct QuadraticBezierCurve
+    {
+      V v0,v1,v2;
+      
+      __forceinline QuadraticBezierCurve () {}
+      
+      __forceinline QuadraticBezierCurve (const QuadraticBezierCurve& other)
+        : v0(other.v0), v1(other.v1), v2(other.v2) {}
+      
+      __forceinline QuadraticBezierCurve& operator= (const QuadraticBezierCurve& other) {
+        v0 = other.v0; v1 = other.v1; v2 = other.v2; return *this;
+      }
+        
+        __forceinline QuadraticBezierCurve (const V& v0, const V& v1, const V& v2)
+          : v0(v0), v1(v1), v2(v2) {}
+      
+      __forceinline V begin() const { return v0; }
+      __forceinline V end  () const { return v2; }
+      
+      __forceinline V interval() const {
+        return merge(v0,v1,v2);
+      }
+      
+      __forceinline BBox<V> bounds() const {
+        return merge(BBox<V>(v0),BBox<V>(v1),BBox<V>(v2));
+      }
+      
+      friend embree_ostream operator<<(embree_ostream cout, const QuadraticBezierCurve& a) {
+        return cout << "QuadraticBezierCurve ( (" << a.u.lower << ", " << a.u.upper << "), " << a.v0 << ", " << a.v1 << ", " << a.v2 << ")";
+      }
+    };
+  
+  
+  typedef QuadraticBezierCurve<float> QuadraticBezierCurve1f;
+  typedef QuadraticBezierCurve<Vec2fa> QuadraticBezierCurve2fa;
+  typedef QuadraticBezierCurve<Vec3fa> QuadraticBezierCurve3fa;
+
+  template<typename Vertex>
+    struct CubicBezierCurve
+    {
+      Vertex v0,v1,v2,v3;
+      
+      __forceinline CubicBezierCurve() {}
+
+      template<typename T1>
+      __forceinline CubicBezierCurve (const CubicBezierCurve<T1>& other)
+      : v0(other.v0), v1(other.v1), v2(other.v2), v3(other.v3) {}
+      
+      __forceinline CubicBezierCurve& operator= (const CubicBezierCurve& other) {
+        v0 = other.v0; v1 = other.v1; v2 = other.v2; v3 = other.v3; return *this;
+      }
+      
+      __forceinline CubicBezierCurve(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3)
+        : v0(v0), v1(v1), v2(v2), v3(v3) {}
+
+      __forceinline Vertex begin() const {
+        return v0;
+      }
+
+      __forceinline Vertex end() const {
+        return v3;
+      }
+
+      __forceinline Vertex center() const {
+        return 0.25f*(v0+v1+v2+v3);
+      }
+
+      __forceinline Vertex begin_direction() const {
+        return v1-v0;
+      }
+
+      __forceinline Vertex end_direction() const {
+        return v3-v2;
+      }
+
+      __forceinline CubicBezierCurve<float> xfm(const Vertex& dx) const {
+        return CubicBezierCurve<float>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx));
+      }
+      
+      __forceinline CubicBezierCurve<vfloatx> vxfm(const Vertex& dx) const {
+        return CubicBezierCurve<vfloatx>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx));
+      }
+      
+      __forceinline CubicBezierCurve<float> xfm(const Vertex& dx, const Vertex& p) const {
+        return CubicBezierCurve<float>(dot(v0-p,dx),dot(v1-p,dx),dot(v2-p,dx),dot(v3-p,dx));
+      }
+
+       __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space) const
+      {
+        const Vec3fa q0 = xfmVector(space,v0);
+        const Vec3fa q1 = xfmVector(space,v1);
+        const Vec3fa q2 = xfmVector(space,v2);
+        const Vec3fa q3 = xfmVector(space,v3);
+        return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3);
+      }
+      
+      __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3fa q0 = xfmVector(space,v0-p);
+        const Vec3fa q1 = xfmVector(space,v1-p);
+        const Vec3fa q2 = xfmVector(space,v2-p);
+        const Vec3fa q3 = xfmVector(space,v3-p);
+        return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3);
+      }
+
+      __forceinline CubicBezierCurve<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w);
+        const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w);
+        const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w);
+        const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w);
+        return CubicBezierCurve<Vec3ff>(q0,q1,q2,q3);
+      }
+
+      __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const
+      {
+        const Vec3fa q0 = xfmVector(space,s*(v0-p));
+        const Vec3fa q1 = xfmVector(space,s*(v1-p));
+        const Vec3fa q2 = xfmVector(space,s*(v2-p));
+        const Vec3fa q3 = xfmVector(space,s*(v3-p));
+        return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3);
+      }
+      
+      __forceinline int maxRoots() const;
+      
+      __forceinline BBox<Vertex> bounds() const {
+        return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3));
+      }
+      
+      __forceinline friend CubicBezierCurve operator +( const CubicBezierCurve& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(a.v0+b.v0,a.v1+b.v1,a.v2+b.v2,a.v3+b.v3);
+      }
+      
+      __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(a.v0-b.v0,a.v1-b.v1,a.v2-b.v2,a.v3-b.v3);
+      }
+      
+      __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const Vertex& b ) {
+        return CubicBezierCurve(a.v0-b,a.v1-b,a.v2-b,a.v3-b);
+      }
+      
+      __forceinline friend CubicBezierCurve operator *( const Vertex& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(a*b.v0,a*b.v1,a*b.v2,a*b.v3);
+      }
+
+      __forceinline friend CubicBezierCurve cmadd( const Vertex& a, const CubicBezierCurve& b,  const CubicBezierCurve& c) {
+        return CubicBezierCurve(madd(a,b.v0,c.v0),madd(a,b.v1,c.v1),madd(a,b.v2,c.v2),madd(a,b.v3,c.v3));
+      }
+      
+      __forceinline friend CubicBezierCurve clerp ( const CubicBezierCurve& a, const CubicBezierCurve& b, const Vertex& t ) {
+        return cmadd((Vertex(1.0f)-t),a,t*b);
+      }
+      
+      __forceinline friend CubicBezierCurve merge ( const CubicBezierCurve& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(merge(a.v0,b.v0),merge(a.v1,b.v1),merge(a.v2,b.v2),merge(a.v3,b.v3));
+      }
+      
+      __forceinline void split(CubicBezierCurve& left, CubicBezierCurve& right, const float t = 0.5f) const
+      {
+        const Vertex p00 = v0;
+        const Vertex p01 = v1;
+        const Vertex p02 = v2;
+        const Vertex p03 = v3;
+        
+        const Vertex p10 = lerp(p00,p01,t);
+        const Vertex p11 = lerp(p01,p02,t);
+        const Vertex p12 = lerp(p02,p03,t);
+        const Vertex p20 = lerp(p10,p11,t);
+        const Vertex p21 = lerp(p11,p12,t);
+        const Vertex p30 = lerp(p20,p21,t);
+        
+        new (&left ) CubicBezierCurve(p00,p10,p20,p30);
+        new (&right) CubicBezierCurve(p30,p21,p12,p03);
+      }
+      
+      __forceinline CubicBezierCurve<Vec2vfx> split() const
+      {
+        const float u0 = 0.0f, u1 = 1.0f;
+        const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1)));
+        Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale);
+        const Vec2vfx P3 = shift_right_1(P0);
+        const Vec2vfx dP3du = shift_right_1(dP0du); 
+        const Vec2vfx P1 = P0 + dP0du; 
+        const Vec2vfx P2 = P3 - dP3du;
+        return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3);
+      }
+      
+      __forceinline CubicBezierCurve<Vec2vfx> split(const BBox1f& u) const
+      {
+        const float u0 = u.lower, u1 = u.upper;
+        const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1)));
+        Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale);
+        const Vec2vfx P3 = shift_right_1(P0);
+        const Vec2vfx dP3du = shift_right_1(dP0du); 
+        const Vec2vfx P1 = P0 + dP0du; 
+        const Vec2vfx P2 = P3 - dP3du;
+        return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3);
+      }
+      
+      __forceinline void eval(float t, Vertex& p, Vertex& dp) const
+      {
+        const Vertex p00 = v0;
+        const Vertex p01 = v1;
+        const Vertex p02 = v2;
+        const Vertex p03 = v3;
+        
+        const Vertex p10 = lerp(p00,p01,t);
+        const Vertex p11 = lerp(p01,p02,t);
+        const Vertex p12 = lerp(p02,p03,t);
+        const Vertex p20 = lerp(p10,p11,t);
+        const Vertex p21 = lerp(p11,p12,t);
+        const Vertex p30 = lerp(p20,p21,t);
+        
+        p = p30;
+        dp = Vertex(3.0f)*(p21-p20);
+      }
+
+#if 0
+      __forceinline Vertex eval(float t) const
+      {
+        const Vertex p00 = v0;
+        const Vertex p01 = v1;
+        const Vertex p02 = v2;
+        const Vertex p03 = v3;
+        
+        const Vertex p10 = lerp(p00,p01,t);
+        const Vertex p11 = lerp(p01,p02,t);
+        const Vertex p12 = lerp(p02,p03,t);
+        const Vertex p20 = lerp(p10,p11,t);
+        const Vertex p21 = lerp(p11,p12,t);
+        const Vertex p30 = lerp(p20,p21,t);
+        
+        return p30;
+      }
+#else
+      __forceinline Vertex eval(const float t) const 
+      {
+        const Vec4<float> b = BezierBasis::eval(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+#endif
+      
+      __forceinline Vertex eval_dt(float t) const
+      {
+        const Vertex p00 = v1-v0;
+        const Vertex p01 = v2-v1;
+        const Vertex p02 = v3-v2;
+        const Vertex p10 = lerp(p00,p01,t);
+        const Vertex p11 = lerp(p01,p02,t);
+        const Vertex p20 = lerp(p10,p11,t);
+        return Vertex(3.0f)*p20;
+      }
+
+      __forceinline Vertex eval_du(const float t) const
+      {
+        const Vec4<float> b = BezierBasis::derivative(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+
+      __forceinline Vertex eval_dudu(const float t) const 
+      {
+        const Vec4<float> b = BezierBasis::derivative2(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline void evalN(const vfloatx& t, Vec2vfx& p, Vec2vfx& dp) const
+      {
+        const Vec2vfx p00 = v0;
+        const Vec2vfx p01 = v1;
+        const Vec2vfx p02 = v2;
+        const Vec2vfx p03 = v3;
+        
+        const Vec2vfx p10 = lerp(p00,p01,t);
+        const Vec2vfx p11 = lerp(p01,p02,t);
+        const Vec2vfx p12 = lerp(p02,p03,t);
+        
+        const Vec2vfx p20 = lerp(p10,p11,t);
+        const Vec2vfx p21 = lerp(p11,p12,t);
+        
+        const Vec2vfx p30 = lerp(p20,p21,t);
+        
+        p = p30;
+        dp = vfloatx(3.0f)*(p21-p20);
+      }
+
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
+      {
+        const Vertex p00 = v0;
+        const Vertex p01 = v1;
+        const Vertex p02 = v2;
+        const Vertex p03 = v3;
+        const Vertex p10 = lerp(p00,p01,t);
+        const Vertex p11 = lerp(p01,p02,t);
+        const Vertex p12 = lerp(p02,p03,t);
+        const Vertex p20 = lerp(p10,p11,t);
+        const Vertex p21 = lerp(p11,p12,t);
+        const Vertex p30 = lerp(p20,p21,t);
+        p = p30;
+        dp = 3.0f*(p21-p20);
+        ddp = eval_dudu(t);
+      }
+      
+      __forceinline CubicBezierCurve clip(const Interval1f& u1) const
+      {
+        Vertex f0,df0; eval(u1.lower,f0,df0);
+        Vertex f1,df1; eval(u1.upper,f1,df1);
+        float s = u1.upper-u1.lower;
+        return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1);
+      }
+      
+      __forceinline QuadraticBezierCurve<Vertex> derivative() const
+      {
+        const Vertex q0 = 3.0f*(v1-v0);
+        const Vertex q1 = 3.0f*(v2-v1);
+        const Vertex q2 = 3.0f*(v3-v2);
+        return QuadraticBezierCurve<Vertex>(q0,q1,q2);
+      }
+      
+      __forceinline BBox<Vertex> derivative_bounds(const Interval1f& u1) const
+      {
+        Vertex f0,df0; eval(u1.lower,f0,df0);
+        Vertex f3,df3; eval(u1.upper,f3,df3);
+        const float s = u1.upper-u1.lower;
+        const Vertex f1 = f0+s*(1.0f/3.0f)*df0;
+        const Vertex f2 = f3-s*(1.0f/3.0f)*df3;
+        const Vertex q0 = s*df0;
+        const Vertex q1 = 3.0f*(f2-f1);
+        const Vertex q2 = s*df3;
+        return merge(BBox<Vertex>(q0),BBox<Vertex>(q1),BBox<Vertex>(q2));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> veval(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BezierBasis::eval(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BezierBasis::derivative(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BezierBasis::derivative2(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const
+      {
+        const Vec4vf<M> p00 = v0;
+        const Vec4vf<M> p01 = v1;
+        const Vec4vf<M> p02 = v2;
+        const Vec4vf<M> p03 = v3;
+        
+        const Vec4vf<M> p10 = lerp(p00,p01,t);
+        const Vec4vf<M> p11 = lerp(p01,p02,t);
+        const Vec4vf<M> p12 = lerp(p02,p03,t);
+        const Vec4vf<M> p20 = lerp(p10,p11,t);
+        const Vec4vf<M> p21 = lerp(p11,p12,t);
+        const Vec4vf<M> p30 = lerp(p20,p21,t);
+        
+        p = p30;
+        dp = vfloat<M>(3.0f)*(p21-p20);
+      }
+      
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec eval0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBezierBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bezier_basis0.c0[size][ofs]), Vec(v0),
+                    madd(vfloat<M>::loadu(&bezier_basis0.c1[size][ofs]), Vec(v1),
+                         madd(vfloat<M>::loadu(&bezier_basis0.c2[size][ofs]), Vec(v2),
+                              vfloat<M>::loadu(&bezier_basis0.c3[size][ofs]) * Vec(v3))));
+      }
+      
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec eval1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBezierBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bezier_basis1.c0[size][ofs]), Vec(v0), 
+                    madd(vfloat<M>::loadu(&bezier_basis1.c1[size][ofs]), Vec(v1),
+                         madd(vfloat<M>::loadu(&bezier_basis1.c2[size][ofs]), Vec(v2),
+                              vfloat<M>::loadu(&bezier_basis1.c3[size][ofs]) * Vec(v3))));
+      }
+      
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec derivative0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBezierBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bezier_basis0.d0[size][ofs]), Vec(v0),
+                    madd(vfloat<M>::loadu(&bezier_basis0.d1[size][ofs]), Vec(v1),
+                         madd(vfloat<M>::loadu(&bezier_basis0.d2[size][ofs]), Vec(v2),
+                              vfloat<M>::loadu(&bezier_basis0.d3[size][ofs]) * Vec(v3))));
+      }
+      
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec derivative1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBezierBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bezier_basis1.d0[size][ofs]), Vec(v0),
+                    madd(vfloat<M>::loadu(&bezier_basis1.d1[size][ofs]), Vec(v1),
+                         madd(vfloat<M>::loadu(&bezier_basis1.d2[size][ofs]), Vec(v2),
+                              vfloat<M>::loadu(&bezier_basis1.d3[size][ofs]) * Vec(v3))));
+      }
+
+      /* calculates bounds of bezier curve geometry */
+      __forceinline BBox3fa accurateBounds() const
+      {
+        const int N = 7;
+        const float scale = 1.0f/(3.0f*(N-1));
+        Vec3vfx pl(pos_inf), pu(neg_inf);
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step);
+          vboolx valid = vi <= vintx(N);
+          const Vec3vfx p  = eval0<VSIZEX,Vec3vf<VSIZEX>>(i,N);
+          const Vec3vfx dp = derivative0<VSIZEX,Vec3vf<VSIZEX>>(i,N);
+          const Vec3vfx pm = p-Vec3vfx(scale)*select(vi!=vintx(0),dp,Vec3vfx(zero));
+          const Vec3vfx pp = p+Vec3vfx(scale)*select(vi!=vintx(N),dp,Vec3vfx(zero));
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        return BBox3fa(lower,upper);
+      }
+      
+      /* calculates bounds of bezier curve geometry */
+      __forceinline BBox3fa accurateRoundBounds() const
+      {
+        const int N = 7;
+        const float scale = 1.0f/(3.0f*(N-1));
+        Vec4vfx pl(pos_inf), pu(neg_inf);
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step);
+          vboolx valid = vi <= vintx(N);
+          const Vec4vfx p  = eval0<VSIZEX>(i,N);
+          const Vec4vfx dp = derivative0<VSIZEX>(i,N);
+          const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero));
+          const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero));
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        const float r_min = reduce_min(pl.w);
+        const float r_max = reduce_max(pu.w);
+        const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max)));
+        return enlarge(BBox3fa(lower,upper),upper_r);
+      }
+      
+      /* calculates bounds when tessellated into N line segments */
+      __forceinline BBox3fa accurateFlatBounds(int N) const
+      {
+        if (likely(N == 4))
+        {
+          const Vec4vf4 pi = eval0<4>(0,4);
+          const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z));
+          const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z));
+          const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w)));
+          return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w))));
+        } 
+        else
+        {
+          Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f);
+          for (int i=0; i<N; i+=VSIZEX)
+          {
+            vboolx valid = vintx(i)+vintx(step) < vintx(N);
+            const Vec4vfx pi = eval0<VSIZEX>(i,N);
+            
+            pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min
+            pl.y = select(valid,min(pl.y,pi.y),pl.y); 
+            pl.z = select(valid,min(pl.z,pi.z),pl.z); 
+            
+            pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min
+            pu.y = select(valid,max(pu.y,pi.y),pu.y); 
+            pu.z = select(valid,max(pu.z,pi.z),pu.z); 
+            
+            ru   = select(valid,max(ru,abs(pi.w)),ru);
+          }
+          const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+          const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+          const Vec3fa upper_r(reduce_max(ru));
+          return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w))));
+        }
+      }
+      
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const CubicBezierCurve& curve) {
+        return cout << "CubicBezierCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }";
+      }
+    };
+
+#if defined(__AVX__)
+  template<>
+    __forceinline CubicBezierCurve<vfloat4> CubicBezierCurve<vfloat4>::clip(const Interval1f& u1) const
+  {
+    const vfloat8 p00 = vfloat8(v0);
+    const vfloat8 p01 = vfloat8(v1);
+    const vfloat8 p02 = vfloat8(v2);
+    const vfloat8 p03 = vfloat8(v3);
+
+    const vfloat8 t(vfloat4(u1.lower),vfloat4(u1.upper));
+    const vfloat8 p10 = lerp(p00,p01,t);
+    const vfloat8 p11 = lerp(p01,p02,t);
+    const vfloat8 p12 = lerp(p02,p03,t);
+    const vfloat8 p20 = lerp(p10,p11,t);
+    const vfloat8 p21 = lerp(p11,p12,t);
+    const vfloat8 p30 = lerp(p20,p21,t);
+    
+    const vfloat8 f01  = p30;
+    const vfloat8 df01 = vfloat8(3.0f)*(p21-p20);
+        
+    const vfloat4 f0  = extract4<0>(f01),  f1  = extract4<1>(f01);
+    const vfloat4 df0 = extract4<0>(df01), df1 = extract4<1>(df01);
+    const float s = u1.upper-u1.lower;
+    return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1);
+  }
+#endif
+  
+  template<typename Vertex> using BezierCurveT = CubicBezierCurve<Vertex>;
+  
+  typedef CubicBezierCurve<float> CubicBezierCurve1f;
+  typedef CubicBezierCurve<Vec2fa> CubicBezierCurve2fa;
+  typedef CubicBezierCurve<Vec3fa> CubicBezierCurve3fa;
+  typedef CubicBezierCurve<Vec3fa> BezierCurve3fa;
+  
+  template<> __forceinline int CubicBezierCurve<float>::maxRoots() const
+  {
+    float eps = 1E-4f;
+    bool neg0 = v0 <= 0.0f; bool zero0 = fabs(v0) < eps;
+    bool neg1 = v1 <= 0.0f; bool zero1 = fabs(v1) < eps;
+    bool neg2 = v2 <= 0.0f; bool zero2 = fabs(v2) < eps;
+    bool neg3 = v3 <= 0.0f; bool zero3 = fabs(v3) < eps;
+    return (neg0 != neg1 || zero0) + (neg1 != neg2 || zero1) + (neg2 != neg3 || zero2 || zero3);
+  }
+  
+  template<> __forceinline int CubicBezierCurve<Interval1f>::maxRoots() const {
+    return numRoots(v0,v1) + numRoots(v1,v2) + numRoots(v2,v3);
+  }
+
+  template<typename CurveGeometry>
+  __forceinline CubicBezierCurve<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CubicBezierCurve<Vec3ff>& curve)
+  {
+    return CubicBezierCurve<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3));
+  }
+}
diff --git a/thirdparty/embree/kernels/subdiv/bezier_patch.h b/thirdparty/embree/kernels/subdiv/bezier_patch.h
new file mode 100644
index 0000000000..2ff03902a7
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/bezier_patch.h
@@ -0,0 +1,372 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bezier_curve.h"
+
+namespace embree
+{  
+  template<class T, class S>
+    static __forceinline T deCasteljau(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3)
+  {
+    const T v0_1 = lerp(v0,v1,uu);
+    const T v1_1 = lerp(v1,v2,uu);
+    const T v2_1 = lerp(v2,v3,uu);
+    const T v0_2 = lerp(v0_1,v1_1,uu);
+    const T v1_2 = lerp(v1_1,v2_1,uu);
+    const T v0_3 = lerp(v0_2,v1_2,uu);
+    return v0_3;
+  }
+  
+  template<class T, class S>
+    static __forceinline T deCasteljau_tangent(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3)
+  {
+    const T v0_1 = lerp(v0,v1,uu);
+    const T v1_1 = lerp(v1,v2,uu);
+    const T v2_1 = lerp(v2,v3,uu);
+    const T v0_2 = lerp(v0_1,v1_1,uu);
+    const T v1_2 = lerp(v1_1,v2_1,uu);
+    return S(3.0f)*(v1_2-v0_2);
+  }
+
+  template<typename Vertex>
+    __forceinline Vertex computeInnerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) {
+    return 1.0f / 36.0f * (16.0f * v[y][x] + 4.0f * (v[y-1][x] +  v[y+1][x] + v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y+1][x+1] + v[y-1][x+1] + v[y+1][x-1]));
+  }
+  
+  template<typename Vertex>
+    __forceinline Vertex computeTopEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) {
+    return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y-1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y-1][x+1]));
+  }
+
+  template<typename Vertex>
+    __forceinline Vertex computeBottomEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) {
+    return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y+1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + v[y+1][x-1] + v[y+1][x+1]);
+  }
+  
+  template<typename Vertex>
+    __forceinline Vertex computeLeftEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) {
+    return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x-1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x-1] + v[y+1][x-1]);
+  }
+  
+  template<typename Vertex>
+    __forceinline Vertex computeRightEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) {
+    return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x+1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x+1] + v[y+1][x+1]);
+  }
+  
+  template<typename Vertex>
+    __forceinline Vertex computeCornerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x, const ssize_t delta_y, const ssize_t delta_x)
+  {
+    return 1.0f / 9.0f * (4.0f * v[y][x] + 2.0f * (v[y+delta_y][x] + v[y][x+delta_x]) + v[y+delta_y][x+delta_x]);
+  }
+
+  template<typename Vertex, typename Vertex_t>
+    class __aligned(64) BezierPatchT
+  {
+   public:
+      Vertex matrix[4][4];
+    
+  public:
+
+    __forceinline BezierPatchT() {}
+
+    __forceinline BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride);
+
+    __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch);
+
+    __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch,
+                               const BezierCurveT<Vertex>* border0,
+                               const BezierCurveT<Vertex>* border1,
+                               const BezierCurveT<Vertex>* border2,
+                               const BezierCurveT<Vertex>* border3);
+                               
+    __forceinline BezierPatchT(const BSplinePatchT<Vertex,Vertex_t>& source)
+    {
+      /* compute inner bezier control points */
+      matrix[0][0] = computeInnerBezierControlPoint(source.v,1,1);
+      matrix[0][3] = computeInnerBezierControlPoint(source.v,1,2);
+      matrix[3][3] = computeInnerBezierControlPoint(source.v,2,2);
+      matrix[3][0] = computeInnerBezierControlPoint(source.v,2,1);
+      
+      /* compute top edge control points */
+      matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1);
+      matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); 
+      
+      /* compute buttom edge control points */
+      matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1);
+      matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2);
+      
+      /* compute left edge control points */
+      matrix[1][0] = computeBottomEdgeBezierControlPoint(source.v,1,1);
+      matrix[2][0] = computeTopEdgeBezierControlPoint(source.v,2,1);
+      
+      /* compute right edge control points */
+      matrix[1][3] = computeBottomEdgeBezierControlPoint(source.v,1,2);
+      matrix[2][3] = computeTopEdgeBezierControlPoint(source.v,2,2);
+      
+      /* compute corner control points */
+      matrix[1][1] = computeCornerBezierControlPoint(source.v,1,1, 1, 1);
+      matrix[1][2] = computeCornerBezierControlPoint(source.v,1,2, 1,-1);
+      matrix[2][2] = computeCornerBezierControlPoint(source.v,2,2,-1,-1);
+      matrix[2][1] = computeCornerBezierControlPoint(source.v,2,1,-1, 1);      
+    }
+
+    static __forceinline Vertex_t bilinear(const Vec4f Bu, const Vertex matrix[4][4], const Vec4f Bv)
+    {
+      const Vertex_t M0 = madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))); 
+      const Vertex_t M1 = madd(Bu.x,matrix[1][0],madd(Bu.y,matrix[1][1],madd(Bu.z,matrix[1][2],Bu.w * matrix[1][3])));
+      const Vertex_t M2 = madd(Bu.x,matrix[2][0],madd(Bu.y,matrix[2][1],madd(Bu.z,matrix[2][2],Bu.w * matrix[2][3])));
+      const Vertex_t M3 = madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])));
+      return madd(Bv.x,M0,madd(Bv.y,M1,madd(Bv.z,M2,Bv.w*M3)));
+    }
+
+    static __forceinline Vertex_t eval(const Vertex matrix[4][4], const float uu, const float vv) 
+    {      
+      const Vec4f Bu = BezierBasis::eval(uu);
+      const Vec4f Bv = BezierBasis::eval(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    static __forceinline Vertex_t eval_du(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::derivative(uu);
+      const Vec4f Bv = BezierBasis::eval(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    static __forceinline Vertex_t eval_dv(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::eval(uu);
+      const Vec4f Bv = BezierBasis::derivative(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    static __forceinline Vertex_t eval_dudu(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::derivative2(uu);
+      const Vec4f Bv = BezierBasis::eval(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    static __forceinline Vertex_t eval_dvdv(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::eval(uu);
+      const Vec4f Bv = BezierBasis::derivative2(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    static __forceinline Vertex_t eval_dudv(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::derivative(uu);
+      const Vec4f Bv = BezierBasis::derivative(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    static __forceinline Vertex_t normal(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vertex_t dPdu = eval_du(matrix,uu,vv);
+      const Vertex_t dPdv = eval_dv(matrix,uu,vv);
+      return cross(dPdu,dPdv);
+    }
+
+    __forceinline Vertex_t normal(const float uu, const float vv) 
+    {
+      const Vertex_t dPdu = eval_du(matrix,uu,vv);
+      const Vertex_t dPdv = eval_dv(matrix,uu,vv);
+      return cross(dPdu,dPdv);
+    }
+
+    __forceinline Vertex_t eval(const float uu, const float vv) const {
+      return eval(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_du(const float uu, const float vv) const { 
+      return eval_du(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_dv(const float uu, const float vv) const {
+      return eval_dv(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_dudu(const float uu, const float vv) const { 
+      return eval_dudu(matrix,uu,vv);
+    }
+    
+    __forceinline Vertex_t eval_dvdv(const float uu, const float vv) const { 
+      return eval_dvdv(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_dudv(const float uu, const float vv) const { 
+      return eval_dudv(matrix,uu,vv);
+    }
+
+    __forceinline void eval(const float u, const float v, Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, const float dscale = 1.0f) const
+    {
+      if (P) {
+        *P = eval(u,v); 
+      }
+      if (dPdu) {
+        assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+        assert(dPdv); *dPdv = eval_dv(u,v)*dscale; 
+      }
+      if (ddPdudu) {
+        assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+        assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); 
+        assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); 
+      }
+    }
+
+    template<class vfloat>
+      __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const
+      {
+        const vfloat curve0_x = v_n[0] * vfloat(matrix[0][0][i]) + v_n[1] * vfloat(matrix[1][0][i]) + v_n[2] * vfloat(matrix[2][0][i]) + v_n[3] * vfloat(matrix[3][0][i]);
+        const vfloat curve1_x = v_n[0] * vfloat(matrix[0][1][i]) + v_n[1] * vfloat(matrix[1][1][i]) + v_n[2] * vfloat(matrix[2][1][i]) + v_n[3] * vfloat(matrix[3][1][i]);
+        const vfloat curve2_x = v_n[0] * vfloat(matrix[0][2][i]) + v_n[1] * vfloat(matrix[1][2][i]) + v_n[2] * vfloat(matrix[2][2][i]) + v_n[3] * vfloat(matrix[3][2][i]);
+        const vfloat curve3_x = v_n[0] * vfloat(matrix[0][3][i]) + v_n[1] * vfloat(matrix[1][3][i]) + v_n[2] * vfloat(matrix[2][3][i]) + v_n[3] * vfloat(matrix[3][3][i]);
+        return u_n[0] * curve0_x + u_n[1] * curve1_x + u_n[2] * curve2_x + u_n[3] * curve3_x;
+      }
+
+    template<typename vbool, typename vfloat>
+      __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                              float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                              const float dscale, const size_t dstride, const size_t N) const
+      {
+        if (P) {
+          const Vec4<vfloat> u_n = BezierBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+          for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n));
+        }
+        if (dPdu) 
+        {
+          {
+            assert(dPdu);
+            const Vec4<vfloat> u_n = BezierBasis::derivative(uu);
+            const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+          {
+            assert(dPdv);
+            const Vec4<vfloat> u_n = BezierBasis::eval(uu);
+            const Vec4<vfloat> v_n = BezierBasis::derivative(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+        }
+        if (ddPdudu) 
+        {
+          {
+            assert(ddPdudu);
+            const Vec4<vfloat> u_n = BezierBasis::derivative2(uu);
+            const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          {
+            assert(ddPdvdv);
+            const Vec4<vfloat> u_n = BezierBasis::eval(uu);
+            const Vec4<vfloat> v_n = BezierBasis::derivative2(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          {
+            assert(ddPdudv);
+            const Vec4<vfloat> u_n = BezierBasis::derivative(uu);
+            const Vec4<vfloat> v_n = BezierBasis::derivative(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+        }
+      }
+
+    template<typename T>
+      static __forceinline Vec3<T> eval(const Vertex matrix[4][4], const T& uu, const T& vv) 
+    {      
+      const T one_minus_uu = 1.0f - uu;
+      const T one_minus_vv = 1.0f - vv;      
+
+      const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu;
+      const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv;
+      const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu);
+      const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv);
+      const T B2_u = 3.0f * (uu * one_minus_uu * uu);
+      const T B2_v = 3.0f * (vv * one_minus_vv * vv);
+      const T B3_u = uu * uu * uu;
+      const T B3_v = vv * vv * vv;
+      
+      const T x = 
+        madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u*matrix[0][3].x))), 
+        madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,matrix[1][1].x,madd(B2_u,matrix[1][2].x,B3_u*matrix[1][3].x))),
+        madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,matrix[2][1].x,madd(B2_u,matrix[2][2].x,B3_u*matrix[2][3].x))),
+             B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u*matrix[3][3].x)))))); 
+
+      const T y = 
+        madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u*matrix[0][3].y))), 
+        madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,matrix[1][1].y,madd(B2_u,matrix[1][2].y,B3_u*matrix[1][3].y))),
+        madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,matrix[2][1].y,madd(B2_u,matrix[2][2].y,B3_u*matrix[2][3].y))),
+             B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u*matrix[3][3].y)))))); 
+      
+      const T z = 
+        madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u*matrix[0][3].z))), 
+        madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,matrix[1][1].z,madd(B2_u,matrix[1][2].z,B3_u*matrix[1][3].z))),
+        madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,matrix[2][1].z,madd(B2_u,matrix[2][2].z,B3_u*matrix[2][3].z))),
+             B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u*matrix[3][3].z)))))); 
+      
+      return Vec3<T>(x,y,z);
+    }
+
+    template<typename vfloat>
+      __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const {     
+      return eval(matrix,uu,vv);
+    }
+
+    template<class T>
+      static __forceinline Vec3<T> normal(const Vertex matrix[4][4], const T& uu, const T& vv) 
+    {
+      
+      const Vec3<T> matrix_00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z);
+      const Vec3<T> matrix_01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z);
+      const Vec3<T> matrix_02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z);
+      const Vec3<T> matrix_03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z);
+
+      const Vec3<T> matrix_10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z);
+      const Vec3<T> matrix_11 = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z);
+      const Vec3<T> matrix_12 = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z);
+      const Vec3<T> matrix_13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z);
+
+      const Vec3<T> matrix_20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z);
+      const Vec3<T> matrix_21 = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z);
+      const Vec3<T> matrix_22 = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z);
+      const Vec3<T> matrix_23 = Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z);
+
+      const Vec3<T> matrix_30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z);
+      const Vec3<T> matrix_31 = Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z);
+      const Vec3<T> matrix_32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z);
+      const Vec3<T> matrix_33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z);
+            
+      /* tangentU */
+      const Vec3<T> col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30);
+      const Vec3<T> col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31);
+      const Vec3<T> col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32);
+      const Vec3<T> col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33);
+      
+      const Vec3<T> tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3);
+      
+      /* tangentV */
+      const Vec3<T> row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03);
+      const Vec3<T> row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13);
+      const Vec3<T> row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23);
+      const Vec3<T> row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33);
+      
+      const Vec3<T> tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3);
+      
+      /* normal = tangentU x tangentV */
+      const Vec3<T> n = cross(tangentU,tangentV);
+      return n;
+    }
+
+    template<typename vfloat>
+      __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const {     
+      return normal(matrix,uu,vv);
+    }
+  };
+
+  typedef BezierPatchT<Vec3fa,Vec3fa_t> BezierPatch3fa;
+}
diff --git a/thirdparty/embree/kernels/subdiv/bilinear_patch.h b/thirdparty/embree/kernels/subdiv/bilinear_patch.h
new file mode 100644
index 0000000000..cade104a6c
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/bilinear_patch.h
@@ -0,0 +1,191 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) BilinearPatchT
+    {
+      typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+      typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+      
+    public:
+      Vertex v[4];
+      
+    public:
+      
+      __forceinline BilinearPatchT () {}
+
+      __forceinline BilinearPatchT (const HalfEdge* edge, const BufferView<Vertex>& vertices) {
+        init(edge,vertices.getPtr(),vertices.getStride());
+      }
+      
+      __forceinline BilinearPatchT (const HalfEdge* edge, const char* vertices, size_t stride) {
+        init(edge,vertices,stride);
+      }
+
+      __forceinline void init (const HalfEdge* edge, const char* vertices, size_t stride)
+      {
+        v[0] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+        v[1] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+        v[2] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+        v[3] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+      }
+
+      __forceinline BilinearPatchT (const CatmullClarkPatch& patch)
+      {
+        v[0] = patch.ring[0].getLimitVertex();
+        v[1] = patch.ring[1].getLimitVertex();
+        v[2] = patch.ring[2].getLimitVertex();
+        v[3] = patch.ring[3].getLimitVertex();
+      }
+
+      __forceinline BBox<Vertex> bounds() const
+      {
+        
+        BBox<Vertex> bounds (v[0]);
+        bounds.extend(v[1]);
+        bounds.extend(v[2]);
+        bounds.extend(v[3]);
+        return bounds;
+      }
+      
+      __forceinline Vertex eval(const float uu, const float vv) const {
+        return lerp(lerp(v[0],v[1],uu),lerp(v[3],v[2],uu),vv);
+      }
+
+      __forceinline Vertex eval_du(const float uu, const float vv) const {
+        return lerp(v[1]-v[0],v[2]-v[3],vv);
+      }
+
+      __forceinline Vertex eval_dv(const float uu, const float vv) const {
+        return lerp(v[3]-v[0],v[2]-v[1],uu);
+      }
+
+      __forceinline Vertex eval_dudu(const float uu, const float vv) const {
+        return Vertex(zero);
+      }
+
+      __forceinline Vertex eval_dvdv(const float uu, const float vv) const {
+        return Vertex(zero);
+      }
+
+      __forceinline Vertex eval_dudv(const float uu, const float vv) const {
+        return (v[2]-v[3]) - (v[1]-v[0]);
+      }
+
+      __forceinline Vertex normal(const float uu, const float vv) const {
+        return cross(eval_du(uu,vv),eval_dv(uu,vv));
+      }
+      
+      __forceinline void eval(const float u, const float v, 
+                              Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv,
+                              const float dscale = 1.0f) const
+      {
+        if (P) {
+          *P = eval(u,v); 
+        }
+        if (dPdu) {
+          assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+          assert(dPdv); *dPdv = eval_dv(u,v)*dscale; 
+        }
+        if (ddPdudu) {
+          assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+          assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); 
+          assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); 
+        }
+      }
+
+      template<class vfloat>
+      __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const
+      {
+        const vfloat x = lerp(lerp(v[0].x,v[1].x,uu),lerp(v[3].x,v[2].x,uu),vv);
+        const vfloat y = lerp(lerp(v[0].y,v[1].y,uu),lerp(v[3].y,v[2].y,uu),vv);
+        const vfloat z = lerp(lerp(v[0].z,v[1].z,uu),lerp(v[3].z,v[2].z,uu),vv);
+        return Vec3<vfloat>(x,y,z);
+      }
+
+      template<class vfloat>
+      __forceinline Vec3<vfloat> eval_du(const vfloat& uu, const vfloat& vv) const
+      {
+        const vfloat x = lerp(v[1].x-v[0].x,v[2].x-v[3].x,vv);
+        const vfloat y = lerp(v[1].y-v[0].y,v[2].y-v[3].y,vv);
+        const vfloat z = lerp(v[1].z-v[0].z,v[2].z-v[3].z,vv);
+        return Vec3<vfloat>(x,y,z);
+      }
+
+      template<class vfloat>
+      __forceinline Vec3<vfloat> eval_dv(const vfloat& uu, const vfloat& vv) const
+      {
+        const vfloat x = lerp(v[3].x-v[0].x,v[2].x-v[1].x,uu);
+        const vfloat y = lerp(v[3].y-v[0].y,v[2].y-v[1].y,uu);
+        const vfloat z = lerp(v[3].z-v[0].z,v[2].z-v[1].z,uu);
+        return Vec3<vfloat>(x,y,z);
+      }
+
+      template<typename vfloat>
+      __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const {
+        return cross(eval_du(uu,vv),eval_dv(uu,vv));
+      }
+
+       template<class vfloat>
+      __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return lerp(lerp(v[0][i],v[1][i],uu),lerp(v[3][i],v[2][i],uu),vv);
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval_du(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return lerp(v[1][i]-v[0][i],v[2][i]-v[3][i],vv);
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval_dv(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return lerp(v[3][i]-v[0][i],v[2][i]-v[1][i],uu);
+      }
+      
+      template<class vfloat>
+      __forceinline vfloat eval_dudu(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return vfloat(zero);
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval_dvdv(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return vfloat(zero);
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval_dudv(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return (v[2][i]-v[3][i]) - (v[1][i]-v[0][i]);
+      }
+
+      template<typename vbool, typename vfloat>
+      __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                              float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                              const float dscale, const size_t dstride, const size_t N) const
+      {
+        if (P) {
+          for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv));
+        }
+        if (dPdu) {
+          for (size_t i=0; i<N; i++) {
+            assert(dPdu); vfloat::store(valid,dPdu+i*dstride,eval_du(i,uu,vv)*dscale);
+            assert(dPdv); vfloat::store(valid,dPdv+i*dstride,eval_dv(i,uu,vv)*dscale);
+          }
+        }
+        if (ddPdudu) {
+          for (size_t i=0; i<N; i++) {
+            assert(ddPdudu); vfloat::store(valid,ddPdudu+i*dstride,eval_dudu(i,uu,vv)*sqr(dscale));
+            assert(ddPdvdv); vfloat::store(valid,ddPdvdv+i*dstride,eval_dvdv(i,uu,vv)*sqr(dscale));
+            assert(ddPdudv); vfloat::store(valid,ddPdudv+i*dstride,eval_dudv(i,uu,vv)*sqr(dscale));
+          }
+        }
+      }
+    };
+  
+  typedef BilinearPatchT<Vec3fa,Vec3fa_t> BilinearPatch3fa;
+}
diff --git a/thirdparty/embree/kernels/subdiv/bspline_curve.h b/thirdparty/embree/kernels/subdiv/bspline_curve.h
new file mode 100644
index 0000000000..51489ef37c
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/bspline_curve.h
@@ -0,0 +1,320 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  class BSplineBasis
+  {
+  public:
+
+    template<typename T>
+      static __forceinline Vec4<T> eval(const T& u) 
+    {
+      const T t  = u;
+      const T s  = T(1.0f) - u;
+      const T n0 = s*s*s;
+      const T n1 = (4.0f*(s*s*s)+(t*t*t)) + (12.0f*((s*t)*s) + 6.0f*((t*s)*t));
+      const T n2 = (4.0f*(t*t*t)+(s*s*s)) + (12.0f*((t*s)*t) + 6.0f*((s*t)*s));
+      const T n3 = t*t*t;
+      return T(1.0f/6.0f)*Vec4<T>(n0,n1,n2,n3);
+    }
+    
+    template<typename T>
+      static __forceinline Vec4<T>  derivative(const T& u)
+    {
+      const T t  =  u;
+      const T s  =  1.0f - u;
+      const T n0 = -s*s;
+      const T n1 = -t*t - 4.0f*(t*s);
+      const T n2 =  s*s + 4.0f*(s*t);
+      const T n3 =  t*t;
+      return T(0.5f)*Vec4<T>(n0,n1,n2,n3);
+    }
+
+    template<typename T>
+      static __forceinline Vec4<T>  derivative2(const T& u)
+    {
+      const T t  =  u;
+      const T s  =  1.0f - u;
+      const T n0 = s;
+      const T n1 = t - 2.0f*s;
+      const T n2 = s - 2.0f*t;
+      const T n3 = t;
+      return Vec4<T>(n0,n1,n2,n3);
+    }
+  };
+  
+  struct PrecomputedBSplineBasis
+  {
+    enum { N = 16 };
+  public:
+    PrecomputedBSplineBasis() {}
+    PrecomputedBSplineBasis(int shift);
+
+    /* basis for bspline evaluation */
+  public:
+    float c0[N+1][N+1];
+    float c1[N+1][N+1];
+    float c2[N+1][N+1];
+    float c3[N+1][N+1];
+    
+    /* basis for bspline derivative evaluation */
+  public:
+    float d0[N+1][N+1];
+    float d1[N+1][N+1];
+    float d2[N+1][N+1];
+    float d3[N+1][N+1];
+  };
+  extern PrecomputedBSplineBasis bspline_basis0;
+  extern PrecomputedBSplineBasis bspline_basis1;
+
+  template<typename Vertex>
+    struct BSplineCurveT
+    {
+      Vertex v0,v1,v2,v3;
+      
+      __forceinline BSplineCurveT() {}
+      
+      __forceinline BSplineCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3)
+        : v0(v0), v1(v1), v2(v2), v3(v3) {}
+
+      __forceinline Vertex begin() const {
+        return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2));
+      }
+
+      __forceinline Vertex end() const {
+        return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3));
+      }
+
+      __forceinline Vertex center() const {
+        return 0.25f*(v0+v1+v2+v3);
+      }
+
+      __forceinline BBox<Vertex> bounds() const {
+        return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3));
+      }
+      
+      __forceinline friend BSplineCurveT operator -( const BSplineCurveT& a, const Vertex& b ) {
+        return BSplineCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b);
+      }
+
+      __forceinline BSplineCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w);
+        const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w);
+        const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w);
+        const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w);
+        return BSplineCurveT<Vec3ff>(q0,q1,q2,q3);
+      }
+      
+      __forceinline Vertex eval(const float t) const 
+      {
+        const Vec4<float> b = BSplineBasis::eval(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline Vertex eval_du(const float t) const
+      {
+        const Vec4<float> b = BSplineBasis::derivative(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline Vertex eval_dudu(const float t) const 
+      {
+        const Vec4<float> b = BSplineBasis::derivative2(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
+      {
+        p = eval(t);
+        dp = eval_du(t);
+        ddp = eval_dudu(t);
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BSplineBasis::eval(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BSplineBasis::derivative(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BSplineBasis::derivative2(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const
+      {
+        p = veval<M>(t);
+        dp = veval_du<M>(t);
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> eval0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis0.c0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&bspline_basis0.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis0.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis0.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> eval1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis1.c0[size][ofs]), Vec4vf<M>(v0), 
+                    madd(vfloat<M>::loadu(&bspline_basis1.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis1.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis1.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis0.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&bspline_basis0.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis0.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis0.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis1.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&bspline_basis1.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis1.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis1.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      /* calculates bounds of bspline curve geometry */
+      __forceinline BBox3fa accurateRoundBounds() const
+      {
+        const int N = 7;
+        const float scale = 1.0f/(3.0f*(N-1));
+        Vec4vfx pl(pos_inf), pu(neg_inf);
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step);
+          vboolx valid = vi <= vintx(N);
+          const Vec4vfx p  = eval0<VSIZEX>(i,N);
+          const Vec4vfx dp = derivative0<VSIZEX>(i,N);
+          const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero));
+          const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero));
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        const float r_min = reduce_min(pl.w);
+        const float r_max = reduce_max(pu.w);
+        const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max)));
+        return enlarge(BBox3fa(lower,upper),upper_r);
+      }
+      
+      /* calculates bounds when tessellated into N line segments */
+      __forceinline BBox3fa accurateFlatBounds(int N) const
+      {
+        if (likely(N == 4))
+        {
+          const Vec4vf4 pi = eval0<4>(0,4);
+          const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z));
+          const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z));
+          const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w)));
+          const Vec3ff pe = end();
+          return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w))));
+        } 
+        else
+        {
+          Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f);
+          for (int i=0; i<=N; i+=VSIZEX)
+          {
+            vboolx valid = vintx(i)+vintx(step) <= vintx(N);
+            const Vec4vfx pi = eval0<VSIZEX>(i,N);
+            
+            pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min
+            pl.y = select(valid,min(pl.y,pi.y),pl.y); 
+            pl.z = select(valid,min(pl.z,pi.z),pl.z); 
+            
+            pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min
+            pu.y = select(valid,max(pu.y,pi.y),pu.y); 
+            pu.z = select(valid,max(pu.z,pi.z),pu.z); 
+            
+            ru = select(valid,max(ru,abs(pi.w)),ru); 
+          }
+          const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+          const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+          const Vec3fa upper_r(reduce_max(ru));
+          return enlarge(BBox3fa(lower,upper),upper_r);
+        }
+      }
+      
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const BSplineCurveT& curve) {
+        return cout << "BSplineCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }";
+      }
+    };
+  
+  template<typename Vertex>
+    __forceinline void convert(const BezierCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve) {
+    ocurve = icurve;
+  }
+  
+  template<typename Vertex>
+    __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve) {
+    ocurve = icurve;
+  }
+  
+  template<typename Vertex>
+    __forceinline void convert(const BezierCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve)
+  {
+    const Vertex v0 = madd(6.0f,icurve.v0,madd(-7.0f,icurve.v1,2.0f*icurve.v2));
+    const Vertex v1 = msub(2.0f,icurve.v1,icurve.v2);
+    const Vertex v2 = msub(2.0f,icurve.v2,icurve.v1);
+    const Vertex v3 = madd(2.0f,icurve.v1,madd(-7.0f,icurve.v2,6.0f*icurve.v3));
+    ocurve = BSplineCurveT<Vertex>(v0,v1,v2,v3);
+  }
+  
+  template<typename Vertex>
+    __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve)
+  {
+    const Vertex v0 = madd(1.0f/6.0f,icurve.v0,madd(2.0f/3.0f,icurve.v1,1.0f/6.0f*icurve.v2));
+    const Vertex v1 = madd(2.0f/3.0f,icurve.v1,1.0f/3.0f*icurve.v2);
+    const Vertex v2 = madd(1.0f/3.0f,icurve.v1,2.0f/3.0f*icurve.v2);
+    const Vertex v3 = madd(1.0f/6.0f,icurve.v1,madd(2.0f/3.0f,icurve.v2,1.0f/6.0f*icurve.v3));
+    ocurve = BezierCurveT<Vertex>(v0,v1,v2,v3);
+  }
+
+  template<typename CurveGeometry>
+  __forceinline BSplineCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const BSplineCurveT<Vec3ff>& curve)
+  {
+    return BSplineCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0),
+                                 enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
+                                 enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2),
+                                 enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3));
+  }
+  
+  typedef BSplineCurveT<Vec3fa> BSplineCurve3fa;
+}
+
diff --git a/thirdparty/embree/kernels/subdiv/bspline_patch.h b/thirdparty/embree/kernels/subdiv/bspline_patch.h
new file mode 100644
index 0000000000..ff47f01c7a
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/bspline_patch.h
@@ -0,0 +1,449 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bspline_curve.h"
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) BSplinePatchT
+    {
+      typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+      typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+      
+    public:
+      
+      __forceinline BSplinePatchT () {}
+
+      __forceinline BSplinePatchT (const CatmullClarkPatch& patch) {
+        init(patch);
+      }
+
+      __forceinline BSplinePatchT(const CatmullClarkPatch& patch,
+                                  const BezierCurveT<Vertex>* border0,
+                                  const BezierCurveT<Vertex>* border1,
+                                  const BezierCurveT<Vertex>* border2,
+                                  const BezierCurveT<Vertex>* border3)
+      {
+        init(patch);
+      }
+
+      __forceinline BSplinePatchT (const HalfEdge* edge, const char* vertices, size_t stride) {
+        init(edge,vertices,stride);
+      }
+
+      __forceinline Vertex hard_corner(const                    Vertex& v01, const Vertex& v02, 
+                                       const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                       const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      {
+        return 4.0f*v11 - 2.0f*(v12+v21) + v22;
+      }
+
+      __forceinline Vertex soft_convex_corner( const                    Vertex& v01, const Vertex& v02, 
+                                               const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                               const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      {
+        return -8.0f*v11 + 4.0f*(v12+v21) + v22;
+      }
+
+      __forceinline Vertex convex_corner(const float vertex_crease_weight, 
+                                         const                    Vertex& v01, const Vertex& v02, 
+                                         const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                         const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      {
+        if (std::isinf(vertex_crease_weight)) return hard_corner(v01,v02,v10,v11,v12,v20,v21,v22);
+        else                                  return soft_convex_corner(v01,v02,v10,v11,v12,v20,v21,v22);
+      }
+
+      __forceinline Vertex load(const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Vertex_t::loadu(vertices+edge->getStartVertexIndex()*stride);
+      }
+
+      __forceinline void init_border(const CatmullClarkRing& edge0,
+                                     Vertex& v01, Vertex& v02,
+                                     const Vertex& v11, const Vertex& v12,
+                                     const Vertex& v21, const Vertex& v22)
+      {
+        if (likely(edge0.has_opposite_back(0)))
+        {
+          v01 = edge0.back(2);
+          v02 = edge0.back(1);
+        } else {
+          v01 = 2.0f*v11-v21;
+          v02 = 2.0f*v12-v22;
+        }
+      }
+
+      __forceinline void init_corner(const CatmullClarkRing& edge0,
+                                     Vertex& v00,       const Vertex& v01, const Vertex& v02, 
+                                     const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                     const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      {
+        const bool MAYBE_UNUSED has_back1 = edge0.has_opposite_back(1);
+        const bool has_back0 = edge0.has_opposite_back(0);
+        const bool has_front1 = edge0.has_opposite_front(1);
+        const bool MAYBE_UNUSED has_front2 = edge0.has_opposite_front(2);
+        
+        if (likely(has_back0)) {
+          if (likely(has_front1)) { assert(has_back1 && has_front2); v00 = edge0.back(3); }
+          else { assert(!has_back1); v00 = 2.0f*v01-v02; }
+        }
+        else {
+          if (likely(has_front1)) { assert(!has_front2); v00 = 2.0f*v10-v20; }
+          else v00 = convex_corner(edge0.vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22);
+        }
+      }
+      
+      void init(const CatmullClarkPatch& patch)
+      {
+        /* fill inner vertices */
+        const Vertex v11 = v[1][1] = patch.ring[0].vtx;
+        const Vertex v12 = v[1][2] = patch.ring[1].vtx;
+        const Vertex v22 = v[2][2] = patch.ring[2].vtx; 
+        const Vertex v21 = v[2][1] = patch.ring[3].vtx; 
+        
+        /* fill border vertices */
+        init_border(patch.ring[0],v[0][1],v[0][2],v11,v12,v21,v22);
+        init_border(patch.ring[1],v[1][3],v[2][3],v12,v22,v11,v21);
+        init_border(patch.ring[2],v[3][2],v[3][1],v22,v21,v12,v11);
+        init_border(patch.ring[3],v[2][0],v[1][0],v21,v11,v22,v12);
+        
+        /* fill corner vertices */
+        init_corner(patch.ring[0],v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22);
+        init_corner(patch.ring[1],v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21);
+        init_corner(patch.ring[2],v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11);
+        init_corner(patch.ring[3],v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12);
+      }
+      
+      void init_border(const HalfEdge* edge0, const char* vertices, size_t stride,
+                                     Vertex& v01, Vertex& v02,
+                                     const Vertex& v11, const Vertex& v12,
+                                     const Vertex& v21, const Vertex& v22)
+      {
+        if (likely(edge0->hasOpposite())) 
+        {
+          const HalfEdge* e = edge0->opposite()->next()->next(); 
+          v01 = load(e,vertices,stride); 
+          v02 = load(e->next(),vertices,stride);
+        } else {
+          v01 = 2.0f*v11-v21;
+          v02 = 2.0f*v12-v22;
+        }
+      }
+      
+      void init_corner(const HalfEdge* edge0, const char* vertices, size_t stride,
+                       Vertex& v00, const Vertex& v01, const Vertex& v02, 
+                       const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                       const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      {
+        const bool has_back0 = edge0->hasOpposite();
+        const bool has_front1 = edge0->prev()->hasOpposite();
+
+        if (likely(has_back0))
+        { 
+          const HalfEdge* e = edge0->opposite()->next();
+          if (likely(has_front1))
+          {
+            assert(e->hasOpposite());
+            assert(edge0->prev()->opposite()->prev()->hasOpposite());
+            v00 = load(e->opposite()->prev(),vertices,stride);
+          } 
+          else {
+            assert(!e->hasOpposite());
+            v00 = 2.0f*v01-v02;
+          }
+        }
+        else
+        {
+          if (likely(has_front1)) {
+            assert(!edge0->prev()->opposite()->prev()->hasOpposite());
+            v00 = 2.0f*v10-v20;
+          }
+          else {
+            assert(edge0->vertex_crease_weight == 0.0f || std::isinf(edge0->vertex_crease_weight));
+            v00 = convex_corner(edge0->vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22);
+          }
+        }
+      }
+      
+      void init(const HalfEdge* edge0, const char* vertices, size_t stride)
+      {
+        assert( edge0->isRegularFace() );
+        
+        /* fill inner vertices */
+        const Vertex v11 = v[1][1] = load(edge0,vertices,stride); const HalfEdge* edge1 = edge0->next();
+        const Vertex v12 = v[1][2] = load(edge1,vertices,stride); const HalfEdge* edge2 = edge1->next();
+        const Vertex v22 = v[2][2] = load(edge2,vertices,stride); const HalfEdge* edge3 = edge2->next();
+        const Vertex v21 = v[2][1] = load(edge3,vertices,stride); assert(edge0  == edge3->next());
+        
+        /* fill border vertices */
+        init_border(edge0,vertices,stride,v[0][1],v[0][2],v11,v12,v21,v22);
+        init_border(edge1,vertices,stride,v[1][3],v[2][3],v12,v22,v11,v21);
+        init_border(edge2,vertices,stride,v[3][2],v[3][1],v22,v21,v12,v11);
+        init_border(edge3,vertices,stride,v[2][0],v[1][0],v21,v11,v22,v12);
+        
+        /* fill corner vertices */
+        init_corner(edge0,vertices,stride,v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22);
+        init_corner(edge1,vertices,stride,v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21);
+        init_corner(edge2,vertices,stride,v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11);
+        init_corner(edge3,vertices,stride,v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12);
+      }
+      
+      __forceinline BBox<Vertex> bounds() const
+      {
+        const Vertex* const cv = &v[0][0];
+        BBox<Vertex> bounds (cv[0]);
+        for (size_t i=1; i<16 ; i++)
+          bounds.extend( cv[i] );
+        return bounds;
+      }
+      
+      __forceinline Vertex eval(const float uu, const float vv) const
+      {
+        const Vec4f v_n = BSplineBasis::eval(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        
+        const Vec4f u_n = BSplineBasis::eval(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+      
+      __forceinline Vertex eval_du(const float uu, const float vv) const
+      {
+        const Vec4f v_n = BSplineBasis::eval(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        
+        const Vec4f u_n = BSplineBasis::derivative(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+      
+      __forceinline Vertex eval_dv(const float uu, const float vv) const
+      {
+        const Vec4f v_n = BSplineBasis::derivative(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        
+        const Vec4f u_n = BSplineBasis::eval(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+
+      __forceinline Vertex eval_dudu(const float uu, const float vv) const
+      {
+        const Vec4f v_n = BSplineBasis::eval(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        
+        const Vec4f u_n = BSplineBasis::derivative2(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+
+      __forceinline Vertex eval_dvdv(const float uu, const float vv) const
+      {
+        const Vec4f v_n = BSplineBasis::derivative2(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        
+        const Vec4f u_n = BSplineBasis::eval(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+
+      __forceinline Vertex eval_dudv(const float uu, const float vv) const
+      {
+        const Vec4f v_n = BSplineBasis::derivative(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        
+        const Vec4f u_n = BSplineBasis::derivative(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+      
+      __forceinline Vertex normal(const float uu, const float vv) const
+      {
+        const Vertex tu = eval_du(uu,vv);
+        const Vertex tv = eval_dv(uu,vv);
+        return cross(tu,tv);
+      }   
+
+      template<typename T>
+      __forceinline Vec3<T> eval(const T& uu, const T& vv, const Vec4<T>& u_n, const Vec4<T>& v_n) const
+      {
+        const T curve0_x = madd(v_n[0],T(v[0][0].x),madd(v_n[1],T(v[1][0].x),madd(v_n[2],T(v[2][0].x),v_n[3] * T(v[3][0].x))));
+        const T curve1_x = madd(v_n[0],T(v[0][1].x),madd(v_n[1],T(v[1][1].x),madd(v_n[2],T(v[2][1].x),v_n[3] * T(v[3][1].x))));
+        const T curve2_x = madd(v_n[0],T(v[0][2].x),madd(v_n[1],T(v[1][2].x),madd(v_n[2],T(v[2][2].x),v_n[3] * T(v[3][2].x))));
+        const T curve3_x = madd(v_n[0],T(v[0][3].x),madd(v_n[1],T(v[1][3].x),madd(v_n[2],T(v[2][3].x),v_n[3] * T(v[3][3].x))));
+        const T x = madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x)));
+                  
+        const T curve0_y = madd(v_n[0],T(v[0][0].y),madd(v_n[1],T(v[1][0].y),madd(v_n[2],T(v[2][0].y),v_n[3] * T(v[3][0].y))));
+        const T curve1_y = madd(v_n[0],T(v[0][1].y),madd(v_n[1],T(v[1][1].y),madd(v_n[2],T(v[2][1].y),v_n[3] * T(v[3][1].y))));
+        const T curve2_y = madd(v_n[0],T(v[0][2].y),madd(v_n[1],T(v[1][2].y),madd(v_n[2],T(v[2][2].y),v_n[3] * T(v[3][2].y))));
+        const T curve3_y = madd(v_n[0],T(v[0][3].y),madd(v_n[1],T(v[1][3].y),madd(v_n[2],T(v[2][3].y),v_n[3] * T(v[3][3].y))));
+        const T y = madd(u_n[0],curve0_y,madd(u_n[1],curve1_y,madd(u_n[2],curve2_y,u_n[3] * curve3_y)));
+          
+        const T curve0_z = madd(v_n[0],T(v[0][0].z),madd(v_n[1],T(v[1][0].z),madd(v_n[2],T(v[2][0].z),v_n[3] * T(v[3][0].z))));
+        const T curve1_z = madd(v_n[0],T(v[0][1].z),madd(v_n[1],T(v[1][1].z),madd(v_n[2],T(v[2][1].z),v_n[3] * T(v[3][1].z))));
+        const T curve2_z = madd(v_n[0],T(v[0][2].z),madd(v_n[1],T(v[1][2].z),madd(v_n[2],T(v[2][2].z),v_n[3] * T(v[3][2].z))));
+        const T curve3_z = madd(v_n[0],T(v[0][3].z),madd(v_n[1],T(v[1][3].z),madd(v_n[2],T(v[2][3].z),v_n[3] * T(v[3][3].z))));
+        const T z = madd(u_n[0],curve0_z,madd(u_n[1],curve1_z,madd(u_n[2],curve2_z,u_n[3] * curve3_z)));
+        
+        return Vec3<T>(x,y,z);
+      }
+      
+      template<typename T>
+      __forceinline Vec3<T> eval(const T& uu, const T& vv) const
+      {
+        const Vec4<T> u_n = BSplineBasis::eval(uu);
+        const Vec4<T> v_n = BSplineBasis::eval(vv);
+        return eval(uu,vv,u_n,v_n);
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_du(const T& uu, const T& vv) const
+      {
+        const Vec4<T> u_n = BSplineBasis::derivative(uu); 
+        const Vec4<T> v_n = BSplineBasis::eval(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+      
+      template<typename T>
+      __forceinline Vec3<T> eval_dv(const T& uu, const T& vv) const
+      {
+        const Vec4<T> u_n = BSplineBasis::eval(uu); 
+        const Vec4<T> v_n = BSplineBasis::derivative(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_dudu(const T& uu, const T& vv) const
+      {
+        const Vec4<T> u_n = BSplineBasis::derivative2(uu); 
+        const Vec4<T> v_n = BSplineBasis::eval(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_dvdv(const T& uu, const T& vv) const
+      {
+        const Vec4<T> u_n = BSplineBasis::eval(uu); 
+        const Vec4<T> v_n = BSplineBasis::derivative2(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_dudv(const T& uu, const T& vv) const
+      {
+        const Vec4<T> u_n = BSplineBasis::derivative(uu); 
+        const Vec4<T> v_n = BSplineBasis::derivative(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+      
+      template<typename T>
+      __forceinline Vec3<T> normal(const T& uu, const T& vv) const {
+        return cross(eval_du(uu,vv),eval_dv(uu,vv));
+      }
+
+      void eval(const float u, const float v, 
+                Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, 
+                const float dscale = 1.0f) const
+      {
+        if (P) {
+          *P = eval(u,v); 
+        }
+        if (dPdu) {
+          assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+          assert(dPdv); *dPdv = eval_dv(u,v)*dscale; 
+        }
+        if (ddPdudu) {
+          assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+          assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); 
+          assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); 
+        }
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const
+      {
+        const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i]))));
+        const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(v[1][1][i]),madd(v_n[2],vfloat(v[2][1][i]),v_n[3] * vfloat(v[3][1][i]))));
+        const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(v[1][2][i]),madd(v_n[2],vfloat(v[2][2][i]),v_n[3] * vfloat(v[3][2][i]))));
+        const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i]))));
+        return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x)));
+      }
+        
+      template<typename vbool, typename vfloat>
+      void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, 
+                const float dscale, const size_t dstride, const size_t N) const
+      {
+        if (P) {
+          const Vec4<vfloat> u_n = BSplineBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BSplineBasis::eval(vv); 
+          for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n));
+        }
+        if (dPdu) 
+        {
+          {
+            assert(dPdu);
+            const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::eval(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+          {
+            assert(dPdv);
+            const Vec4<vfloat> u_n = BSplineBasis::eval(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::derivative(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+        }
+        if (ddPdudu) 
+        {
+          {
+            assert(ddPdudu);
+            const Vec4<vfloat> u_n = BSplineBasis::derivative2(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::eval(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          {
+            assert(ddPdvdv);
+            const Vec4<vfloat> u_n = BSplineBasis::eval(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::derivative2(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          {
+            assert(ddPdudv);
+            const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::derivative(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+        }
+      }
+
+      friend __forceinline embree_ostream operator<<(embree_ostream o, const BSplinePatchT& p)
+      {
+        for (size_t y=0; y<4; y++)
+          for (size_t x=0; x<4; x++)
+            o << "[" << y << "][" << x << "] " << p.v[y][x] << embree_endl;
+        return o;
+      } 
+
+    public:
+      Vertex v[4][4];
+    };
+  
+  typedef BSplinePatchT<Vec3fa,Vec3fa_t> BSplinePatch3fa;
+}
diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_coefficients.h b/thirdparty/embree/kernels/subdiv/catmullclark_coefficients.h
new file mode 100644
index 0000000000..46959797bf
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/catmullclark_coefficients.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+
+namespace embree
+{
+  static const size_t MAX_PATCH_VALENCE = 16;         //!< maximum number of vertices of a patch
+  static const size_t MAX_RING_FACE_VALENCE = 64;     //!< maximum number of faces per ring
+  static const size_t MAX_RING_EDGE_VALENCE = 2*64;   //!< maximum number of edges per ring
+
+  class CatmullClarkPrecomputedCoefficients 
+  {
+  private:
+    
+    float table_cos_2PI_div_n[MAX_RING_FACE_VALENCE+1];
+
+    float* table_limittangent_a[MAX_RING_FACE_VALENCE+1];
+    float* table_limittangent_b[MAX_RING_FACE_VALENCE+1];
+    float table_limittangent_c[MAX_RING_FACE_VALENCE+1];
+
+    __forceinline float set_cos_2PI_div_n(const size_t n) { 
+      if (unlikely(n == 0)) return 1.0f;
+      return cosf(2.0f*float(pi)/(float)n); 
+    }
+
+    __forceinline float set_limittangent_a(const size_t i, const size_t n)  
+    { 
+      if (unlikely(n == 0)) return 1.0f;
+      const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n));
+      const float c1 = (1.0f/(float)n + cosf(float(pi)/(float)n) * c0); 
+      return cosf(2.0f*float(pi)*(float)i/(float)n) * c1;
+    }
+
+    __forceinline float set_limittangent_b(const size_t i, const size_t n)  
+    { 
+      if (unlikely(n == 0)) return 1.0f;
+      const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n));
+      return cosf((2.0f*float(pi)*i+float(pi))/(float)n) * c0;
+    }
+
+    __forceinline float set_limittangent_c(const size_t n)  
+    { 
+      if (unlikely(n == 0)) return 1.0f;
+      return 2.0f/16.0f * (5.0f + cosf(2.0f*float(pi)/(float)n) + cosf(float(pi)/(float)n) * sqrtf(18.0f+2.0f*cosf(2.0f*float(pi)/(float)n)));
+    }
+
+  public:
+
+    __forceinline float cos_2PI_div_n(const size_t n)
+    {
+      if (likely(n <= MAX_RING_FACE_VALENCE))
+        return table_cos_2PI_div_n[n];
+      else
+        return set_cos_2PI_div_n(n);
+    }
+
+    __forceinline float limittangent_a(const size_t i, const size_t n)
+    {
+      assert(n <= MAX_RING_FACE_VALENCE);
+      assert(i < n);
+      return table_limittangent_a[n][i];
+    }
+
+    __forceinline float limittangent_b(const size_t i, const size_t n)
+    {
+      assert(n <= MAX_RING_FACE_VALENCE);
+      assert(i < n);
+      return table_limittangent_b[n][i];
+    }
+
+    __forceinline float limittangent_c(const size_t n)
+    {
+      assert(n <= MAX_RING_FACE_VALENCE);
+      return table_limittangent_c[n];
+    }
+
+    static CatmullClarkPrecomputedCoefficients table;
+ 
+    CatmullClarkPrecomputedCoefficients();    
+    ~CatmullClarkPrecomputedCoefficients();    
+  };
+}
diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_patch.h b/thirdparty/embree/kernels/subdiv/catmullclark_patch.h
new file mode 100644
index 0000000000..91772d94ed
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/catmullclark_patch.h
@@ -0,0 +1,562 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_ring.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) CatmullClarkPatchT
+    {
+    public:
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    typedef typename CatmullClark1Ring::Type Type;
+    
+    array_t<CatmullClark1RingT<Vertex,Vertex_t>,4> ring;
+    
+    public:
+    __forceinline CatmullClarkPatchT () {}
+
+    __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const char* vertices, size_t stride) {
+      init(first_half_edge,vertices,stride);
+    }
+    
+    __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) {
+      init(first_half_edge,vertices.getPtr(),vertices.getStride());
+    }
+    
+    __forceinline void init (const HalfEdge* first_half_edge, const char* vertices, size_t stride) 
+    {
+      for (unsigned i=0; i<4; i++)
+        ring[i].init(first_half_edge+i,vertices,stride);
+
+      assert(verify());
+    }
+
+    __forceinline size_t bytes() const {
+      return ring[0].bytes()+ring[1].bytes()+ring[2].bytes()+ring[3].bytes();
+    }
+
+    __forceinline void serialize(void* ptr, size_t& ofs) const
+    {
+      for (size_t i=0; i<4; i++)
+        ring[i].serialize((char*)ptr,ofs);
+    }
+
+    __forceinline void deserialize(void* ptr)
+    {
+      size_t ofs = 0;
+      for (size_t i=0; i<4; i++)
+        ring[i].deserialize((char*)ptr,ofs);
+    }
+
+    __forceinline BBox3fa bounds() const
+    {
+      BBox3fa bounds (ring[0].bounds());
+      for (size_t i=1; i<4; i++)
+	bounds.extend(ring[i].bounds());
+      return bounds;
+    }
+    
+    __forceinline Type type() const 
+    {
+      const int ty0 = ring[0].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      const int ty1 = ring[1].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      const int ty2 = ring[2].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      const int ty3 = ring[3].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      return (Type) ((ty0 & ty1 & ty2 & ty3) ^ CatmullClark1Ring::TYPE_CREASES);
+    }
+    
+    __forceinline bool isFinalResolution(float res) const {
+      return ring[0].isFinalResolution(res) && ring[1].isFinalResolution(res) && ring[2].isFinalResolution(res) && ring[3].isFinalResolution(res);
+    }
+    
+    static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0,
+					   const CatmullClark1RingT<Vertex,Vertex_t>& p1,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest0,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest1) 
+    {
+      assert(p1.face_valence > 2);
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 4;
+      dest1.edge_valence = dest0.edge_valence = 8;
+      dest1.border_index = dest0.border_index = -1;
+      dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1];
+      dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx;
+      dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4];
+      dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1];
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2];
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1];
+      dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f;
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier)
+      {
+        dest0.eval_start_index = 3;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }
+    }    
+    
+    static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0,
+                                          const CatmullClark1RingT<Vertex,Vertex_t> &p1,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest0,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest1) 
+    {
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 3;
+      dest1.edge_valence = dest0.edge_valence = 6;
+      dest0.border_index = 2;
+      dest1.border_index = 4;
+      dest1.vtx  = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1];
+      dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx;
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1];
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier)
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 2;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }
+    }
+    
+    static __forceinline void init_regular(const Vertex_t &center, const Vertex_t center_ring[8], const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest)
+    {
+      dest.vertex_level = 0.0f;
+      dest.face_valence = 4;
+      dest.edge_valence = 8;
+      dest.border_index = -1;
+      dest.vtx     = (Vertex_t)center;
+      dest.vertex_crease_weight = 0.0f;
+      for (size_t i=0; i<8; i++) 
+	dest.ring[i] = (Vertex_t)center_ring[(offset+i)%8];
+      for (size_t i=0; i<4; i++) 
+        dest.crease_weight[i] = 0.0f;
+      
+      dest.eval_start_index = (8-offset)>>1;
+      if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence;
+      assert( dest.eval_start_index < dest.face_valence );
+      dest.eval_unique_identifier = 0;
+    }
+    
+    __noinline void subdivide(array_t<CatmullClarkPatchT,4>& patch) const
+    {
+      ring[0].subdivide(patch[0].ring[0]);
+      ring[1].subdivide(patch[1].ring[1]);
+      ring[2].subdivide(patch[2].ring[2]);
+      ring[3].subdivide(patch[3].ring[3]);
+      
+      patch[0].ring[0].edge_level = 0.5f*ring[0].edge_level;
+      patch[0].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      patch[0].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[0].ring[3].edge_level = 0.5f*ring[3].edge_level;
+      
+      patch[1].ring[0].edge_level = 0.5f*ring[0].edge_level;
+      patch[1].ring[1].edge_level = 0.5f*ring[1].edge_level;
+      patch[1].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[1].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      
+      patch[2].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[2].ring[1].edge_level = 0.5f*ring[1].edge_level;
+      patch[2].ring[2].edge_level = 0.5f*ring[2].edge_level;
+      patch[2].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      
+      patch[3].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[3].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      patch[3].ring[2].edge_level = 0.5f*ring[2].edge_level;
+      patch[3].ring[3].edge_level = 0.5f*ring[3].edge_level;
+      
+      const bool regular0 = ring[0].has_last_face() && ring[1].face_valence > 2;
+      if (likely(regular0))
+        init_regular(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]);
+      else
+        init_border(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]);
+      
+      const bool regular1 = ring[1].has_last_face() && ring[2].face_valence > 2;
+      if (likely(regular1))
+        init_regular(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]);
+      else
+        init_border(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]);
+      
+      const bool regular2 = ring[2].has_last_face() && ring[3].face_valence > 2;
+      if (likely(regular2))
+        init_regular(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]);
+      else
+        init_border(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]);
+      
+      const bool regular3 = ring[3].has_last_face() && ring[0].face_valence > 2;
+      if (likely(regular3))
+        init_regular(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]);
+      else
+        init_border(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]);
+      
+      Vertex_t center = (ring[0].vtx + ring[1].vtx + ring[2].vtx + ring[3].vtx) * 0.25f;
+
+      Vertex_t center_ring[8];
+      center_ring[0] = (Vertex_t)patch[3].ring[3].ring[0];
+      center_ring[7] = (Vertex_t)patch[3].ring[3].vtx;
+      center_ring[6] = (Vertex_t)patch[2].ring[2].ring[0];
+      center_ring[5] = (Vertex_t)patch[2].ring[2].vtx;
+      center_ring[4] = (Vertex_t)patch[1].ring[1].ring[0];
+      center_ring[3] = (Vertex_t)patch[1].ring[1].vtx;
+      center_ring[2] = (Vertex_t)patch[0].ring[0].ring[0];
+      center_ring[1] = (Vertex_t)patch[0].ring[0].vtx;
+      
+      init_regular(center,center_ring,0,patch[0].ring[2]);
+      init_regular(center,center_ring,2,patch[1].ring[3]);
+      init_regular(center,center_ring,4,patch[2].ring[0]);
+      init_regular(center,center_ring,6,patch[3].ring[1]);
+      
+      assert(patch[0].verify());
+      assert(patch[1].verify());
+      assert(patch[2].verify());
+      assert(patch[3].verify());
+    }
+    
+    bool verify() const {
+      return ring[0].hasValidPositions() && ring[1].hasValidPositions() && ring[2].hasValidPositions() && ring[3].hasValidPositions();
+    }
+    
+    __forceinline void init( FinalQuad& quad ) const
+    {
+      quad.vtx[0] = (Vertex_t)ring[0].vtx;
+      quad.vtx[1] = (Vertex_t)ring[1].vtx;
+      quad.vtx[2] = (Vertex_t)ring[2].vtx;
+      quad.vtx[3] = (Vertex_t)ring[3].vtx;
+    };
+    
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClarkPatchT &p)
+    {
+      o << "CatmullClarkPatch { " << embree_endl;
+      for (size_t i=0; i<4; i++)
+	o << "ring" << i << ": " << p.ring[i] << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    };
+  
+  typedef CatmullClarkPatchT<Vec3fa,Vec3fa_t> CatmullClarkPatch3fa;
+  
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) GeneralCatmullClarkPatchT
+    {
+    public:
+    typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    typedef BezierCurveT<Vertex> BezierCurve;
+
+    static const unsigned SIZE = MAX_PATCH_VALENCE;
+    DynamicStackArray<GeneralCatmullClark1RingT<Vertex,Vertex_t>,8,SIZE> ring;
+    unsigned N;
+    
+    __forceinline GeneralCatmullClarkPatchT () 
+    : N(0) {}
+    
+    GeneralCatmullClarkPatchT (const HalfEdge* h, const char* vertices, size_t stride) {
+      init(h,vertices,stride);
+    }
+
+    __forceinline GeneralCatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) {
+      init(first_half_edge,vertices.getPtr(),vertices.getStride());
+    }
+
+    __forceinline void init (const HalfEdge* h, const char* vertices, size_t stride) 
+    {
+      unsigned int i = 0;
+      const HalfEdge* edge = h; 
+      do {
+        ring[i].init(edge,vertices,stride);
+        edge = edge->next();
+        i++;
+      } while ((edge != h) && (i < SIZE));
+      N = i;
+    }
+
+    __forceinline unsigned size() const { 
+      return N; 
+    }
+    
+    __forceinline bool isQuadPatch() const {
+      return (N == 4) && ring[0].only_quads && ring[1].only_quads && ring[2].only_quads && ring[3].only_quads;
+    }
+
+    static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0,
+					   const CatmullClark1RingT<Vertex,Vertex_t>& p1,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest0,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest1) 
+    {
+      assert(p1.face_valence > 2);
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 4;
+      dest1.edge_valence = dest0.edge_valence = 8;
+      dest1.border_index = dest0.border_index = -1;
+      dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1];
+      dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx;
+      dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4];
+      dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1];
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2];
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1];
+      dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f;
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier)
+      {
+        dest0.eval_start_index = 3;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }      
+    }
+    
+    
+    static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0,
+                                          const CatmullClark1RingT<Vertex,Vertex_t> &p1,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest0,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest1) 
+    {
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 3;
+      dest1.edge_valence = dest0.edge_valence = 6;
+      dest0.border_index = 2;
+      dest1.border_index = 4;
+      dest1.vtx  = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1];
+      dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx;
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1];
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier)
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 2;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }
+    }
+    
+    static __forceinline void init_regular(const Vertex_t &center, const array_t<Vertex_t,2*SIZE>& center_ring, const float vertex_level, const unsigned int N, const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest)
+    {
+      assert(N<(MAX_RING_FACE_VALENCE));
+      assert(2*N<(MAX_RING_EDGE_VALENCE));
+      dest.vertex_level = vertex_level;
+      dest.face_valence = N;
+      dest.edge_valence = 2*N;
+      dest.border_index = -1;
+      dest.vtx     = (Vertex_t)center;
+      dest.vertex_crease_weight = 0.0f;
+      for (unsigned i=0; i<2*N; i++) {
+        dest.ring[i] = (Vertex_t)center_ring[(2*N+offset+i-1)%(2*N)];
+        assert(isvalid(dest.ring[i]));
+      }
+      for (unsigned i=0; i<N; i++) 
+        dest.crease_weight[i] = 0.0f;
+      
+      assert(offset <= 2*N);
+      dest.eval_start_index = (2*N-offset)>>1;
+      if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence;
+      
+      assert( dest.eval_start_index < dest.face_valence );
+      dest.eval_unique_identifier = 0;
+    }
+    
+    __noinline void subdivide(array_t<CatmullClarkPatch,SIZE>& patch, unsigned& N_o) const
+    {
+      N_o = N;
+      assert( N );
+      for (unsigned i=0; i<N; i++) {
+        unsigned ip1 = (i+1)%N; // FIXME: %
+        ring[i].subdivide(patch[i].ring[0]);
+        patch[i]  .ring[0].edge_level = 0.5f*ring[i].edge_level;
+        patch[ip1].ring[3].edge_level = 0.5f*ring[i].edge_level;
+        
+	assert( patch[i].ring[0].hasValidPositions() );
+        
+      }
+      assert(N < 2*SIZE);
+      Vertex_t center = Vertex_t(0.0f);
+      array_t<Vertex_t,2*SIZE> center_ring;
+      float center_vertex_level = 2.0f; // guarantees that irregular vertices get always isolated also for non-quads
+      
+      for (unsigned i=0; i<N; i++)
+      {
+        unsigned ip1 = (i+1)%N; // FIXME: %
+        unsigned im1 = (i+N-1)%N; // FIXME: %
+        bool regular = ring[i].has_last_face() && ring[ip1].face_valence > 2;
+        if (likely(regular)) init_regular(patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); 
+        else                 init_border (patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]);
+        
+	assert( patch[i].ring[1].hasValidPositions() );
+	assert( patch[ip1].ring[3].hasValidPositions() );
+        
+	float level = 0.25f*(ring[im1].edge_level+ring[ip1].edge_level);
+        patch[i].ring[1].edge_level = patch[ip1].ring[2].edge_level = level;
+	center_vertex_level = max(center_vertex_level,level);
+        
+        center += ring[i].vtx;
+        center_ring[2*i+0] = (Vertex_t)patch[i].ring[0].vtx;
+        center_ring[2*i+1] = (Vertex_t)patch[i].ring[0].ring[0];
+      }
+      center /= float(N);
+      
+      for (unsigned int i=0; i<N; i++) {
+        init_regular(center,center_ring,center_vertex_level,N,2*i,patch[i].ring[2]);
+        
+	assert( patch[i].ring[2].hasValidPositions() );
+      }
+    }
+    
+    void init(CatmullClarkPatch& patch) const
+    {
+      assert(size() == 4);
+      ring[0].convert(patch.ring[0]);
+      ring[1].convert(patch.ring[1]);
+      ring[2].convert(patch.ring[2]);
+      ring[3].convert(patch.ring[3]);
+    }
+    
+    static void fix_quad_ring_order (array_t<CatmullClarkPatch,GeneralCatmullClarkPatchT::SIZE>& patches)
+    {
+      CatmullClark1Ring patches1ring1 = patches[1].ring[1];
+      patches[1].ring[1] = patches[1].ring[0]; // FIXME: optimize these assignments
+      patches[1].ring[0] = patches[1].ring[3];
+      patches[1].ring[3] = patches[1].ring[2];
+      patches[1].ring[2] = patches1ring1;
+      
+      CatmullClark1Ring patches2ring2 = patches[2].ring[2];
+      patches[2].ring[2] = patches[2].ring[0];
+      patches[2].ring[0] = patches2ring2;
+      CatmullClark1Ring patches2ring3 = patches[2].ring[3];
+      patches[2].ring[3] = patches[2].ring[1];
+      patches[2].ring[1] = patches2ring3;
+      
+      CatmullClark1Ring patches3ring3 = patches[3].ring[3];
+      patches[3].ring[3] = patches[3].ring[0];
+      patches[3].ring[0] = patches[3].ring[1];
+      patches[3].ring[1] = patches[3].ring[2];
+      patches[3].ring[2] = patches3ring3;
+    }
+
+    __forceinline void getLimitBorder(BezierCurve curves[GeneralCatmullClarkPatchT::SIZE]) const
+    {
+      Vertex P0 = ring[0].getLimitVertex();
+      for (unsigned i=0; i<N; i++)
+      {
+        const unsigned i0 = i, i1 = i+1==N ? 0 : i+1;
+        const Vertex P1 = madd(1.0f/3.0f,ring[i0].getLimitTangent(),P0);
+        const Vertex P3 = ring[i1].getLimitVertex();
+        const Vertex P2 = madd(1.0f/3.0f,ring[i1].getSecondLimitTangent(),P3);
+        new (&curves[i]) BezierCurve(P0,P1,P2,P3);
+        P0 = P3;
+      }
+    }
+
+    __forceinline void getLimitBorder(BezierCurve curves[2], const unsigned subPatch) const
+    {
+      const unsigned i0 = subPatch;
+      const Vertex t0_p = ring[i0].getLimitTangent();
+      const Vertex t0_m = ring[i0].getSecondLimitTangent();
+          
+      const unsigned i1 = subPatch+1 == N ? 0 : subPatch+1;
+      const Vertex t1_p = ring[i1].getLimitTangent();
+      const Vertex t1_m = ring[i1].getSecondLimitTangent();
+      
+      const unsigned i2 = subPatch == 0 ? N-1 : subPatch-1;
+      const Vertex t2_p = ring[i2].getLimitTangent();
+      const Vertex t2_m = ring[i2].getSecondLimitTangent();
+      
+      const Vertex b00 = ring[i0].getLimitVertex();
+      const Vertex b03 = ring[i1].getLimitVertex();
+      const Vertex b33 = ring[i2].getLimitVertex();
+      
+      const Vertex b01 = madd(1.0/3.0f,t0_p,b00);
+      const Vertex b11 = madd(1.0/3.0f,t0_m,b00);
+      
+      //const Vertex b13 = madd(1.0/3.0f,t1_p,b03);
+      const Vertex b02 = madd(1.0/3.0f,t1_m,b03);
+          
+      const Vertex b22 = madd(1.0/3.0f,t2_p,b33);
+      const Vertex b23 = madd(1.0/3.0f,t2_m,b33);
+          
+      new (&curves[0]) BezierCurve(b00,b01,b02,b03);
+      new (&curves[1]) BezierCurve(b33,b22,b11,b00);
+    }
+    
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClarkPatchT &p)
+    {
+      o << "GeneralCatmullClarkPatch { " << embree_endl;
+      for (unsigned i=0; i<p.N; i++)
+	o << "ring" << i << ": " << p.ring[i] << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    };
+  
+  typedef GeneralCatmullClarkPatchT<Vec3fa,Vec3fa_t> GeneralCatmullClarkPatch3fa;
+}
diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h
new file mode 100644
index 0000000000..e5ad5dadfe
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h
@@ -0,0 +1,826 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+#include "../common/buffer.h"
+#include "half_edge.h"
+#include "catmullclark_coefficients.h"
+
+namespace embree
+{
+  struct __aligned(64) FinalQuad {
+    Vec3fa vtx[4];
+  };
+
+  template<typename Vertex, typename Vertex_t = Vertex>
+    struct __aligned(64) CatmullClark1RingT
+  {
+    ALIGNED_STRUCT_(64);
+    
+    int border_index;                                   //!< edge index where border starts
+    unsigned int face_valence;                          //!< number of adjacent quad faces
+    unsigned int edge_valence;                          //!< number of adjacent edges (2*face_valence)
+    float vertex_crease_weight;                         //!< weight of vertex crease (0 if no vertex crease)
+    DynamicStackArray<float,16,MAX_RING_FACE_VALENCE> crease_weight; //!< edge crease weights for each adjacent edge
+    float vertex_level;                                 //!< maximum level of all adjacent edges
+    float edge_level;                                   //!< level of first edge
+    unsigned int eval_start_index;                      //!< topology dependent index to start evaluation
+    unsigned int eval_unique_identifier;                //!< topology dependent unique identifier for this ring 
+    Vertex vtx;                                         //!< center vertex
+    DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring;  //!< ring of neighboring vertices
+   
+  public:
+    CatmullClark1RingT () 
+    : eval_start_index(0), eval_unique_identifier(0) {} // FIXME: default constructor should be empty
+
+    /*! calculates number of bytes required to serialize this structure */
+    __forceinline size_t bytes() const
+    {
+      size_t ofs = 0;
+      ofs += sizeof(border_index);
+      ofs += sizeof(face_valence);
+      assert(2*face_valence == edge_valence);
+      ofs += sizeof(vertex_crease_weight);
+      ofs += face_valence*sizeof(float);
+      ofs += sizeof(vertex_level);
+      ofs += sizeof(edge_level);
+      ofs += sizeof(eval_start_index);
+      ofs += sizeof(eval_unique_identifier);
+      ofs += sizeof(vtx);
+      ofs += edge_valence*sizeof(Vertex);
+      return ofs;
+    }
+
+    template<typename Ty>
+    static __forceinline void store(char* ptr, size_t& ofs, const Ty& v) {
+      *(Ty*)&ptr[ofs] = v; ofs += sizeof(Ty);
+    }
+
+    template<typename Ty>
+    static __forceinline void load(char* ptr, size_t& ofs, Ty& v) {
+      v = *(Ty*)&ptr[ofs]; ofs += sizeof(Ty);
+    }
+
+    /*! serializes the ring to some memory location */
+    __forceinline void serialize(char* ptr, size_t& ofs) const
+    {
+      store(ptr,ofs,border_index);
+      store(ptr,ofs,face_valence);
+      store(ptr,ofs,vertex_crease_weight);
+      for (size_t i=0; i<face_valence; i++)
+        store(ptr,ofs,crease_weight[i]);
+      store(ptr,ofs,vertex_level);
+      store(ptr,ofs,edge_level);
+      store(ptr,ofs,eval_start_index);
+      store(ptr,ofs,eval_unique_identifier);
+      Vertex_t::storeu(&ptr[ofs],vtx); ofs += sizeof(Vertex);
+      for (size_t i=0; i<edge_valence; i++) {
+        Vertex_t::storeu(&ptr[ofs],ring[i]); ofs += sizeof(Vertex);
+      }
+    }
+
+    /*! deserializes the ring from some memory location */
+    __forceinline void deserialize(char* ptr, size_t& ofs)
+    {
+      load(ptr,ofs,border_index);
+      load(ptr,ofs,face_valence);
+      edge_valence = 2*face_valence;
+      load(ptr,ofs,vertex_crease_weight);
+      for (size_t i=0; i<face_valence; i++)
+        load(ptr,ofs,crease_weight[i]);
+      load(ptr,ofs,vertex_level);
+      load(ptr,ofs,edge_level);
+      load(ptr,ofs,eval_start_index);
+      load(ptr,ofs,eval_unique_identifier);
+      vtx = Vertex_t::loadu(&ptr[ofs]); ofs += sizeof(Vertex);
+      for (size_t i=0; i<edge_valence; i++) {
+        ring[i] = Vertex_t::loadu(&ptr[ofs]); ofs += sizeof(Vertex);
+      }
+    }
+
+    __forceinline bool hasBorder() const {
+      return border_index != -1;
+    }
+    
+    __forceinline const Vertex& front(size_t i) const {
+      assert(edge_valence>i);
+      return ring[i];
+    }
+    
+    __forceinline const Vertex& back(size_t i) const {
+      assert(edge_valence>=i);
+      return ring[edge_valence-i];
+    }
+    
+    __forceinline bool has_last_face() const {
+      return (size_t)border_index != (size_t)edge_valence-2;
+    }
+
+    __forceinline bool has_opposite_front(size_t i) const {
+      return (size_t)border_index != 2*i;
+    }
+
+    __forceinline bool has_opposite_back(size_t i) const {
+      return (size_t)border_index != ((size_t)edge_valence-2-2*i);
+    }
+    
+    __forceinline BBox3fa bounds() const
+    {
+      BBox3fa bounds ( vtx );
+      for (size_t i = 0; i<edge_valence ; i++)
+	bounds.extend( ring[i] );
+      return bounds;
+    }
+
+    /*! initializes the ring from the half edge structure */
+    __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride) 
+    {
+      border_index = -1;
+      vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride);
+      vertex_crease_weight = h->vertex_crease_weight;
+      
+      HalfEdge* p = (HalfEdge*) h;
+
+      unsigned i=0;
+      unsigned min_vertex_index = (unsigned)-1;
+      unsigned min_vertex_index_face = (unsigned)-1;
+      edge_level = p->edge_level;
+      vertex_level = 0.0f;
+
+      do
+      {
+        vertex_level = max(vertex_level,p->edge_level);
+        crease_weight[i/2] = p->edge_crease_weight;
+        assert(p->hasOpposite() || p->edge_crease_weight == float(inf));
+
+        /* store first two vertices of face */
+        p = p->next();
+        const unsigned index0 = p->getStartVertexIndex();
+        ring[i++] = Vertex_t::loadu(vertices+index0*stride);
+        if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; }
+        p = p->next();
+
+        const unsigned index1 = p->getStartVertexIndex();
+        ring[i++] = Vertex_t::loadu(vertices+index1*stride);
+        p = p->next();
+       
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else
+        {
+          /* find minimum start vertex */
+          const unsigned index0 = p->getStartVertexIndex();
+          if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; }
+
+          /*! mark first border edge and store dummy vertex for face between the two border edges */
+          border_index = i;
+          crease_weight[i/2] = inf; 
+          ring[i++] = Vertex_t::loadu(vertices+index0*stride);
+          ring[i++] = vtx; // dummy vertex
+          	  
+          /*! goto other side of border */
+          p = (HalfEdge*) h;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+
+      } while (p != h); 
+
+      edge_valence = i;
+      face_valence = i >> 1;
+      eval_unique_identifier = min_vertex_index;
+      eval_start_index = min_vertex_index_face;
+
+      assert( hasValidPositions() );
+    }
+      
+    __forceinline void subdivide(CatmullClark1RingT& dest) const
+    {
+      dest.edge_level             = 0.5f*edge_level;
+      dest.vertex_level           = 0.5f*vertex_level;
+      dest.face_valence           = face_valence;
+      dest.edge_valence           = edge_valence;
+      dest.border_index           = border_index;
+      dest.vertex_crease_weight   = max(0.0f,vertex_crease_weight-1.0f);
+      dest.eval_start_index       = eval_start_index;
+      dest.eval_unique_identifier = eval_unique_identifier;
+
+      /* calculate face points */
+      Vertex_t S = Vertex_t(0.0f);
+      for (size_t i=0; i<face_valence; i++) 
+      {
+        size_t face_index = i + eval_start_index; if (face_index >= face_valence) face_index -= face_valence; assert(face_index < face_valence);
+        size_t index0 = 2*face_index+0; if (index0 >= edge_valence) index0 -= edge_valence; assert(index0 < edge_valence);
+        size_t index1 = 2*face_index+1; if (index1 >= edge_valence) index1 -= edge_valence; assert(index1 < edge_valence);
+        size_t index2 = 2*face_index+2; if (index2 >= edge_valence) index2 -= edge_valence; assert(index2 < edge_valence);
+        S += dest.ring[index1] = ((vtx + ring[index1]) + (ring[index0] + ring[index2])) * 0.25f;
+      }
+      
+      /* calculate new edge points */
+      size_t num_creases = 0;
+      array_t<size_t,MAX_RING_FACE_VALENCE> crease_id;
+
+      for (size_t i=0; i<face_valence; i++)
+      {
+        size_t face_index = i + eval_start_index;
+        if (face_index >= face_valence) face_index -= face_valence;
+        const float edge_crease = crease_weight[face_index];
+        dest.crease_weight[face_index] = max(edge_crease-1.0f,0.0f);
+      
+        size_t index      = 2*face_index;
+        size_t prev_index = face_index == 0 ? edge_valence-1 : 2*face_index-1;
+        size_t next_index = 2*face_index+1;
+
+        const Vertex_t v = vtx + ring[index];
+        const Vertex_t f = dest.ring[prev_index] + dest.ring[next_index];
+        S += ring[index];
+                
+        /* fast path for regular edge points */
+        if (likely(edge_crease <= 0.0f)) {
+          dest.ring[index] = (v+f) * 0.25f;
+        }
+        
+        /* slower path for hard edge rule */
+        else {
+          crease_id[num_creases++] = face_index;
+          dest.ring[index] = v*0.5f;
+	  
+          /* even slower path for blended edge rule */
+          if (unlikely(edge_crease < 1.0f)) {
+            dest.ring[index] = lerp((v+f)*0.25f,v*0.5f,edge_crease);
+          }
+        }
+      }
+      
+      /* compute new vertex using smooth rule */
+      const float inv_face_valence = 1.0f / (float)face_valence;
+      const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence;
+      dest.vtx = v_smooth;
+      
+      /* compute new vertex using vertex_crease_weight rule */
+      if (unlikely(vertex_crease_weight > 0.0f)) 
+      {
+        if (vertex_crease_weight >= 1.0f) {
+          dest.vtx = vtx;
+        } else {
+          dest.vtx = lerp(v_smooth,vtx,vertex_crease_weight);
+        }
+        return;
+      }
+      
+      /* no edge crease rule and dart rule */
+      if (likely(num_creases <= 1))
+        return;
+      
+      /* compute new vertex using crease rule */
+      if (likely(num_creases == 2)) 
+      {
+        /* update vertex using crease rule */
+        const size_t crease0 = crease_id[0], crease1 = crease_id[1];
+        const Vertex_t v_sharp = (Vertex_t)(ring[2*crease0] + 6.0f*vtx + ring[2*crease1]) * (1.0f / 8.0f);
+        dest.vtx = v_sharp;
+
+        /* update crease_weights using chaikin rule */
+        const float crease_weight0 = crease_weight[crease0], crease_weight1 = crease_weight[crease1];
+        dest.crease_weight[crease0] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f);
+        dest.crease_weight[crease1] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f);
+
+        /* interpolate between sharp and smooth rule */
+        const float v_blend = 0.5f*(crease_weight0+crease_weight1);
+        if (unlikely(v_blend < 1.0f)) {
+          dest.vtx = lerp(v_smooth,v_sharp,v_blend);
+        }
+      }
+      
+      /* compute new vertex using corner rule */
+      else {
+        dest.vtx = vtx;
+      }
+    }
+    
+    __forceinline bool isRegular1() const 
+    {
+      if (border_index == -1) {
+	if (face_valence == 4) return true;
+      } else {
+	if (face_valence < 4) return true;
+      }
+      return false;
+    }
+
+    __forceinline size_t numEdgeCreases() const
+    {
+      ssize_t numCreases = 0;
+      for (size_t i=0; i<face_valence; i++) {
+        numCreases += crease_weight[i] > 0.0f;
+      }
+      return numCreases;
+    }
+
+    enum Type {
+      TYPE_NONE            = 0,      //!< invalid type
+      TYPE_REGULAR         = 1,      //!< regular patch when ignoring creases
+      TYPE_REGULAR_CREASES = 2,      //!< regular patch when considering creases
+      TYPE_GREGORY         = 4,      //!< gregory patch when ignoring creases
+      TYPE_GREGORY_CREASES = 8,      //!< gregory patch when considering creases
+      TYPE_CREASES         = 16      //!< patch has crease features
+    };
+    
+    __forceinline Type type() const
+    {
+      /* check if there is an edge crease anywhere */      
+      const size_t numCreases = numEdgeCreases();
+      const bool noInnerCreases = hasBorder() ? numCreases == 2 : numCreases == 0;
+
+      Type crease_mask = (Type) (TYPE_REGULAR | TYPE_GREGORY);
+      if (noInnerCreases ) crease_mask = (Type) (crease_mask | TYPE_REGULAR_CREASES | TYPE_GREGORY_CREASES);
+      if (numCreases != 0) crease_mask = (Type) (crease_mask | TYPE_CREASES);
+
+      /* calculate if this vertex is regular */
+      bool hasBorder = border_index != -1;
+      if (face_valence == 2 && hasBorder) {
+        if      (vertex_crease_weight == 0.0f      ) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+        else if (vertex_crease_weight == float(inf)) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+        else                                         return TYPE_CREASES;
+      }
+      else if (vertex_crease_weight != 0.0f)         return TYPE_CREASES;
+      else if (face_valence == 3 &&  hasBorder)      return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+      else if (face_valence == 4 && !hasBorder)      return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+      else                                           return (Type) (crease_mask & (TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES));
+    }
+
+    __forceinline bool isFinalResolution(float res) const {
+      return vertex_level <= res;
+    }
+
+    /* computes the limit vertex */
+    __forceinline Vertex getLimitVertex() const
+    {
+      /* return hard corner */ 
+      if (unlikely(std::isinf(vertex_crease_weight)))
+        return vtx;
+
+      /* border vertex rule */
+      if (unlikely(border_index != -1))
+      {
+	const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2;
+	return (4.0f * vtx + (ring[border_index] + ring[second_border_index])) * 1.0f/6.0f;
+      }
+      
+      Vertex_t F( 0.0f );
+      Vertex_t E( 0.0f );
+      
+      assert(eval_start_index < face_valence);
+
+      for (size_t i=0; i<face_valence; i++) {
+        size_t index = i+eval_start_index;
+        if (index >= face_valence) index -= face_valence;
+        F += ring[2*index+1];
+        E += ring[2*index];
+      }
+
+      const float n = (float)face_valence;
+      return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n);      
+    }
+    
+    /* gets limit tangent in the direction of egde vtx -> ring[0] */
+    __forceinline Vertex getLimitTangent() const 
+    {
+      if (unlikely(std::isinf(vertex_crease_weight)))
+        return ring[0] - vtx;
+
+      /* border vertex rule */
+      if (unlikely(border_index != -1))
+      {	
+	if (border_index != (int)edge_valence-2 ) {
+	  return ring[0] - vtx; 
+	}
+	else
+	{
+	  const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2;
+	  return (ring[second_border_index] - ring[border_index]) * 0.5f;
+	}
+      }
+      
+      Vertex_t alpha( 0.0f );
+      Vertex_t beta ( 0.0f );
+      
+      const size_t n = face_valence;
+
+      assert(eval_start_index < face_valence);
+
+      Vertex_t q( 0.0f );
+      for (size_t i=0; i<face_valence; i++)
+      {
+        size_t index = i+eval_start_index;
+        if (index >= face_valence) index -= face_valence;
+        const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(index,n);
+        const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(index,n);
+	alpha +=  a * ring[2*index];
+	beta  +=  b * ring[2*index+1];
+      }
+
+      const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n);
+      return sigma * (alpha + beta);
+    }
+    
+    /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */
+    __forceinline Vertex getSecondLimitTangent() const 
+    {
+      if (unlikely(std::isinf(vertex_crease_weight)))
+        return ring[2] - vtx;
+ 
+      /* border vertex rule */
+      if (unlikely(border_index != -1))
+      {
+        if (border_index != 2) {
+          return ring[2] - vtx;
+        }
+        else {
+          const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2;
+          return (ring[border_index] - ring[second_border_index]) * 0.5f;
+        }
+      }
+      
+      Vertex_t alpha( 0.0f );
+      Vertex_t beta ( 0.0f );
+
+      const size_t n = face_valence;
+
+      assert(eval_start_index < face_valence);
+
+      for (size_t i=0; i<face_valence; i++)
+      {
+        size_t index = i+eval_start_index;
+        if (index >= face_valence) index -= face_valence;
+
+        size_t prev_index = index == 0 ? face_valence-1 : index-1; // need to be bit-wise exact in cosf eval
+        const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(prev_index,n);
+        const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(prev_index,n);
+	alpha += a * ring[2*index];
+	beta  += b * ring[2*index+1];
+      }
+
+      const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n);
+      return sigma* (alpha + beta);      
+    }
+
+    /* gets surface normal */
+    const Vertex getNormal() const  {
+      return cross(getLimitTangent(),getSecondLimitTangent());
+    }
+    
+    /* returns center of the n-th quad in the 1-ring */
+    __forceinline Vertex getQuadCenter(const size_t index) const
+    {
+      const Vertex_t &p0 = vtx;
+      const Vertex_t &p1 = ring[2*index+0];
+      const Vertex_t &p2 = ring[2*index+1];
+      const Vertex_t &p3 = index == face_valence-1 ? ring[0] : ring[2*index+2];
+      const Vertex p = (p0+p1+p2+p3) * 0.25f;
+      return p;
+    }
+    
+    /* returns center of the n-th edge in the 1-ring */
+    __forceinline Vertex getEdgeCenter(const size_t index) const {
+      return (vtx + ring[index*2]) * 0.5f;
+    }
+
+    bool hasValidPositions() const
+    {
+      for (size_t i=0; i<edge_valence; i++) {
+        if (!isvalid(ring[i]))
+          return false;
+      }	
+      return true;
+    }
+
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClark1RingT &c)
+    {
+      o << "vtx " << c.vtx << " size = " << c.edge_valence << ", " << 
+	"hard_edge = " << c.border_index << ", face_valence " << c.face_valence << 
+	", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", eval_start_index: " << c.eval_start_index << ", ring: " << embree_endl;
+      
+      for (unsigned int i=0; i<min(c.edge_valence,(unsigned int)MAX_RING_FACE_VALENCE); i++) {
+        o << i << " -> " << c.ring[i];
+        if (i % 2 == 0) o << " crease = " << c.crease_weight[i/2];
+        o << embree_endl;
+      }
+      return o;
+    } 
+  };
+
+  typedef CatmullClark1RingT<Vec3fa,Vec3fa_t> CatmullClark1Ring3fa;
+  
+  template<typename Vertex, typename Vertex_t = Vertex>
+    struct __aligned(64) GeneralCatmullClark1RingT
+  {
+    ALIGNED_STRUCT_(64);
+    
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    
+    struct Face 
+    {
+      __forceinline Face() {}
+      __forceinline Face (int size, float crease_weight)
+        : size(size), crease_weight(crease_weight) {}
+
+      // FIXME: add member that returns total number of vertices
+
+      int size;              // number of vertices-2 of nth face in ring
+      float crease_weight;
+    };
+
+    Vertex vtx;
+    DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring; 
+    DynamicStackArray<Face,16,MAX_RING_FACE_VALENCE> faces;
+    unsigned int face_valence;
+    unsigned int edge_valence;
+    int border_face;
+    float vertex_crease_weight;
+    float vertex_level;                      //!< maximum level of adjacent edges
+    float edge_level;                        // level of first edge
+    bool only_quads;                         // true if all faces are quads
+    unsigned int eval_start_face_index;
+    unsigned int eval_start_vertex_index;
+    unsigned int eval_unique_identifier;
+
+  public:
+    GeneralCatmullClark1RingT() 
+      : eval_start_face_index(0), eval_start_vertex_index(0), eval_unique_identifier(0) {}
+
+    __forceinline bool isRegular() const 
+    {
+      if (border_face == -1 && face_valence == 4) return true;
+      return false;
+    }
+    
+    __forceinline bool has_last_face() const {
+      return border_face != (int)face_valence-1;
+    }
+    
+    __forceinline bool has_second_face() const {
+      return (border_face == -1) || (border_face >= 2);
+    }
+
+    bool hasValidPositions() const
+    {
+      for (size_t i=0; i<edge_valence; i++) {
+        if (!isvalid(ring[i]))
+          return false;
+      }	
+      return true;
+    }
+
+    __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride)
+    {
+      only_quads = true;
+      border_face = -1;
+      vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride);
+      vertex_crease_weight = h->vertex_crease_weight;
+      HalfEdge* p = (HalfEdge*) h;
+      
+      unsigned int e=0, f=0;
+      unsigned min_vertex_index = (unsigned)-1;
+      unsigned min_vertex_index_face = (unsigned)-1;
+      unsigned min_vertex_index_vertex = (unsigned)-1;
+      edge_level = p->edge_level;
+      vertex_level = 0.0f;
+      do 
+      {
+        HalfEdge* p_prev = p->prev();
+        HalfEdge* p_next = p->next();
+        const float crease_weight = p->edge_crease_weight;
+         assert(p->hasOpposite() || p->edge_crease_weight == float(inf));
+        vertex_level = max(vertex_level,p->edge_level);
+
+        /* find minimum start vertex */
+        unsigned vertex_index = p_next->getStartVertexIndex();
+        if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; }
+
+	/* store first N-2 vertices of face */
+	unsigned int vn = 0;
+        for (p = p_next; p!=p_prev; p=p->next()) {
+          ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride);
+          vn++;
+	}
+	faces[f++] = Face(vn,crease_weight);
+	only_quads &= (vn == 2);
+	
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else
+        {
+          /* find minimum start vertex */
+          unsigned vertex_index = p->getStartVertexIndex();
+          if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; }
+
+          /*! mark first border edge and store dummy vertex for face between the two border edges */
+          border_face = f;
+	  faces[f++] = Face(2,inf); 
+          ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride);
+          ring[e++] = vtx; // dummy vertex
+	  
+          /*! goto other side of border */
+          p = (HalfEdge*) h;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+	
+      } while (p != h); 
+      
+      edge_valence = e;
+      face_valence = f;
+      eval_unique_identifier = min_vertex_index;
+      eval_start_face_index = min_vertex_index_face;
+      eval_start_vertex_index = min_vertex_index_vertex;
+
+      assert( hasValidPositions() );
+    }
+    
+    __forceinline void subdivide(CatmullClark1Ring& dest) const
+    {
+      dest.edge_level = 0.5f*edge_level;
+      dest.vertex_level = 0.5f*vertex_level;
+      dest.face_valence = face_valence;
+      dest.edge_valence = 2*face_valence;
+      dest.border_index = border_face == -1 ? -1 : 2*border_face; // FIXME:
+      dest.vertex_crease_weight    = max(0.0f,vertex_crease_weight-1.0f);
+      dest.eval_start_index        = eval_start_face_index;
+      dest.eval_unique_identifier  = eval_unique_identifier;
+      assert(dest.face_valence <= MAX_RING_FACE_VALENCE);
+
+      /* calculate face points */
+      Vertex_t S = Vertex_t(0.0f);
+      for (size_t face=0, v=eval_start_vertex_index; face<face_valence; face++) {
+        size_t f = (face + eval_start_face_index)%face_valence;
+
+        Vertex_t F = vtx;
+        for (size_t k=v; k<=v+faces[f].size; k++) F += ring[k%edge_valence]; // FIXME: optimize
+        S += dest.ring[2*f+1] = F/float(faces[f].size+2);
+        v+=faces[f].size;
+        v%=edge_valence;
+      }
+      
+      /* calculate new edge points */
+      size_t num_creases = 0;
+      array_t<size_t,MAX_RING_FACE_VALENCE> crease_id;
+      Vertex_t C = Vertex_t(0.0f);
+      for (size_t face=0, j=eval_start_vertex_index; face<face_valence; face++)
+      {
+        size_t i = (face + eval_start_face_index)%face_valence;
+        
+        const Vertex_t v = vtx + ring[j];
+        Vertex_t f = dest.ring[2*i+1];
+        if (i == 0) f += dest.ring[dest.edge_valence-1]; 
+        else        f += dest.ring[2*i-1];
+        S += ring[j];
+        dest.crease_weight[i] = max(faces[i].crease_weight-1.0f,0.0f);
+        
+        /* fast path for regular edge points */
+        if (likely(faces[i].crease_weight <= 0.0f)) {
+          dest.ring[2*i] = (v+f) * 0.25f;
+        }
+        
+        /* slower path for hard edge rule */
+        else {
+          C += ring[j]; crease_id[num_creases++] = i;
+          dest.ring[2*i] = v*0.5f;
+	  
+          /* even slower path for blended edge rule */
+          if (unlikely(faces[i].crease_weight < 1.0f)) {
+            dest.ring[2*i] = lerp((v+f)*0.25f,v*0.5f,faces[i].crease_weight);
+          }
+        }
+        j+=faces[i].size;
+        j%=edge_valence;
+      }
+      
+      /* compute new vertex using smooth rule */
+      const float inv_face_valence = 1.0f / (float)face_valence;
+      const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence;
+      dest.vtx = v_smooth;
+      
+      /* compute new vertex using vertex_crease_weight rule */
+      if (unlikely(vertex_crease_weight > 0.0f)) 
+      {
+        if (vertex_crease_weight >= 1.0f) {
+          dest.vtx = vtx;
+        } else {
+          dest.vtx = lerp(vtx,v_smooth,vertex_crease_weight);
+        }
+        return;
+      }
+      
+      if (likely(num_creases <= 1))
+        return;
+      
+      /* compute new vertex using crease rule */
+      if (likely(num_creases == 2)) {
+        const Vertex_t v_sharp = (Vertex_t)(C + 6.0f * vtx) * (1.0f / 8.0f);
+        const float crease_weight0 = faces[crease_id[0]].crease_weight;
+        const float crease_weight1 = faces[crease_id[1]].crease_weight;
+        dest.vtx = v_sharp;
+        dest.crease_weight[crease_id[0]] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f);
+        dest.crease_weight[crease_id[1]] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f);
+        const float v_blend = 0.5f*(crease_weight0+crease_weight1);
+        if (unlikely(v_blend < 1.0f)) {
+          dest.vtx = lerp(v_sharp,v_smooth,v_blend);
+        }
+      }
+      
+      /* compute new vertex using corner rule */
+      else {
+        dest.vtx = vtx;
+      }
+    }
+
+    void convert(CatmullClark1Ring& dst) const
+    {
+      dst.edge_level = edge_level;
+      dst.vertex_level = vertex_level;
+      dst.vtx = vtx;
+      dst.face_valence = face_valence;
+      dst.edge_valence = 2*face_valence;
+      dst.border_index = border_face == -1 ? -1 : 2*border_face;
+      for (size_t i=0; i<face_valence; i++) 
+	dst.crease_weight[i] = faces[i].crease_weight;
+      dst.vertex_crease_weight = vertex_crease_weight;
+      for (size_t i=0; i<edge_valence; i++) dst.ring[i] = ring[i];
+
+      dst.eval_start_index = eval_start_face_index;
+      dst.eval_unique_identifier = eval_unique_identifier;
+
+      assert( dst.hasValidPositions() );
+    }
+
+
+    /* gets limit tangent in the direction of egde vtx -> ring[0] */
+    __forceinline Vertex getLimitTangent() const 
+    {
+      CatmullClark1Ring cc_vtx;
+     
+      /* fast path for quad only rings */
+      if (only_quads)
+      {
+        convert(cc_vtx);
+        return cc_vtx.getLimitTangent();
+      }
+      
+      subdivide(cc_vtx);
+      return 2.0f * cc_vtx.getLimitTangent();
+    }
+
+    /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */
+    __forceinline Vertex getSecondLimitTangent() const 
+    {
+      CatmullClark1Ring cc_vtx;
+     
+      /* fast path for quad only rings */
+      if (only_quads)
+      {
+        convert(cc_vtx);
+        return cc_vtx.getSecondLimitTangent();
+      }
+      
+      subdivide(cc_vtx);
+      return 2.0f * cc_vtx.getSecondLimitTangent();
+    }
+
+
+    /* gets limit vertex */
+    __forceinline Vertex getLimitVertex() const 
+    {
+      CatmullClark1Ring cc_vtx;
+     
+      /* fast path for quad only rings */
+      if (only_quads)
+        convert(cc_vtx);
+      else 
+        subdivide(cc_vtx);
+      return cc_vtx.getLimitVertex();
+    }
+
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClark1RingT &c)
+    {
+      o << "vtx " << c.vtx << " size = " << c.edge_valence << ", border_face = " << c.border_face << ", " << " face_valence = " << c.face_valence << 
+	", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", ring: " << embree_endl;
+      for (size_t v=0, f=0; f<c.face_valence; v+=c.faces[f++].size) {
+        for (size_t i=v; i<v+c.faces[f].size; i++) {
+          o << i << " -> " << c.ring[i];
+          if (i == v) o << " crease = " << c.faces[f].crease_weight;
+          o << embree_endl;
+        }
+      }
+      return o;
+    } 
+  };  
+}
diff --git a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
new file mode 100644
index 0000000000..74fc4c1230
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
@@ -0,0 +1,297 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/scene_curves.h"
+
+/*
+
+  Implements Catmul Rom curves with control points p0, p1, p2, p3. At
+  t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1
+  the curve goes through p2 with tangent (p3-p2)/2.
+
+ */
+
+namespace embree
+{
+  class CatmullRomBasis
+  {
+  public:
+
+    template<typename T>
+      static __forceinline Vec4<T> eval(const T& u) 
+    {
+      const T t  = u;
+      const T s  = T(1.0f) - u;
+      const T n0 = - t * s * s;
+      const T n1 = 2.0f + t * t * (3.0f * t - 5.0f);
+      const T n2 = 2.0f + s * s * (3.0f * s - 5.0f);
+      const T n3 = - s * t * t;
+      return T(0.5f) * Vec4<T>(n0, n1, n2, n3);
+    }
+    
+    template<typename T>
+      static __forceinline Vec4<T>  derivative(const T& u)
+    {
+      const T t  =  u;
+      const T s  =  1.0f - u;
+      const T n0 =  - s * s + 2.0f * s * t;
+      const T n1 =  2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t;
+      const T n2 =  2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s;
+      const T n3 = -2.0f * s * t + t * t;
+      return T(0.5f) * Vec4<T>(n0, n1, n2, n3);
+    }
+
+    template<typename T>
+      static __forceinline Vec4<T>  derivative2(const T& u)
+    {
+      const T t  =  u;
+      const T n0 = -3.0f * t + 2.0f;
+      const T n1 =  9.0f * t - 5.0f;
+      const T n2 = -9.0f * t + 4.0f;
+      const T n3 =  3.0f * t - 1.0f;
+      return Vec4<T>(n0, n1, n2, n3);
+    }
+  };
+  
+  struct PrecomputedCatmullRomBasis
+  {
+    enum { N = 16 };
+  public:
+    PrecomputedCatmullRomBasis() {}
+    PrecomputedCatmullRomBasis(int shift);
+
+    /* basis for bspline evaluation */
+  public:
+    float c0[N+1][N+1];
+    float c1[N+1][N+1];
+    float c2[N+1][N+1];
+    float c3[N+1][N+1];
+    
+    /* basis for bspline derivative evaluation */
+  public:
+    float d0[N+1][N+1];
+    float d1[N+1][N+1];
+    float d2[N+1][N+1];
+    float d3[N+1][N+1];
+  };
+  extern PrecomputedCatmullRomBasis catmullrom_basis0;
+  extern PrecomputedCatmullRomBasis catmullrom_basis1;
+
+  template<typename Vertex>
+    struct CatmullRomCurveT
+    {
+      Vertex v0,v1,v2,v3;
+      
+      __forceinline CatmullRomCurveT() {}
+      
+      __forceinline CatmullRomCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3)
+        : v0(v0), v1(v1), v2(v2), v3(v3) {}
+
+      __forceinline Vertex begin() const {
+        return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2));
+      }
+
+      __forceinline Vertex end() const {
+        return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3));
+      }
+
+      __forceinline Vertex center() const {
+        return 0.25f*(v0+v1+v2+v3);
+      }
+
+      __forceinline BBox<Vertex> bounds() const {
+        return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3));
+      }
+
+      __forceinline friend CatmullRomCurveT operator -( const CatmullRomCurveT& a, const Vertex& b ) {
+        return CatmullRomCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b);
+      }
+
+      __forceinline CatmullRomCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,v0-p), v0.w);
+        const Vec3ff q1(xfmVector(space,v1-p), v1.w);
+        const Vec3ff q2(xfmVector(space,v2-p), v2.w);
+        const Vec3ff q3(xfmVector(space,v3-p), v3.w);
+        return CatmullRomCurveT<Vec3ff>(q0,q1,q2,q3);
+      }
+      
+      __forceinline Vertex eval(const float t) const 
+      {
+        const Vec4<float> b = CatmullRomBasis::eval(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline Vertex eval_du(const float t) const
+      {
+        const Vec4<float> b = CatmullRomBasis::derivative(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline Vertex eval_dudu(const float t) const 
+      {
+        const Vec4<float> b = CatmullRomBasis::derivative2(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
+      {
+        p = eval(t);
+        dp = eval_du(t);
+        ddp = eval_dudu(t);
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = CatmullRomBasis::eval(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = CatmullRomBasis::derivative(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = CatmullRomBasis::derivative2(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const
+      {
+        p = veval<M>(t);
+        dp = veval_du<M>(t);
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> eval0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis0.c0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&catmullrom_basis0.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis0.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis0.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> eval1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis1.c0[size][ofs]), Vec4vf<M>(v0), 
+                    madd(vfloat<M>::loadu(&catmullrom_basis1.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis1.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis1.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis0.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&catmullrom_basis0.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis0.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis0.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis1.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&catmullrom_basis1.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis1.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis1.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      /* calculates bounds of catmull-rom curve geometry */
+      __forceinline BBox3fa accurateRoundBounds() const
+      {
+        const int N = 7;
+        const float scale = 1.0f/(3.0f*(N-1));
+        Vec4vfx pl(pos_inf), pu(neg_inf);
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step);
+          vboolx valid = vi <= vintx(N);
+          const Vec4vfx p  = eval0<VSIZEX>(i,N);
+          const Vec4vfx dp = derivative0<VSIZEX>(i,N);
+          const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero));
+          const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero));
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        const float r_min = reduce_min(pl.w);
+        const float r_max = reduce_max(pu.w);
+        const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max)));
+        return enlarge(BBox3fa(lower,upper),upper_r);
+      }
+      
+      /* calculates bounds when tessellated into N line segments */
+      __forceinline BBox3fa accurateFlatBounds(int N) const
+      {
+        if (likely(N == 4))
+        {
+          const Vec4vf4 pi = eval0<4>(0,4);
+          const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z));
+          const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z));
+          const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w)));
+          const Vec3ff pe = end();
+          return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w))));
+        } 
+        else
+        {
+          Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f);
+          for (int i=0; i<=N; i+=VSIZEX)
+          {
+            vboolx valid = vintx(i)+vintx(step) <= vintx(N);
+            const Vec4vfx pi = eval0<VSIZEX>(i,N);
+            
+            pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min
+            pl.y = select(valid,min(pl.y,pi.y),pl.y); 
+            pl.z = select(valid,min(pl.z,pi.z),pl.z); 
+            
+            pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min
+            pu.y = select(valid,max(pu.y,pi.y),pu.y); 
+            pu.z = select(valid,max(pu.z,pi.z),pu.z); 
+            
+            ru = select(valid,max(ru,abs(pi.w)),ru); 
+          }
+          const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+          const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+          const Vec3fa upper_r(reduce_max(ru));
+          return enlarge(BBox3fa(lower,upper),upper_r);
+        }
+      }
+      
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const CatmullRomCurveT& curve) {
+        return cout << "CatmullRomCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }";
+      }
+    };
+
+  template<typename CurveGeometry>
+  __forceinline CatmullRomCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CatmullRomCurveT<Vec3ff>& curve)
+  {
+    return CatmullRomCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2),
+                                    enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3));
+  }
+  
+  typedef CatmullRomCurveT<Vec3fa> CatmullRomCurve3fa;
+}
+
diff --git a/thirdparty/embree/kernels/subdiv/feature_adaptive_eval.h b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval.h
new file mode 100644
index 0000000000..58c0b63e62
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval.h
@@ -0,0 +1,226 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Vertex, typename Vertex_t = Vertex>
+      struct FeatureAdaptiveEval
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+        typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+        typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch;
+        typedef BezierPatchT<Vertex,Vertex_t> BezierPatch;
+        typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch;
+        typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch;
+        typedef BezierCurveT<Vertex> BezierCurve;
+        
+      public:
+        
+        FeatureAdaptiveEval (const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, 
+                             Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv)
+        {
+          switch (edge->patch_type) {
+          case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break;
+          case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break;
+#if PATCH_USE_GREGORY == 2
+          case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break;
+#endif
+          default: {
+            GeneralCatmullClarkPatch patch(edge,vertices,stride);
+            eval(patch,Vec2f(u,v),0);
+            break;
+          }
+          }
+        }
+
+        FeatureAdaptiveEval (CatmullClarkPatch& patch, const float u, const float v, float dscale, size_t depth, 
+                             Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv)
+        {
+          eval(patch,Vec2f(u,v),dscale,depth);
+        }
+        
+        void eval_general_quad(const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE>& patches, const Vec2f& uv, size_t depth)
+        {
+          float u = uv.x, v = uv.y;
+          if (v < 0.5f) {
+            if (u < 0.5f) {
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,0);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = dpdx; *dPdv = dpdy;
+              }
+            }
+            else {
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,1);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = -dpdy; *dPdv = dpdx;
+              }
+            }
+          } else {
+            if (u > 0.5f) {
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,2);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = -dpdx; *dPdv = -dpdy;
+              }
+            }
+            else {
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,3);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = dpdy; *dPdv = -dpdx;
+              }
+            }
+          }
+        }
+
+        __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) 
+        {
+          const int max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR;
+//#if PATCH_MIN_RESOLUTION
+//          return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=(size_t)max_eval_depth;
+//#else
+          return depth>=(size_t)max_eval_depth;
+//#endif
+        }
+        
+        void eval(CatmullClarkPatch& patch, Vec2f uv, float dscale, size_t depth, 
+                  BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr)
+        {
+          while (true) 
+          {
+            typename CatmullClarkPatch::Type ty = patch.type();
+
+            if (unlikely(final(patch,ty,depth)))
+            {
+              if (ty & CatmullClarkRing::TYPE_REGULAR) { 
+                RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+                PATCH_DEBUG_SUBDIVISION(234423,c,c,-1);
+                return;
+              } else {
+                IrregularFillPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+                PATCH_DEBUG_SUBDIVISION(34534,c,-1,c);
+                return;
+              }
+            }
+            else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+              assert(depth > 0); 
+              RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+              PATCH_DEBUG_SUBDIVISION(43524,c,c,-1);
+              return;
+            }
+#if PATCH_USE_GREGORY == 2
+            else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+              assert(depth > 0); 
+              GregoryPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+              PATCH_DEBUG_SUBDIVISION(23498,c,-1,c);
+              return;
+            }
+#endif
+            else
+            {
+              array_t<CatmullClarkPatch,4> patches; 
+              patch.subdivide(patches); // FIXME: only have to generate one of the patches
+              
+              const float u = uv.x, v = uv.y;
+              if (v < 0.5f) {
+                if (u < 0.5f) { patch = patches[0]; uv = Vec2f(2.0f*u,2.0f*v); dscale *= 2.0f; }
+                else          { patch = patches[1]; uv = Vec2f(2.0f*u-1.0f,2.0f*v); dscale *= 2.0f; }
+              } else {
+                if (u > 0.5f) { patch = patches[2]; uv = Vec2f(2.0f*u-1.0f,2.0f*v-1.0f); dscale *= 2.0f; }
+                else          { patch = patches[3]; uv = Vec2f(2.0f*u,2.0f*v-1.0f); dscale *= 2.0f; }
+              }
+              depth++;
+            }
+          }
+        }
+        
+        void eval(const GeneralCatmullClarkPatch& patch, const Vec2f& uv, const size_t depth) 
+        {  
+          /* convert into standard quad patch if possible */
+          if (likely(patch.isQuadPatch())) 
+          {
+            CatmullClarkPatch qpatch; patch.init(qpatch);
+            return eval(qpatch,uv,1.0f,depth); 
+          }
+          
+          /* subdivide patch */
+          unsigned N;
+          array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; 
+          patch.subdivide(patches,N); // FIXME: only have to generate one of the patches
+          
+          /* parametrization for quads */
+          if (N == 4) 
+            eval_general_quad(patch,patches,uv,depth);
+          
+          /* parametrization for arbitrary polygons */
+          else 
+          {
+            const unsigned l = (unsigned) floor(0.5f*uv.x); const float u = 2.0f*frac(0.5f*uv.x)-0.5f; 
+            const unsigned h = (unsigned) floor(0.5f*uv.y); const float v = 2.0f*frac(0.5f*uv.y)-0.5f; 
+            const unsigned i = 4*h+l; assert(i<N);
+            if (i >= N) return;
+
+#if PATCH_USE_GREGORY == 2
+            BezierCurve borders[2]; patch.getLimitBorder(borders,i);
+            BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+            BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+            eval(patches[i],Vec2f(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+            eval(patches[i],Vec2f(u,v),1.0f,depth+1);
+#endif
+          }
+        }
+        
+      private:
+        Vertex* const P;
+        Vertex* const dPdu;
+        Vertex* const dPdv;
+        Vertex* const ddPdudu;
+        Vertex* const ddPdvdv;
+        Vertex* const ddPdudv;
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_grid.h b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_grid.h
new file mode 100644
index 0000000000..4755aba28d
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_grid.h
@@ -0,0 +1,359 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "catmullclark_patch.h"
+#include "bspline_patch.h"
+#include "gregory_patch.h"
+#include "tessellation.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    struct FeatureAdaptiveEvalGrid
+    {
+      typedef CatmullClark1Ring3fa CatmullClarkRing;
+      typedef CatmullClarkPatch3fa CatmullClarkPatch;
+      typedef BilinearPatch3fa BilinearPatch;
+      typedef BSplinePatch3fa BSplinePatch;
+      typedef BezierPatch3fa BezierPatch;
+      typedef GregoryPatch3fa GregoryPatch;
+
+    private:
+      const unsigned x0,x1;
+      const unsigned y0,y1;
+      const unsigned swidth,sheight;
+      const float rcp_swidth, rcp_sheight;
+      float* const Px;
+      float* const Py;
+      float* const Pz;
+      float* const U;
+      float* const V;
+      float* const Nx;
+      float* const Ny;
+      float* const Nz;
+      const unsigned dwidth;
+      //const unsigned dheight;
+      unsigned count;
+      
+
+    public:      
+      FeatureAdaptiveEvalGrid (const GeneralCatmullClarkPatch3fa& patch, unsigned subPatch,
+                               const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                               float* Px, float* Py, float* Pz, float* U, float* V, 
+                               float* Nx, float* Ny, float* Nz,
+                               const unsigned dwidth, const unsigned dheight)
+      : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), 
+        Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0)
+      {
+        assert(swidth < (2<<20) && sheight < (2<<20));
+        const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1)));
+        const BBox2f erange(Vec2f((float)x0,(float)y0),Vec2f((float)x1,(float)y1));
+        
+        /* convert into standard quad patch if possible */
+        if (likely(patch.isQuadPatch())) 
+        {
+          CatmullClarkPatch3fa qpatch; patch.init(qpatch);
+          eval(qpatch, srange, erange, 0);
+          assert(count == (x1-x0+1)*(y1-y0+1));
+          return;
+        }
+        
+        /* subdivide patch */
+        unsigned N;
+        array_t<CatmullClarkPatch3fa,GeneralCatmullClarkPatch3fa::SIZE> patches; 
+        patch.subdivide(patches,N);
+        
+        if (N == 4)
+        {
+          const Vec2f c = srange.center();
+          const BBox2f srange0(srange.lower,c);
+          const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y));
+          const BBox2f srange2(c,srange.upper);
+          const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y));
+
+#if PATCH_USE_GREGORY == 2
+          BezierCurve3fa borders[GeneralCatmullClarkPatch3fa::SIZE]; patch.getLimitBorder(borders);
+          BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r);
+          BezierCurve3fa border1l,border1r; borders[1].subdivide(border1l,border1r);
+          BezierCurve3fa border2l,border2r; borders[2].subdivide(border2l,border2r);
+          BezierCurve3fa border3l,border3r; borders[3].subdivide(border3l,border3r);
+          GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches);
+          eval(patches[0],srange0,intersect(srange0,erange),1,&border0l,nullptr,nullptr,&border3r);
+          eval(patches[1],srange1,intersect(srange1,erange),1,&border0r,&border1l,nullptr,nullptr);
+          eval(patches[2],srange2,intersect(srange2,erange),1,nullptr,&border1r,&border2l,nullptr);
+          eval(patches[3],srange3,intersect(srange3,erange),1,nullptr,nullptr,&border2r,&border3l);
+#else
+          GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches);
+          eval(patches[0],srange0,intersect(srange0,erange),1);
+          eval(patches[1],srange1,intersect(srange1,erange),1);
+          eval(patches[2],srange2,intersect(srange2,erange),1);
+          eval(patches[3],srange3,intersect(srange3,erange),1);
+#endif
+        }
+        else
+        {
+          assert(subPatch < N);
+          
+#if PATCH_USE_GREGORY == 2
+          BezierCurve3fa borders[2]; patch.getLimitBorder(borders,subPatch);
+          BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r);
+          BezierCurve3fa border2l,border2r; borders[1].subdivide(border2l,border2r);
+          eval(patches[subPatch], srange, erange, 1, &border0l, nullptr, nullptr, &border2r);
+#else
+          eval(patches[subPatch], srange, erange, 1);
+#endif
+          
+        }
+        assert(count == (x1-x0+1)*(y1-y0+1));
+      }
+      
+      FeatureAdaptiveEvalGrid (const CatmullClarkPatch3fa& patch,
+                               const BBox2f& srange, const BBox2f& erange, const unsigned depth,
+                               const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                               float* Px, float* Py, float* Pz, float* U, float* V, 
+                               float* Nx, float* Ny, float* Nz,
+                               const unsigned dwidth, const unsigned dheight)
+      : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), 
+        Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0)
+      {
+        eval(patch,srange,erange,depth);
+      }
+
+      template<typename Patch>
+      void evalLocalGrid(const Patch& patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1)
+      {
+        const float scale_x = rcp(srange.upper.x-srange.lower.x);
+        const float scale_y = rcp(srange.upper.y-srange.lower.y);
+        count += (lx1-lx0)*(ly1-ly0);
+        
+#if 0
+        for (unsigned iy=ly0; iy<ly1; iy++) {
+          for (unsigned ix=lx0; ix<lx1; ix++) {
+            const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x);
+            const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y);
+            const Vec3fa p = patch.eval(lu,lv);
+            const float u = float(ix)*rcp_swidth;
+            const float v = float(iy)*rcp_sheight;
+            const int ofs = (iy-y0)*dwidth+(ix-x0);
+            Px[ofs] = p.x;
+            Py[ofs] = p.y;
+            Pz[ofs] = p.z;
+            U[ofs] = u;
+            V[ofs] = v;
+          }
+        }
+#else
+        foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) {
+            const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x);
+            const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y);
+            const Vec3vfx p = patch.eval(lu,lv);
+            Vec3vfx n = zero;
+            if (unlikely(Nx != nullptr)) n = normalize_safe(patch.normal(lu,lv));
+            const vfloatx u = vfloatx(ix)*rcp_swidth;
+            const vfloatx v = vfloatx(iy)*rcp_sheight;
+            const vintx ofs = (iy-y0)*dwidth+(ix-x0);
+            if (likely(all(valid)) && all(iy==iy[0])) {
+              const unsigned ofs2 = ofs[0];
+              vfloatx::storeu(Px+ofs2,p.x);
+              vfloatx::storeu(Py+ofs2,p.y);
+              vfloatx::storeu(Pz+ofs2,p.z);
+              vfloatx::storeu(U+ofs2,u);
+              vfloatx::storeu(V+ofs2,v);
+              if (unlikely(Nx != nullptr)) {
+                vfloatx::storeu(Nx+ofs2,n.x);
+                vfloatx::storeu(Ny+ofs2,n.y);
+                vfloatx::storeu(Nz+ofs2,n.z);
+              }
+            } else {
+              foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) {
+                  const unsigned ofs2 = ofs[j]-j;
+                  vfloatx::storeu(valid,Px+ofs2,p.x);
+                  vfloatx::storeu(valid,Py+ofs2,p.y);
+                  vfloatx::storeu(valid,Pz+ofs2,p.z);
+                  vfloatx::storeu(valid,U+ofs2,u);
+                  vfloatx::storeu(valid,V+ofs2,v);
+                  if (unlikely(Nx != nullptr)) {
+                    vfloatx::storeu(valid,Nx+ofs2,n.x);
+                    vfloatx::storeu(valid,Ny+ofs2,n.y);
+                    vfloatx::storeu(valid,Nz+ofs2,n.z);
+                  }
+                });
+            }
+          });
+#endif
+      }
+      
+      __forceinline bool final(const CatmullClarkPatch3fa& patch, const CatmullClarkRing::Type type, unsigned depth) 
+      {
+        const unsigned max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR;
+//#if PATCH_MIN_RESOLUTION
+//        return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth;
+//#else
+        return depth>=max_eval_depth;
+//#endif
+      }
+      
+      void eval(const CatmullClarkPatch3fa& patch, const BBox2f& srange, const BBox2f& erange, const unsigned depth, 
+                const BezierCurve3fa* border0 = nullptr, const BezierCurve3fa* border1 = nullptr, const BezierCurve3fa* border2 = nullptr, const BezierCurve3fa* border3 = nullptr)
+      {
+        if (erange.empty())
+          return;
+        
+        int lx0 = (int) ceilf(erange.lower.x);
+        int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0));
+        int ly0 = (int) ceilf(erange.lower.y);
+        int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0));
+        if (lx0 >= lx1 || ly0 >= ly1) return;
+
+        CatmullClarkPatch::Type ty = patch.type();
+
+        if (unlikely(final(patch,ty,depth)))
+        {
+          if (ty & CatmullClarkRing::TYPE_REGULAR) {
+            RegularPatch rpatch(patch,border0,border1,border2,border3);
+            evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1);
+            return;
+          } else {
+            IrregularFillPatch ipatch(patch,border0,border1,border2,border3);
+            evalLocalGrid(ipatch,srange,lx0,lx1,ly0,ly1);
+            return;
+          }
+        }
+        else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+          assert(depth > 0); 
+          RegularPatch rpatch(patch,border0,border1,border2,border3);
+          evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1);
+          return;
+        }
+#if PATCH_USE_GREGORY == 2
+        else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+          assert(depth > 0); 
+          GregoryPatch gpatch(patch,border0,border1,border2,border3);
+          evalLocalGrid(gpatch,srange,lx0,lx1,ly0,ly1);
+        }
+#endif
+        else
+        {
+          array_t<CatmullClarkPatch3fa,4> patches; 
+          patch.subdivide(patches);
+          
+          const Vec2f c = srange.center();
+          const BBox2f srange0(srange.lower,c);
+          const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y));
+          const BBox2f srange2(c,srange.upper);
+          const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y));
+          
+          eval(patches[0],srange0,intersect(srange0,erange),depth+1);
+          eval(patches[1],srange1,intersect(srange1,erange),depth+1);
+          eval(patches[2],srange2,intersect(srange2,erange),depth+1);
+          eval(patches[3],srange3,intersect(srange3,erange),depth+1);
+        }
+      }
+    };
+    
+    template<typename Eval, typename Patch>
+      bool stitch_col(const Patch& patch, int subPatch,
+                      const bool right, const unsigned y0, const unsigned y1, const int fine_y, const int coarse_y, 
+                      float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dx0, const unsigned dwidth, const unsigned dheight)
+    {
+      assert(coarse_y <= fine_y);
+      if (likely(fine_y == coarse_y))
+        return false;
+      
+      const unsigned y0s = stitch(y0,fine_y,coarse_y);
+      const unsigned y1s = stitch(y1,fine_y,coarse_y);
+      const unsigned M = y1s-y0s+1 + VSIZEX;
+      
+      dynamic_large_stack_array(float,px,M,64*sizeof(float));
+      dynamic_large_stack_array(float,py,M,64*sizeof(float));
+      dynamic_large_stack_array(float,pz,M,64*sizeof(float));
+      dynamic_large_stack_array(float,u,M,64*sizeof(float));
+      dynamic_large_stack_array(float,v,M,64*sizeof(float));
+      dynamic_large_stack_array(float,nx,M,64*sizeof(float));
+      dynamic_large_stack_array(float,ny,M,64*sizeof(float));
+      dynamic_large_stack_array(float,nz,M,64*sizeof(float));
+      const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz));
+      Eval(patch,subPatch, right,right, y0s,y1s, 2,coarse_y+1, px,py,pz,u,v, 
+           has_Nxyz ? (float*)nx : nullptr,has_Nxyz ? (float*)ny : nullptr ,has_Nxyz ? (float*)nz : nullptr, 1,4097);
+      
+      for (unsigned y=y0; y<=y1; y++) 
+      {
+        const unsigned ys = stitch(y,fine_y,coarse_y)-y0s;
+        Px[(y-y0)*dwidth+dx0] = px[ys];
+        Py[(y-y0)*dwidth+dx0] = py[ys];
+        Pz[(y-y0)*dwidth+dx0] = pz[ys];
+        U [(y-y0)*dwidth+dx0] = u[ys];
+        V [(y-y0)*dwidth+dx0] = v[ys];
+        if (unlikely(has_Nxyz)) {
+          Nx[(y-y0)*dwidth+dx0] = nx[ys];
+          Ny[(y-y0)*dwidth+dx0] = ny[ys];
+          Nz[(y-y0)*dwidth+dx0] = nz[ys];
+        }
+      }
+      return true;
+    }
+    
+    template<typename Eval, typename Patch>
+      bool stitch_row(const Patch& patch, int subPatch, 
+                      const bool bottom, const unsigned x0, const unsigned x1, const int fine_x, const int coarse_x, 
+                      float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dy0, const unsigned dwidth, const unsigned dheight)
+    {
+      assert(coarse_x <= fine_x);
+      if (likely(fine_x == coarse_x))
+	return false;
+      
+      const unsigned x0s = stitch(x0,fine_x,coarse_x);
+      const unsigned x1s = stitch(x1,fine_x,coarse_x);
+      const unsigned M = x1s-x0s+1 + VSIZEX;
+
+      dynamic_large_stack_array(float,px,M,32*sizeof(float));
+      dynamic_large_stack_array(float,py,M,32*sizeof(float));
+      dynamic_large_stack_array(float,pz,M,32*sizeof(float));
+      dynamic_large_stack_array(float,u,M,32*sizeof(float));
+      dynamic_large_stack_array(float,v,M,32*sizeof(float));
+      dynamic_large_stack_array(float,nx,M,32*sizeof(float));
+      dynamic_large_stack_array(float,ny,M,32*sizeof(float));
+      dynamic_large_stack_array(float,nz,M,32*sizeof(float));
+      const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz));
+      Eval(patch,subPatch, x0s,x1s, bottom,bottom, coarse_x+1,2, px,py,pz,u,v, 
+           has_Nxyz ? (float*)nx :nullptr, has_Nxyz ? (float*)ny : nullptr , has_Nxyz ? (float*)nz : nullptr, 4097,1);
+      
+      for (unsigned x=x0; x<=x1; x++) 
+      {
+	const unsigned xs = stitch(x,fine_x,coarse_x)-x0s;
+	Px[dy0*dwidth+x-x0] = px[xs];
+        Py[dy0*dwidth+x-x0] = py[xs];
+        Pz[dy0*dwidth+x-x0] = pz[xs];
+        U [dy0*dwidth+x-x0] = u[xs];
+        V [dy0*dwidth+x-x0] = v[xs];
+        if (unlikely(has_Nxyz)) {
+          Nx[dy0*dwidth+x-x0] = nx[xs];
+          Ny[dy0*dwidth+x-x0] = ny[xs];
+          Nz[dy0*dwidth+x-x0] = nz[xs];
+        }
+      }
+      return true;
+    }
+    
+    template<typename Eval, typename Patch>
+    void feature_adaptive_eval_grid (const Patch& patch, unsigned subPatch, const float levels[4],
+                                     const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                                     float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dwidth, const unsigned dheight)
+    {
+      bool sl = false, sr = false, st = false, sb = false;
+      if (levels) {
+        sl = x0 == 0         && stitch_col<Eval,Patch>(patch,subPatch,0,y0,y1,sheight-1,int(levels[3]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0    ,dwidth,dheight);
+        sr = x1 == swidth-1  && stitch_col<Eval,Patch>(patch,subPatch,1,y0,y1,sheight-1,int(levels[1]), Px,Py,Pz,U,V,Nx,Ny,Nz, x1-x0,dwidth,dheight);
+        st = y0 == 0         && stitch_row<Eval,Patch>(patch,subPatch,0,x0,x1,swidth-1,int(levels[0]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0    ,dwidth,dheight);
+        sb = y1 == sheight-1 && stitch_row<Eval,Patch>(patch,subPatch,1,x0,x1,swidth-1,int(levels[2]), Px,Py,Pz,U,V,Nx,Ny,Nz, y1-y0,dwidth,dheight);
+      }
+      const unsigned ofs = st*dwidth+sl;
+      Eval(patch,subPatch,x0+sl,x1-sr,y0+st,y1-sb, swidth,sheight, Px+ofs,Py+ofs,Pz+ofs,U+ofs,V+ofs,Nx?Nx+ofs:nullptr,Ny?Ny+ofs:nullptr,Nz?Nz+ofs:nullptr, dwidth,dheight);
+    }
+  }
+}
+
diff --git a/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_simd.h b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_simd.h
new file mode 100644
index 0000000000..edab0db12f
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_simd.h
@@ -0,0 +1,186 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex>
+      struct FeatureAdaptiveEvalSimd
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+        typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+        typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch;
+        typedef BezierPatchT<Vertex,Vertex_t> BezierPatch;
+        typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch;
+        typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch;
+        typedef BezierCurveT<Vertex> BezierCurve;
+
+        FeatureAdaptiveEvalSimd (const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid, const vfloat& u, const vfloat& v, 
+                                 float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N)
+        {
+          switch (edge->patch_type) {
+          case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break;
+          case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break;
+#if PATCH_USE_GREGORY == 2
+          case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatchT<Vertex,Vertex_t>(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break;
+#endif
+          default: {
+            GeneralCatmullClarkPatch patch(edge,vertices,stride);
+            eval_direct(valid,patch,Vec2<vfloat>(u,v),0);
+            break;
+          }
+          }
+        }
+
+        FeatureAdaptiveEvalSimd (const CatmullClarkPatch& patch, const vbool& valid, const vfloat& u, const vfloat& v, float dscale, size_t depth, 
+                                 float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N)
+        {
+          eval_direct(valid,patch,Vec2<vfloat>(u,v),dscale,depth);
+        }
+
+        template<size_t N>
+        __forceinline void eval_quad_direct(const vbool& valid, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth)
+        {
+          const vfloat u = uv.x, v = uv.y;
+          const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f;
+          const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f;
+          const vbool u0v0_mask = valid & u0_mask & v0_mask;
+          const vbool u0v1_mask = valid & u0_mask & v1_mask;
+          const vbool u1v0_mask = valid & u1_mask & v0_mask;
+          const vbool u1v1_mask = valid & u1_mask & v1_mask;
+          if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1);
+          if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1);
+        }
+        
+        template<size_t N>
+        __forceinline void eval_general_quad_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth)
+        {
+#if PATCH_USE_GREGORY == 2
+          BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders);
+          BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+          BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r);
+          BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r);
+          BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r);
+#endif
+          GeneralCatmullClarkPatch::fix_quad_ring_order(patches);
+          const vfloat u = uv.x, v = uv.y;
+          const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f;
+          const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f;
+          const vbool u0v0_mask = valid & u0_mask & v0_mask;
+          const vbool u0v1_mask = valid & u0_mask & v1_mask;
+          const vbool u1v0_mask = valid & u1_mask & v0_mask;
+          const vbool u1v1_mask = valid & u1_mask & v1_mask;
+#if PATCH_USE_GREGORY == 2
+          if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1,&border0l,nullptr,nullptr,&border3r);
+          if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1,&border0r,&border1l,nullptr,nullptr);
+          if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,&border1r,&border2l,nullptr);
+          if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,nullptr,&border2r,&border3l);
+#else
+          if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1);
+          if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1);
+#endif
+        }
+        
+        __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) 
+        {
+          const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR;
+//#if PATCH_MIN_RESOLUTION
+//          return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth;
+//#else
+          return depth>=max_eval_depth;
+//#endif
+        }
+
+        void eval_direct(const vbool& valid, const CatmullClarkPatch& patch, const Vec2<vfloat>& uv, float dscale, size_t depth,
+                         BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr)
+        {
+          typename CatmullClarkPatch::Type ty = patch.type();
+
+          if (unlikely(final(patch,ty,depth)))
+          {
+            if (ty & CatmullClarkRing::TYPE_REGULAR) { 
+              RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            } else {
+              IrregularFillPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            }
+          }
+          else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+            assert(depth > 0); RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+          }
+#if PATCH_USE_GREGORY == 2
+          else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+            assert(depth > 0); GregoryPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+          }
+#endif
+          else
+          {
+            array_t<CatmullClarkPatch,4> patches; 
+            patch.subdivide(patches); // FIXME: only have to generate one of the patches
+            eval_quad_direct(valid,patches,uv,dscale,depth);
+          }
+        }  
+
+        void eval_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, const Vec2<vfloat>& uv, const size_t depth) 
+        {
+          /* convert into standard quad patch if possible */
+          if (likely(patch.isQuadPatch())) {
+            CatmullClarkPatch qpatch; patch.init(qpatch);
+            return eval_direct(valid,qpatch,uv,1.0f,depth);
+          }
+          
+          /* subdivide patch */
+          unsigned Nc;
+          array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; 
+          patch.subdivide(patches,Nc); // FIXME: only have to generate one of the patches
+          
+          /* parametrization for quads */
+          if (Nc == 4) 
+            eval_general_quad_direct(valid,patch,patches,uv,1.0f,depth);
+          
+          /* parametrization for arbitrary polygons */
+          else 
+          {
+            const vint l = (vint)floor(0.5f*uv.x); const vfloat u = 2.0f*frac(0.5f*uv.x)-0.5f; 
+            const vint h = (vint)floor(0.5f*uv.y); const vfloat v = 2.0f*frac(0.5f*uv.y)-0.5f; 
+            const vint i = (h<<2)+l; assert(all(valid,i<Nc));
+            foreach_unique(valid,i,[&](const vbool& valid, const int i) {
+#if PATCH_USE_GREGORY == 2
+                BezierCurve borders[2]; patch.getLimitBorder(borders,i);
+                BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+                BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+                eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+                eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1);
+#endif
+              });
+          }
+        }
+
+      private:
+        float* const P;
+        float* const dPdu;
+        float* const dPdv;
+        float* const ddPdudu;
+        float* const ddPdvdv;
+        float* const ddPdudv;
+        const size_t dstride;
+        const size_t N;
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/subdiv/gregory_patch.h b/thirdparty/embree/kernels/subdiv/gregory_patch.h
new file mode 100644
index 0000000000..9026d5c407
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/gregory_patch.h
@@ -0,0 +1,893 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bezier_patch.h"
+#include "bezier_curve.h"
+#include "catmullclark_coefficients.h"
+
+namespace embree
+{  
+  template<typename Vertex, typename Vertex_t = Vertex>
+  class __aligned(64) GregoryPatchT
+  {
+    typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+    typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    typedef BezierCurveT<Vertex> BezierCurve;
+
+  public:
+    Vertex v[4][4];
+    Vertex f[2][2];
+
+    __forceinline GregoryPatchT() {}
+
+    __forceinline GregoryPatchT(const CatmullClarkPatch& patch) {
+      init(patch);
+    }
+
+    __forceinline GregoryPatchT(const CatmullClarkPatch& patch, 
+                                const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) 
+    {
+      init_crackfix(patch,border0,border1,border2,border3);
+    }
+
+    __forceinline GregoryPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { 
+      init(CatmullClarkPatch(edge,vertices,stride));
+    }
+      
+    __forceinline Vertex& p0() { return v[0][0]; }
+    __forceinline Vertex& p1() { return v[0][3]; }
+    __forceinline Vertex& p2() { return v[3][3]; }
+    __forceinline Vertex& p3() { return v[3][0]; }
+    
+    __forceinline Vertex& e0_p() { return v[0][1]; }
+    __forceinline Vertex& e0_m() { return v[1][0]; }
+    __forceinline Vertex& e1_p() { return v[1][3]; }
+    __forceinline Vertex& e1_m() { return v[0][2]; }
+    __forceinline Vertex& e2_p() { return v[3][2]; }
+    __forceinline Vertex& e2_m() { return v[2][3]; }
+    __forceinline Vertex& e3_p() { return v[2][0]; }
+    __forceinline Vertex& e3_m() { return v[3][1]; }
+    
+    __forceinline Vertex& f0_p() { return v[1][1]; }
+    __forceinline Vertex& f1_p() { return v[1][2]; }
+    __forceinline Vertex& f2_p() { return v[2][2]; }
+    __forceinline Vertex& f3_p() { return v[2][1]; }
+    __forceinline Vertex& f0_m() { return f[0][0]; }
+    __forceinline Vertex& f1_m() { return f[0][1]; }
+    __forceinline Vertex& f2_m() { return f[1][1]; }
+    __forceinline Vertex& f3_m() { return f[1][0]; }
+    
+    __forceinline const Vertex& p0() const { return v[0][0]; }
+    __forceinline const Vertex& p1() const { return v[0][3]; }
+    __forceinline const Vertex& p2() const { return v[3][3]; }
+    __forceinline const Vertex& p3() const { return v[3][0]; }
+    
+    __forceinline const Vertex& e0_p() const { return v[0][1]; }
+    __forceinline const Vertex& e0_m() const { return v[1][0]; }
+    __forceinline const Vertex& e1_p() const { return v[1][3]; }
+    __forceinline const Vertex& e1_m() const { return v[0][2]; }
+    __forceinline const Vertex& e2_p() const { return v[3][2]; }
+    __forceinline const Vertex& e2_m() const { return v[2][3]; }
+    __forceinline const Vertex& e3_p() const { return v[2][0]; }
+    __forceinline const Vertex& e3_m() const { return v[3][1]; }
+    
+    __forceinline const Vertex& f0_p() const { return v[1][1]; }
+    __forceinline const Vertex& f1_p() const { return v[1][2]; }
+    __forceinline const Vertex& f2_p() const { return v[2][2]; }
+    __forceinline const Vertex& f3_p() const { return v[2][1]; }
+    __forceinline const Vertex& f0_m() const { return f[0][0]; }
+    __forceinline const Vertex& f1_m() const { return f[0][1]; }
+    __forceinline const Vertex& f2_m() const { return f[1][1]; }
+    __forceinline const Vertex& f3_m() const { return f[1][0]; }
+    
+    __forceinline Vertex initCornerVertex(const CatmullClarkPatch& irreg_patch, const size_t index) {
+      return irreg_patch.ring[index].getLimitVertex();
+    }
+    
+    __forceinline Vertex initPositiveEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) {
+      return madd(1.0f/3.0f,irreg_patch.ring[index].getLimitTangent(),p_vtx);
+    }
+    
+    __forceinline Vertex initNegativeEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) {
+      return madd(1.0f/3.0f,irreg_patch.ring[index].getSecondLimitTangent(),p_vtx);
+    }
+
+    __forceinline Vertex initPositiveEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) 
+    {
+      CatmullClark1Ring3fa r0,r1,r2;
+      irreg_patch.ring[index].subdivide(r0);
+      r0.subdivide(r1);
+      r1.subdivide(r2);
+      return madd(8.0f/3.0f,r2.getLimitTangent(),p_vtx);
+    }
+    
+    __forceinline Vertex initNegativeEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) 
+    {
+      CatmullClark1Ring3fa r0,r1,r2;
+      irreg_patch.ring[index].subdivide(r0);
+      r0.subdivide(r1);
+      r1.subdivide(r2);
+      return madd(8.0f/3.0f,r2.getSecondLimitTangent(),p_vtx);
+    }
+    
+    void initFaceVertex(const CatmullClarkPatch& irreg_patch, 
+			const size_t index, 
+			const Vertex& p_vtx, 
+                        const Vertex& e0_p_vtx, 
+			const Vertex& e1_m_vtx, 
+			const unsigned int face_valence_p1,
+ 			const Vertex& e0_m_vtx,	
+			const Vertex& e3_p_vtx,	
+			const unsigned int face_valence_p3,
+			Vertex& f_p_vtx, 
+			Vertex& f_m_vtx)
+    {
+      const unsigned int face_valence = irreg_patch.ring[index].face_valence;
+      const unsigned int edge_valence = irreg_patch.ring[index].edge_valence;
+      const unsigned int border_index = irreg_patch.ring[index].border_index;
+      
+      const Vertex& vtx     = irreg_patch.ring[index].vtx;
+      const Vertex e_i      = irreg_patch.ring[index].getEdgeCenter(0);
+      const Vertex c_i_m_1  = irreg_patch.ring[index].getQuadCenter(0);
+      const Vertex e_i_m_1  = irreg_patch.ring[index].getEdgeCenter(1);
+      
+      Vertex c_i, e_i_p_1;
+      const bool hasHardEdge0 =
+        std::isinf(irreg_patch.ring[index].vertex_crease_weight) &&
+        std::isinf(irreg_patch.ring[index].crease_weight[0]);
+                
+      if (unlikely((border_index == edge_valence-2) || hasHardEdge0))
+      {
+        /* mirror quad center and edge mid-point */
+        c_i     = madd(2.0f, e_i - c_i_m_1, c_i_m_1);
+        e_i_p_1 = madd(2.0f, vtx - e_i_m_1, e_i_m_1);
+      }
+      else
+      {
+        c_i     = irreg_patch.ring[index].getQuadCenter( face_valence-1 );
+        e_i_p_1 = irreg_patch.ring[index].getEdgeCenter( face_valence-1 );
+      }
+      
+      Vertex c_i_m_2, e_i_m_2;
+      const bool hasHardEdge1 =
+        std::isinf(irreg_patch.ring[index].vertex_crease_weight) &&
+        std::isinf(irreg_patch.ring[index].crease_weight[1]);
+      
+      if (unlikely(border_index == 2 || hasHardEdge1))
+      {
+        /* mirror quad center and edge mid-point */
+        c_i_m_2  = madd(2.0f, e_i_m_1 - c_i_m_1, c_i_m_1);
+        e_i_m_2  = madd(2.0f, vtx - e_i, + e_i);
+      }
+      else
+      {
+        c_i_m_2  = irreg_patch.ring[index].getQuadCenter( 1 );
+        e_i_m_2  = irreg_patch.ring[index].getEdgeCenter( 2 );
+      }      
+      
+      const float d = 3.0f;
+      //const float c     = cosf(2.0f*M_PI/(float)face_valence);
+      //const float c_e_p = cosf(2.0f*M_PI/(float)face_valence_p1);
+      //const float c_e_m = cosf(2.0f*M_PI/(float)face_valence_p3);
+      
+      const float c     = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence);
+      const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1);
+      const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3);
+
+      const Vertex r_e_p = 1.0f/3.0f * (e_i_m_1 - e_i_p_1) + 2.0f/3.0f * (c_i_m_1 - c_i);
+      const Vertex r_e_m = 1.0f/3.0f * (e_i     - e_i_m_2) + 2.0f/3.0f * (c_i_m_1 - c_i_m_2);
+
+      f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p);      
+      f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m);     
+    }
+
+    __noinline void init(const CatmullClarkPatch& patch)
+    {
+      assert( patch.ring[0].hasValidPositions() );
+      assert( patch.ring[1].hasValidPositions() );
+      assert( patch.ring[2].hasValidPositions() );
+      assert( patch.ring[3].hasValidPositions() );
+      
+      p0() = initCornerVertex(patch,0);
+      p1() = initCornerVertex(patch,1);
+      p2() = initCornerVertex(patch,2);
+      p3() = initCornerVertex(patch,3);
+
+      e0_p() = initPositiveEdgeVertex(patch,0, p0());
+      e1_p() = initPositiveEdgeVertex(patch,1, p1());
+      e2_p() = initPositiveEdgeVertex(patch,2, p2());
+      e3_p() = initPositiveEdgeVertex(patch,3, p3());
+
+      e0_m() = initNegativeEdgeVertex(patch,0, p0());
+      e1_m() = initNegativeEdgeVertex(patch,1, p1());
+      e2_m() = initNegativeEdgeVertex(patch,2, p2());
+      e3_m() = initNegativeEdgeVertex(patch,3, p3());
+
+      const unsigned int face_valence_p0 = patch.ring[0].face_valence;
+      const unsigned int face_valence_p1 = patch.ring[1].face_valence;
+      const unsigned int face_valence_p2 = patch.ring[2].face_valence;
+      const unsigned int face_valence_p3 = patch.ring[3].face_valence;
+      
+      initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() );
+      initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() );
+      initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() );
+      initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() );
+
+    }
+
+    __noinline void init_crackfix(const CatmullClarkPatch& patch, 
+                                  const BezierCurve* border0, 
+                                  const BezierCurve* border1,
+                                  const BezierCurve* border2, 
+                                  const BezierCurve* border3)
+    {
+      assert( patch.ring[0].hasValidPositions() );
+      assert( patch.ring[1].hasValidPositions() );
+      assert( patch.ring[2].hasValidPositions() );
+      assert( patch.ring[3].hasValidPositions() );
+      
+      p0() = initCornerVertex(patch,0);
+      p1() = initCornerVertex(patch,1);
+      p2() = initCornerVertex(patch,2);
+      p3() = initCornerVertex(patch,3);
+
+      e0_p() = initPositiveEdgeVertex(patch,0, p0());
+      e1_p() = initPositiveEdgeVertex(patch,1, p1());
+      e2_p() = initPositiveEdgeVertex(patch,2, p2());
+      e3_p() = initPositiveEdgeVertex(patch,3, p3());
+
+      e0_m() = initNegativeEdgeVertex(patch,0, p0());
+      e1_m() = initNegativeEdgeVertex(patch,1, p1());
+      e2_m() = initNegativeEdgeVertex(patch,2, p2());
+      e3_m() = initNegativeEdgeVertex(patch,3, p3());
+
+      if (unlikely(border0 != nullptr)) 
+      {         
+        p0()   = border0->v0;
+        e0_p() = border0->v1; 
+        e1_m() = border0->v2; 
+        p1()   = border0->v3;
+      }
+      
+      if (unlikely(border1 != nullptr))
+      {          
+        p1()   = border1->v0; 
+        e1_p() = border1->v1; 
+        e2_m() = border1->v2; 
+        p2()   = border1->v3; 
+      }
+
+      if (unlikely(border2 != nullptr))
+      {          
+        p2()   = border2->v0; 
+        e2_p() = border2->v1; 
+        e3_m() = border2->v2; 
+        p3()   = border2->v3; 
+      }
+
+      if (unlikely(border3 != nullptr))
+      {          
+        p3()   = border3->v0; 
+        e3_p() = border3->v1; 
+        e0_m() = border3->v2; 
+        p0()   = border3->v3; 
+      }
+
+      const unsigned int face_valence_p0 = patch.ring[0].face_valence;
+      const unsigned int face_valence_p1 = patch.ring[1].face_valence;
+      const unsigned int face_valence_p2 = patch.ring[2].face_valence;
+      const unsigned int face_valence_p3 = patch.ring[3].face_valence;
+      
+      initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() );
+      initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() );
+      initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() );
+      initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() );
+    }
+
+    
+    void computeGregoryPatchFacePoints(const unsigned int face_valence,
+				       const Vertex& r_e_p, 
+				       const Vertex& r_e_m, 					 
+				       const Vertex& p_vtx, 
+				       const Vertex& e0_p_vtx, 
+				       const Vertex& e1_m_vtx, 
+				       const unsigned int face_valence_p1,
+				       const Vertex& e0_m_vtx,	
+				       const Vertex& e3_p_vtx,	
+				       const unsigned int face_valence_p3,
+				       Vertex& f_p_vtx, 
+				       Vertex& f_m_vtx,
+                                       const float d = 3.0f)
+    {
+      //const float c     = cosf(2.0*M_PI/(float)face_valence);
+      //const float c_e_p = cosf(2.0*M_PI/(float)face_valence_p1);
+      //const float c_e_m = cosf(2.0*M_PI/(float)face_valence_p3);
+
+      const float c     = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence);
+      const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1);
+      const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3);
+
+
+      f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p);      
+      f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m);      
+      f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p);      
+      f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m);
+    }
+
+    __noinline void init(const GeneralCatmullClarkPatch& patch)
+    {
+      assert(patch.size() == 4);
+#if 0
+      CatmullClarkPatch qpatch; patch.init(qpatch);
+      init(qpatch);
+#else
+      const float face_valence_p0 = patch.ring[0].face_valence;
+      const float face_valence_p1 = patch.ring[1].face_valence;
+      const float face_valence_p2 = patch.ring[2].face_valence;
+      const float face_valence_p3 = patch.ring[3].face_valence;
+
+      Vertex p0_r_p, p0_r_m;
+      patch.ring[0].computeGregoryPatchEdgePoints( p0(), e0_p(), e0_m(), p0_r_p, p0_r_m );
+
+      Vertex p1_r_p, p1_r_m;
+      patch.ring[1].computeGregoryPatchEdgePoints( p1(), e1_p(), e1_m(), p1_r_p, p1_r_m );
+      
+      Vertex p2_r_p, p2_r_m;
+      patch.ring[2].computeGregoryPatchEdgePoints( p2(), e2_p(), e2_m(), p2_r_p, p2_r_m );
+
+      Vertex p3_r_p, p3_r_m;
+      patch.ring[3].computeGregoryPatchEdgePoints( p3(), e3_p(), e3_m(), p3_r_p, p3_r_m );
+
+      computeGregoryPatchFacePoints(face_valence_p0, p0_r_p, p0_r_m, p0(), e0_p(), e1_m(), face_valence_p1, e0_m(), e3_p(), face_valence_p3, f0_p(), f0_m() );
+      computeGregoryPatchFacePoints(face_valence_p1, p1_r_p, p1_r_m, p1(), e1_p(), e2_m(), face_valence_p2, e1_m(), e0_p(), face_valence_p0, f1_p(), f1_m() );
+      computeGregoryPatchFacePoints(face_valence_p2, p2_r_p, p2_r_m, p2(), e2_p(), e3_m(), face_valence_p3, e2_m(), e1_p(), face_valence_p1, f2_p(), f2_m() );
+      computeGregoryPatchFacePoints(face_valence_p3, p3_r_p, p3_r_m, p3(), e3_p(), e0_m(), face_valence_p0, e3_m(), e2_p(), face_valence_p3, f3_p(), f3_m() );
+
+#endif
+    }
+   
+    
+    __forceinline void convert_to_bezier()
+    {
+      f0_p() = (f0_p() + f0_m()) * 0.5f;
+      f1_p() = (f1_p() + f1_m()) * 0.5f;
+      f2_p() = (f2_p() + f2_m()) * 0.5f;
+      f3_p() = (f3_p() + f3_m()) * 0.5f;
+      f0_m() = Vertex( zero );
+      f1_m() = Vertex( zero );
+      f2_m() = Vertex( zero );
+      f3_m() = Vertex( zero );      
+    }
+    
+    static __forceinline void computeInnerVertices(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv,
+						   Vertex_t& matrix_11, Vertex_t& matrix_12, Vertex_t& matrix_22, Vertex_t& matrix_21)
+    {
+      if (unlikely(uu == 0.0f || uu == 1.0f || vv == 0.0f || vv == 1.0f)) 
+      {
+	matrix_11 = matrix[1][1];
+	matrix_12 = matrix[1][2];
+	matrix_22 = matrix[2][2];
+	matrix_21 = matrix[2][1];	 
+      }
+      else
+      {
+	const Vertex_t f0_p = matrix[1][1];
+	const Vertex_t f1_p = matrix[1][2];
+	const Vertex_t f2_p = matrix[2][2];
+	const Vertex_t f3_p = matrix[2][1];
+        
+	const Vertex_t f0_m = f_m[0][0];
+	const Vertex_t f1_m = f_m[0][1];
+	const Vertex_t f2_m = f_m[1][1];
+	const Vertex_t f3_m = f_m[1][0];
+        
+	matrix_11 = (      uu  * f0_p +       vv  * f0_m)*rcp(uu+vv);
+	matrix_12 = ((1.0f-uu) * f1_m +       vv  * f1_p)*rcp(1.0f-uu+vv);
+	matrix_22 = ((1.0f-uu) * f2_p + (1.0f-vv) * f2_m)*rcp(2.0f-uu-vv);
+	matrix_21 = (      uu  * f3_m + (1.0f-vv) * f3_p)*rcp(1.0f+uu-vv);
+      }
+    } 
+
+    template<typename vfloat>
+    static __forceinline void computeInnerVertices(const Vertex v[4][4], const Vertex f[2][2], 
+                                                   size_t i, const vfloat& uu, const vfloat& vv, vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) 
+    {
+      const auto m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f);
+
+      const vfloat f0_p = v[1][1][i];
+      const vfloat f1_p = v[1][2][i];
+      const vfloat f2_p = v[2][2][i];
+      const vfloat f3_p = v[2][1][i];
+      
+      const vfloat f0_m = f[0][0][i];
+      const vfloat f1_m = f[0][1][i];
+      const vfloat f2_m = f[1][1][i];
+      const vfloat f3_m = f[1][0][i];
+      
+      const vfloat one_minus_uu = vfloat(1.0f) - uu;
+      const vfloat one_minus_vv = vfloat(1.0f) - vv;      
+      
+      const vfloat f0_i = (          uu * f0_p +           vv * f0_m) * rcp(uu+vv);
+      const vfloat f1_i = (one_minus_uu * f1_m +           vv * f1_p) * rcp(one_minus_uu+vv);
+      const vfloat f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv);
+      const vfloat f3_i = (          uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv);
+      
+      matrix_11 = select(m_border,f0_p,f0_i);
+      matrix_12 = select(m_border,f1_p,f1_i);
+      matrix_22 = select(m_border,f2_p,f2_i);
+      matrix_21 = select(m_border,f3_p,f3_i);
+    }
+
+    static __forceinline Vertex eval(const Vertex matrix[4][4], const Vertex f[2][2], const float& uu, const float& vv) 
+    {
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      
+      const Vec4<float> Bu = BezierBasis::eval(uu);
+      const Vec4<float> Bv = BezierBasis::eval(vv);
+      
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    static __forceinline Vertex eval_du(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      
+      const Vec4<float> Bu = BezierBasis::derivative(uu);
+      const Vec4<float> Bv = BezierBasis::eval(vv);
+
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    static __forceinline Vertex eval_dv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      
+      const Vec4<float> Bu = BezierBasis::eval(uu);
+      const Vec4<float> Bv = BezierBasis::derivative(vv);
+ 
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    static __forceinline Vertex eval_dudu(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      
+      const Vec4<float> Bu = BezierBasis::derivative2(uu);
+      const Vec4<float> Bv = BezierBasis::eval(vv);
+ 
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+     }
+
+    static __forceinline Vertex eval_dvdv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      
+      const Vec4<float> Bu = BezierBasis::eval(uu);
+      const Vec4<float> Bv = BezierBasis::derivative2(vv);
+
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    static __forceinline Vertex eval_dudv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      
+      const Vec4<float> Bu = BezierBasis::derivative(uu);
+      const Vec4<float> Bv = BezierBasis::derivative(vv);
+
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    __forceinline Vertex eval(const float uu, const float vv) const {
+      return eval(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_du( const float uu, const float vv) const {
+      return eval_du(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_dv( const float uu, const float vv) const {
+      return eval_dv(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_dudu( const float uu, const float vv) const {
+      return eval_dudu(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_dvdv( const float uu, const float vv) const {
+      return eval_dvdv(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_dudv( const float uu, const float vv) const {
+      return eval_dudv(v,f,uu,vv);
+    }
+
+    static __forceinline Vertex normal(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv)  // FIXME: why not using basis functions
+    {
+      /* interpolate inner vertices */
+      Vertex_t matrix_11, matrix_12, matrix_22, matrix_21;
+      computeInnerVertices(matrix,f_m,uu,vv,matrix_11, matrix_12, matrix_22, matrix_21);
+      
+      /* tangentU */
+      const Vertex_t col0 = deCasteljau(vv, (Vertex_t)matrix[0][0], (Vertex_t)matrix[1][0], (Vertex_t)matrix[2][0], (Vertex_t)matrix[3][0]);
+      const Vertex_t col1 = deCasteljau(vv, (Vertex_t)matrix[0][1], (Vertex_t)matrix_11   , (Vertex_t)matrix_21   , (Vertex_t)matrix[3][1]);
+      const Vertex_t col2 = deCasteljau(vv, (Vertex_t)matrix[0][2], (Vertex_t)matrix_12   , (Vertex_t)matrix_22   , (Vertex_t)matrix[3][2]);
+      const Vertex_t col3 = deCasteljau(vv, (Vertex_t)matrix[0][3], (Vertex_t)matrix[1][3], (Vertex_t)matrix[2][3], (Vertex_t)matrix[3][3]);
+      
+      const Vertex_t tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3);
+      
+      /* tangentV */
+      const Vertex_t row0 = deCasteljau(uu, (Vertex_t)matrix[0][0], (Vertex_t)matrix[0][1], (Vertex_t)matrix[0][2], (Vertex_t)matrix[0][3]);
+      const Vertex_t row1 = deCasteljau(uu, (Vertex_t)matrix[1][0], (Vertex_t)matrix_11   , (Vertex_t)matrix_12   , (Vertex_t)matrix[1][3]);
+      const Vertex_t row2 = deCasteljau(uu, (Vertex_t)matrix[2][0], (Vertex_t)matrix_21   , (Vertex_t)matrix_22   , (Vertex_t)matrix[2][3]);
+      const Vertex_t row3 = deCasteljau(uu, (Vertex_t)matrix[3][0], (Vertex_t)matrix[3][1], (Vertex_t)matrix[3][2], (Vertex_t)matrix[3][3]);
+      
+      const Vertex_t tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3);
+      
+      /* normal = tangentU x tangentV */
+      const Vertex_t n = cross(tangentU,tangentV);
+      
+      return n;     
+    }
+   
+    __forceinline Vertex normal( const float uu, const float vv) const {
+      return normal(v,f,uu,vv);
+    }    
+    
+    __forceinline void eval(const float u, const float v, 
+                            Vertex* P, Vertex* dPdu, Vertex* dPdv, 
+                            Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv,
+                            const float dscale = 1.0f) const
+    {
+      if (P) {
+        *P = eval(u,v); 
+      }
+      if (dPdu) {
+        assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+        assert(dPdv); *dPdv = eval_dv(u,v)*dscale; 
+      }
+      if (ddPdudu) {
+        assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+        assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); 
+        assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); 
+      }
+    }
+
+    template<class vfloat>
+    static __forceinline vfloat eval(const Vertex v[4][4], const Vertex f[2][2], 
+                                     const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n,
+                                     vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21)
+    {
+      const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i]))));
+      const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(matrix_11 ),madd(v_n[2],vfloat(matrix_21 ),v_n[3] * vfloat(v[3][1][i]))));
+      const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(matrix_12 ),madd(v_n[2],vfloat(matrix_22 ),v_n[3] * vfloat(v[3][2][i]))));
+      const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i]))));
+      return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x)));
+    }
+    
+    template<typename vbool, typename vfloat>
+    static __forceinline void eval(const Vertex v[4][4], const Vertex f[2][2], 
+                                   const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                                   float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                                   const float dscale, const size_t dstride, const size_t N) 
+    {
+      if (P) {
+        const Vec4<vfloat> u_n = BezierBasis::eval(uu); 
+        const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+        for (size_t i=0; i<N; i++) {
+          vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+          computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times
+          vfloat::store(valid,P+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21));
+        }
+      }
+      if (dPdu)
+      {
+        {
+          assert(dPdu);
+          const Vec4<vfloat> u_n = BezierBasis::derivative(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::eval(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,dPdu+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*dscale);
+          }
+        }
+        {
+          assert(dPdv);
+          const Vec4<vfloat> u_n = BezierBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::derivative(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,dPdv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*dscale);
+          }
+        }
+      }
+      if (ddPdudu)
+      {
+        {
+          assert(ddPdudu);
+          const Vec4<vfloat> u_n = BezierBasis::derivative2(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::eval(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,ddPdudu+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale));
+          }
+        }
+        {
+          assert(ddPdvdv);
+          const Vec4<vfloat> u_n = BezierBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::derivative2(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,ddPdvdv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale));
+          }
+        }
+        {
+          assert(ddPdudv);
+          const Vec4<vfloat> u_n = BezierBasis::derivative(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::derivative(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,ddPdudv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale));
+          }
+        }
+      }
+    }
+
+    template<typename vbool, typename vfloat>
+    __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                            float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                            const float dscale, const size_t dstride, const size_t N) const {
+      eval(v,f,valid,uu,vv,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+    }
+
+    template<class T>
+      static __forceinline Vec3<T> eval_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) 
+    {
+      typedef typename T::Bool M;
+      const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f);
+
+      const Vec3<T> f0_p = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z);
+      const Vec3<T> f1_p = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z);
+      const Vec3<T> f2_p = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z);
+      const Vec3<T> f3_p = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z);
+      
+      const Vec3<T> f0_m = f[0][0];
+      const Vec3<T> f1_m = f[0][1];
+      const Vec3<T> f2_m = f[1][1];
+      const Vec3<T> f3_m = f[1][0];
+      
+      const T one_minus_uu = T(1.0f) - uu;
+      const T one_minus_vv = T(1.0f) - vv;      
+      
+      const Vec3<T> f0_i = (          uu * f0_p +           vv * f0_m) * rcp(uu+vv);
+      const Vec3<T> f1_i = (one_minus_uu * f1_m +           vv * f1_p) * rcp(one_minus_uu+vv);
+      const Vec3<T> f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv);
+      const Vec3<T> f3_i = (          uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv);
+      
+      const Vec3<T> F0( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) );
+      const Vec3<T> F1( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) );
+      const Vec3<T> F2( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) );
+      const Vec3<T> F3( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) );
+
+      const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu;
+      const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv;
+      const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu);
+      const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv);
+      const T B2_u = 3.0f * (uu * one_minus_uu * uu);
+      const T B2_v = 3.0f * (vv * one_minus_vv * vv);
+      const T B3_u = uu * uu * uu;
+      const T B3_v = vv * vv * vv;
+
+      const T x = madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u * matrix[0][3].x))), 
+                  madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,F0.x          ,madd(B2_u,F1.x          ,B3_u * matrix[1][3].x))), 
+                  madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,F3.x          ,madd(B2_u,F2.x          ,B3_u * matrix[2][3].x))), 
+                       B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u * matrix[3][3].x)))))); 
+
+      const T y = madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u * matrix[0][3].y))),
+                  madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,F0.y          ,madd(B2_u,F1.y          ,B3_u * matrix[1][3].y))),
+                  madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,F3.y          ,madd(B2_u,F2.y          ,B3_u * matrix[2][3].y))),
+                       B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u * matrix[3][3].y))))));
+      
+      const T z = madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u * matrix[0][3].z))),
+                  madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,F0.z          ,madd(B2_u,F1.z          ,B3_u * matrix[1][3].z))),
+                  madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,F3.z          ,madd(B2_u,F2.z          ,B3_u * matrix[2][3].z))),
+                       B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u * matrix[3][3].z))))));
+      
+      return Vec3<T>(x,y,z);
+    }
+
+    template<class T>
+    __forceinline Vec3<T> eval(const T& uu, const T& vv) const 
+    {
+      Vec3<T> ff[2][2];
+      ff[0][0] = Vec3<T>(f[0][0]);
+      ff[0][1] = Vec3<T>(f[0][1]);
+      ff[1][1] = Vec3<T>(f[1][1]);
+      ff[1][0] = Vec3<T>(f[1][0]);
+      return eval_t(v,ff,uu,vv);
+    }
+
+    template<class T>
+      static __forceinline Vec3<T> normal_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) 
+    {
+      typedef typename T::Bool M;
+      
+      const Vec3<T> f0_p = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z);
+      const Vec3<T> f1_p = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z);
+      const Vec3<T> f2_p = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z);
+      const Vec3<T> f3_p = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z);
+
+      const Vec3<T> f0_m = f[0][0];
+      const Vec3<T> f1_m = f[0][1];
+      const Vec3<T> f2_m = f[1][1];
+      const Vec3<T> f3_m = f[1][0];
+      
+      const T one_minus_uu = T(1.0f) - uu;
+      const T one_minus_vv = T(1.0f) - vv;      
+      
+      const Vec3<T> f0_i = (          uu * f0_p +           vv * f0_m) * rcp(uu+vv);
+      const Vec3<T> f1_i = (one_minus_uu * f1_m +           vv * f1_p) * rcp(one_minus_uu+vv);
+      const Vec3<T> f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv);
+      const Vec3<T> f3_i = (          uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv);
+
+#if 1
+      const M m_corner0 = (uu == 0.0f) & (vv == 0.0f);
+      const M m_corner1 = (uu == 1.0f) & (vv == 0.0f);
+      const M m_corner2 = (uu == 1.0f) & (vv == 1.0f);
+      const M m_corner3 = (uu == 0.0f) & (vv == 1.0f);      
+      const Vec3<T> matrix_11( select(m_corner0,f0_p.x,f0_i.x), select(m_corner0,f0_p.y,f0_i.y), select(m_corner0,f0_p.z,f0_i.z) );
+      const Vec3<T> matrix_12( select(m_corner1,f1_p.x,f1_i.x), select(m_corner1,f1_p.y,f1_i.y), select(m_corner1,f1_p.z,f1_i.z) );
+      const Vec3<T> matrix_22( select(m_corner2,f2_p.x,f2_i.x), select(m_corner2,f2_p.y,f2_i.y), select(m_corner2,f2_p.z,f2_i.z) );
+      const Vec3<T> matrix_21( select(m_corner3,f3_p.x,f3_i.x), select(m_corner3,f3_p.y,f3_i.y), select(m_corner3,f3_p.z,f3_i.z) );
+#else
+      const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f);
+      const Vec3<T> matrix_11( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) );
+      const Vec3<T> matrix_12( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) );
+      const Vec3<T> matrix_22( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) );
+      const Vec3<T> matrix_21( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) );
+#endif
+      
+      const Vec3<T> matrix_00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z);
+      const Vec3<T> matrix_10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z);
+      const Vec3<T> matrix_20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z);
+      const Vec3<T> matrix_30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z);
+      
+      const Vec3<T> matrix_01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z);
+      const Vec3<T> matrix_02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z);
+      const Vec3<T> matrix_03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z);
+      
+      const Vec3<T> matrix_31 = Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z);
+      const Vec3<T> matrix_32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z);
+      const Vec3<T> matrix_33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z);
+      
+      const Vec3<T> matrix_13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z);
+      const Vec3<T> matrix_23 = Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z);
+      
+      /* tangentU */
+      const Vec3<T> col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30);
+      const Vec3<T> col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31);
+      const Vec3<T> col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32);
+      const Vec3<T> col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33);
+      
+      const Vec3<T> tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3);
+      
+      /* tangentV */
+      const Vec3<T> row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03);
+      const Vec3<T> row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13);
+      const Vec3<T> row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23);
+      const Vec3<T> row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33);
+      
+      const Vec3<T> tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3);
+      
+      /* normal = tangentU x tangentV */
+      const Vec3<T> n = cross(tangentU,tangentV);
+      return n;
+    }
+
+     template<class T>
+    __forceinline Vec3<T> normal(const T& uu, const T& vv) const 
+    {
+      Vec3<T> ff[2][2];
+      ff[0][0] = Vec3<T>(f[0][0]);
+      ff[0][1] = Vec3<T>(f[0][1]);
+      ff[1][1] = Vec3<T>(f[1][1]);
+      ff[1][0] = Vec3<T>(f[1][0]);
+      return normal_t(v,ff,uu,vv);
+    }
+
+    __forceinline BBox<Vertex> bounds() const
+    {
+      const Vertex *const cv = &v[0][0];
+      BBox<Vertex> bounds (cv[0]);
+      for (size_t i=1; i<16; i++) 
+        bounds.extend( cv[i] );
+      bounds.extend(f[0][0]);
+      bounds.extend(f[1][0]);
+      bounds.extend(f[1][1]);
+      bounds.extend(f[1][1]);
+      return bounds;
+    }
+    
+    friend embree_ostream operator<<(embree_ostream o, const GregoryPatchT& p)
+    {
+      for (size_t y=0; y<4; y++)
+	for (size_t x=0; x<4; x++)
+	  o << "v[" << y << "][" << x << "] " << p.v[y][x] << embree_endl;
+      
+      for (size_t y=0; y<2; y++)
+	for (size_t x=0; x<2; x++)
+	  o << "f[" << y << "][" << x << "] " << p.f[y][x] << embree_endl;
+      return o;
+    } 
+  };
+
+  typedef GregoryPatchT<Vec3fa,Vec3fa_t> GregoryPatch3fa;
+
+  template<typename Vertex, typename Vertex_t>
+    __forceinline  BezierPatchT<Vertex,Vertex_t>::BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride) 
+  {
+    CatmullClarkPatchT<Vertex,Vertex_t> patch(edge,vertices,stride);
+    GregoryPatchT<Vertex,Vertex_t> gpatch(patch); 
+    gpatch.convert_to_bezier(); 
+    for (size_t y=0; y<4; y++)
+      for (size_t x=0; x<4; x++)
+        matrix[y][x] = (Vertex_t)gpatch.v[y][x];
+  }
+  
+   template<typename Vertex, typename Vertex_t>
+    __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch) 
+    {
+      GregoryPatchT<Vertex,Vertex_t> gpatch(patch); 
+      gpatch.convert_to_bezier(); 
+      for (size_t y=0; y<4; y++)
+	for (size_t x=0; x<4; x++)
+	  matrix[y][x] = (Vertex_t)gpatch.v[y][x];
+    }
+
+   template<typename Vertex, typename Vertex_t>
+     __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch, 
+                                                               const BezierCurveT<Vertex>* border0,
+                                                               const BezierCurveT<Vertex>* border1,
+                                                               const BezierCurveT<Vertex>* border2,
+                                                               const BezierCurveT<Vertex>* border3) 
+    {
+      GregoryPatchT<Vertex,Vertex_t> gpatch(patch,border0,border1,border2,border3); 
+      gpatch.convert_to_bezier(); 
+      for (size_t y=0; y<4; y++)
+	for (size_t x=0; x<4; x++)
+	  matrix[y][x] = (Vertex_t)gpatch.v[y][x];
+    }
+}
diff --git a/thirdparty/embree/kernels/subdiv/gregory_patch_dense.h b/thirdparty/embree/kernels/subdiv/gregory_patch_dense.h
new file mode 100644
index 0000000000..4cf9a7e98f
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/gregory_patch_dense.h
@@ -0,0 +1,113 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "gregory_patch.h"
+
+namespace embree
+{  
+  class __aligned(64) DenseGregoryPatch3fa
+  {
+    typedef Vec3fa Vec3fa_4x4[4][4];
+  public:
+
+    __forceinline DenseGregoryPatch3fa (const GregoryPatch3fa& patch)
+    {
+      for (size_t y=0; y<4; y++)
+	for (size_t x=0; x<4; x++)
+	  matrix[y][x] = Vec3ff(patch.v[y][x], 0.0f);
+      
+      matrix[0][0].w = patch.f[0][0].x;
+      matrix[0][1].w = patch.f[0][0].y;
+      matrix[0][2].w = patch.f[0][0].z;
+      matrix[0][3].w = 0.0f;
+      
+      matrix[1][0].w = patch.f[0][1].x;
+      matrix[1][1].w = patch.f[0][1].y;
+      matrix[1][2].w = patch.f[0][1].z;
+      matrix[1][3].w = 0.0f;
+      
+      matrix[2][0].w = patch.f[1][1].x;
+      matrix[2][1].w = patch.f[1][1].y;
+      matrix[2][2].w = patch.f[1][1].z;
+      matrix[2][3].w = 0.0f;
+      
+      matrix[3][0].w = patch.f[1][0].x;
+      matrix[3][1].w = patch.f[1][0].y;
+      matrix[3][2].w = patch.f[1][0].z;
+      matrix[3][3].w = 0.0f;
+    }
+
+    __forceinline void extract_f_m(Vec3fa f_m[2][2]) const
+    {
+      f_m[0][0] = Vec3fa( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w );
+      f_m[0][1] = Vec3fa( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w );
+      f_m[1][1] = Vec3fa( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w );
+      f_m[1][0] = Vec3fa( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w );      
+    }
+
+    __forceinline Vec3fa eval(const float uu, const float vv) const
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      return GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+
+    __forceinline Vec3fa normal(const float uu, const float vv) const
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      return GregoryPatch3fa::normal(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+
+    template<class T>
+      __forceinline Vec3<T> eval(const T &uu, const T &vv) const 
+    {
+      Vec3<T> f_m[2][2];
+      f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w );
+      f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w );
+      f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w );
+      f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w );
+      return GregoryPatch3fa::eval_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+    
+    template<class T>
+      __forceinline Vec3<T> normal(const T &uu, const T &vv) const 
+    {
+      Vec3<T> f_m[2][2];
+      f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w );
+      f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w );
+      f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w );
+      f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w );
+      return GregoryPatch3fa::normal_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+
+    __forceinline void eval(const float u, const float v, 
+                            Vec3fa* P, Vec3fa* dPdu, Vec3fa* dPdv, Vec3fa* ddPdudu, Vec3fa* ddPdvdv, Vec3fa* ddPdudv,
+                            const float dscale = 1.0f) const
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      if (P) {
+        *P    = GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,u,v); 
+      }
+      if (dPdu) {
+        assert(dPdu); *dPdu = GregoryPatch3fa::eval_du(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; 
+        assert(dPdv); *dPdv = GregoryPatch3fa::eval_dv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; 
+      }
+      if (ddPdudu) {
+        assert(ddPdudu); *ddPdudu = GregoryPatch3fa::eval_dudu(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); 
+        assert(ddPdvdv); *ddPdvdv = GregoryPatch3fa::eval_dvdv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); 
+        assert(ddPdudv); *ddPdudv = GregoryPatch3fa::eval_dudv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); 
+      }
+    }
+
+    template<typename vbool, typename vfloat>
+    __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, float* P, float* dPdu, float* dPdv, const float dscale, const size_t dstride, const size_t N) const 
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      GregoryPatch3fa::eval(matrix,f_m,valid,uu,vv,P,dPdu,dPdv,dscale,dstride,N);
+    }
+
+  private:
+    Vec3ff matrix[4][4]; // f_p/m points are stored in 4th component
+  };
+}
diff --git a/thirdparty/embree/kernels/subdiv/gridrange.h b/thirdparty/embree/kernels/subdiv/gridrange.h
new file mode 100644
index 0000000000..4f2b90d7bd
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/gridrange.h
@@ -0,0 +1,96 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+
+namespace embree
+{
+  struct __aligned(16) GridRange
+  {
+    unsigned int u_start;
+    unsigned int u_end;
+    unsigned int v_start;
+    unsigned int v_end;
+
+    __forceinline GridRange() {}
+
+    __forceinline GridRange(unsigned int u_start, unsigned int u_end, unsigned int v_start, unsigned int v_end) 
+      : u_start(u_start), u_end(u_end), v_start(v_start), v_end(v_end) {}
+
+    __forceinline unsigned int width() const {
+      return u_end-u_start+1;
+    }
+
+    __forceinline unsigned int height() const {
+      return v_end-v_start+1;
+    }
+
+    __forceinline bool hasLeafSize() const
+    {
+      const unsigned int u_size = u_end-u_start+1;
+      const unsigned int v_size = v_end-v_start+1;
+      assert(u_size >= 1);
+      assert(v_size >= 1);
+      return u_size <= 3 && v_size <= 3;
+    }
+
+    static __forceinline unsigned int split(unsigned int start,unsigned int end)
+    {
+      const unsigned int center = (start+end)/2;
+      assert (center > start);
+      assert (center < end);
+      return center;
+    }
+
+    __forceinline void split(GridRange& r0, GridRange& r1) const
+    {
+      assert( hasLeafSize() == false );
+      const unsigned int u_size = u_end-u_start+1;
+      const unsigned int v_size = v_end-v_start+1;
+      r0 = *this;
+      r1 = *this;
+
+      if (u_size >= v_size)
+      {
+        const unsigned int u_mid = split(u_start,u_end);
+        r0.u_end   = u_mid;
+        r1.u_start = u_mid;
+      }
+      else
+      {
+        const unsigned int v_mid = split(v_start,v_end);
+        r0.v_end   = v_mid;
+        r1.v_start = v_mid;
+      }
+    }
+
+    __forceinline unsigned int splitIntoSubRanges(GridRange r[4]) const
+    {
+      assert( !hasLeafSize() );
+      unsigned int children = 0;
+      GridRange first,second;
+      split(first,second);
+
+      if (first.hasLeafSize()) {
+        r[0] = first;
+        children++;
+      } 
+      else {
+        first.split(r[0],r[1]);
+        children += 2;
+      }
+
+      if (second.hasLeafSize())	{
+        r[children] = second;
+        children++;
+      }
+      else {
+        second.split(r[children+0],r[children+1]);
+        children += 2;
+      }
+      return children;      
+    }
+  };
+}
diff --git a/thirdparty/embree/kernels/subdiv/half_edge.h b/thirdparty/embree/kernels/subdiv/half_edge.h
new file mode 100644
index 0000000000..baf019cd79
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/half_edge.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_coefficients.h"
+
+namespace embree
+{
+  class __aligned(32) HalfEdge
+  {
+    friend class SubdivMesh;
+    public:
+
+    enum PatchType : char { 
+      BILINEAR_PATCH        = 0, //!< a bilinear patch
+      REGULAR_QUAD_PATCH    = 1, //!< a regular quad patch can be represented as a B-Spline
+      IRREGULAR_QUAD_PATCH  = 2, //!< an irregular quad patch can be represented as a Gregory patch
+      COMPLEX_PATCH         = 3  //!< these patches need subdivision and cannot be processed by the above fast code paths
+    };
+    
+    enum VertexType : char { 
+      REGULAR_VERTEX           = 0, //!< regular vertex
+      NON_MANIFOLD_EDGE_VERTEX = 1, //!< vertex of a non-manifold edge
+    };
+    
+    __forceinline friend PatchType max( const PatchType& ty0, const PatchType& ty1) {
+      return (PatchType) max((int)ty0,(int)ty1);
+    }
+    
+    struct Edge 
+    {
+      /*! edge constructor */
+      __forceinline Edge(const uint32_t v0, const uint32_t v1)
+	: v0(v0), v1(v1) {}
+
+      /*! create an 64 bit identifier that is unique for the not oriented edge */
+      __forceinline operator uint64_t() const       
+      {
+	uint32_t p0 = v0, p1 = v1;
+	if (p0<p1) std::swap(p0,p1);
+	return (((uint64_t)p0) << 32) | (uint64_t)p1;
+      }
+
+    public:
+      uint32_t v0,v1;    //!< start and end vertex of the edge
+    };
+
+    HalfEdge () 
+      : vtx_index(-1), next_half_edge_ofs(0), prev_half_edge_ofs(0), opposite_half_edge_ofs(0), edge_crease_weight(0), 
+      vertex_crease_weight(0), edge_level(0), patch_type(COMPLEX_PATCH), vertex_type(REGULAR_VERTEX)
+    {
+      static_assert(sizeof(HalfEdge) == 32, "invalid half edge size");
+    }
+ 
+    __forceinline bool hasOpposite() const { return opposite_half_edge_ofs != 0; }
+    __forceinline void setOpposite(HalfEdge* opposite) { opposite_half_edge_ofs = int(opposite-this); }
+    
+    __forceinline       HalfEdge* next()       { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; }
+    __forceinline const HalfEdge* next() const { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; }
+    
+    __forceinline       HalfEdge* prev()       { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; }
+    __forceinline const HalfEdge* prev() const { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; }
+    
+    __forceinline       HalfEdge* opposite()       { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; }
+    __forceinline const HalfEdge* opposite() const { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; }
+    
+    __forceinline       HalfEdge* rotate()       { return opposite()->next(); }
+    __forceinline const HalfEdge* rotate() const { return opposite()->next(); }
+    
+    __forceinline unsigned int getStartVertexIndex() const { return vtx_index; }
+    __forceinline unsigned int getEndVertexIndex  () const { return next()->vtx_index; }
+    __forceinline Edge         getEdge            () const { return Edge(getStartVertexIndex(),getEndVertexIndex()); }
+   
+    
+    /*! tests if the start vertex of the edge is regular */
+    __forceinline PatchType vertexType() const
+    {
+      const HalfEdge* p = this;
+      size_t face_valence = 0;
+      bool hasBorder = false;
+      
+      do
+      {
+        /* we need subdivision to handle edge creases */
+        if (p->hasOpposite() && p->edge_crease_weight > 0.0f) 
+          return COMPLEX_PATCH;
+        
+        face_valence++;
+        
+        /* test for quad */
+        const HalfEdge* pp = p;
+        pp = pp->next(); if (pp == p) return COMPLEX_PATCH;
+        pp = pp->next(); if (pp == p) return COMPLEX_PATCH;
+        pp = pp->next(); if (pp == p) return COMPLEX_PATCH;
+        pp = pp->next(); if (pp != p) return COMPLEX_PATCH;
+        
+        /* continue with next face */
+        p = p->prev();
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else
+        {
+          face_valence++;
+          hasBorder = true;
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->rotate();
+        }
+      } while (p != this); 
+      
+      /* calculate vertex type */
+      if (face_valence == 2 && hasBorder) {
+        if      (vertex_crease_weight == 0.0f      ) return REGULAR_QUAD_PATCH;
+        else if (vertex_crease_weight == float(inf)) return REGULAR_QUAD_PATCH;
+        else                                         return COMPLEX_PATCH;
+      }
+      else if (vertex_crease_weight != 0.0f)         return COMPLEX_PATCH;
+      else if (face_valence == 3 &&  hasBorder)      return REGULAR_QUAD_PATCH;
+      else if (face_valence == 4 && !hasBorder)      return REGULAR_QUAD_PATCH;
+      else                                           return IRREGULAR_QUAD_PATCH;
+    }
+
+    /*! tests if this edge is part of a bilinear patch */
+    __forceinline bool bilinearVertex() const {
+      return vertex_crease_weight == float(inf) && edge_crease_weight == float(inf);
+    }
+    
+    /*! calculates the type of the patch */
+    __forceinline PatchType patchType() const 
+    {
+      const HalfEdge* p = this;
+      PatchType ret = REGULAR_QUAD_PATCH;
+      bool bilinear = true;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) == this) return COMPLEX_PATCH;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) == this) return COMPLEX_PATCH;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) == this) return COMPLEX_PATCH;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) != this) return COMPLEX_PATCH;
+      
+      if (bilinear) return BILINEAR_PATCH;
+      return ret;
+    }
+    
+    /*! tests if the face is a regular b-spline face */
+    __forceinline bool isRegularFace() const {
+      return patch_type == REGULAR_QUAD_PATCH;
+    }
+    
+    /*! tests if the face can be diced (using bspline or gregory patch) */
+    __forceinline bool isGregoryFace() const {
+      return patch_type == IRREGULAR_QUAD_PATCH || patch_type == REGULAR_QUAD_PATCH;
+    }
+    
+    /*! tests if the base vertex of this half edge is a corner vertex */
+    __forceinline bool isCorner() const {
+      return !hasOpposite() && !prev()->hasOpposite();
+    }
+
+    /*! tests if the vertex is attached to any border */
+    __forceinline bool vertexHasBorder() const 
+    {
+      const HalfEdge* p = this;
+      do {
+        if (!p->hasOpposite()) return true;
+        p = p->rotate();
+      } while (p != this);
+      return false;
+    }
+    
+    /*! tests if the face this half edge belongs to has some border */
+    __forceinline bool faceHasBorder() const 
+    {
+      const HalfEdge* p = this;
+      do {
+        if (p->vertexHasBorder() && (p->vertex_type != HalfEdge::NON_MANIFOLD_EDGE_VERTEX)) return true;
+        p = p->next();
+      } while (p != this);
+      return false;
+    }
+    
+    /*! calculates conservative bounds of a catmull clark subdivision face */
+    __forceinline BBox3fa bounds(const BufferView<Vec3fa>& vertices) const
+    {
+      BBox3fa bounds = this->get1RingBounds(vertices);
+      for (const HalfEdge* p=this->next(); p!=this; p=p->next())
+        bounds.extend(p->get1RingBounds(vertices));
+      return bounds;
+    }
+    
+    /*! tests if this is a valid patch */
+    __forceinline bool valid(const BufferView<Vec3fa>& vertices) const
+    {
+      size_t N = 1;
+      if (!this->validRing(vertices)) return false;
+      for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++) {
+        if (!p->validRing(vertices)) return false;
+      }
+      return N >= 3 && N <= MAX_PATCH_VALENCE;
+    }
+    
+    /*! counts number of polygon edges  */
+    __forceinline unsigned int numEdges() const
+    {
+      unsigned int N = 1;
+      for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++);
+      return N;
+    }
+
+    /*! calculates face and edge valence */
+    __forceinline void calculateFaceValenceAndEdgeValence(size_t& faceValence, size_t& edgeValence) const 
+    {
+      faceValence = 0;
+      edgeValence = 0;
+      
+      const HalfEdge* p = this;
+      do 
+      {
+         /* calculate bounds of current face */
+        unsigned int numEdges = p->numEdges();
+        assert(numEdges >= 3);
+        edgeValence += numEdges-2;
+        
+        faceValence++;
+        p = p->prev();
+        
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else {
+          faceValence++;
+          edgeValence++;
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+        
+      } while (p != this); 
+    }
+
+    /*! stream output */
+    friend __forceinline std::ostream &operator<<(std::ostream &o, const HalfEdge &h)
+    {
+      return o << "{ " << 
+        "vertex = " << h.vtx_index << ", " << //" -> " << h.next()->vtx_index << ", " << 
+        "prev = " << h.prev_half_edge_ofs << ", " << 
+        "next = " << h.next_half_edge_ofs << ", " << 
+        "opposite = " << h.opposite_half_edge_ofs << ", " << 
+        "edge_crease = " << h.edge_crease_weight << ", " << 
+        "vertex_crease = " << h.vertex_crease_weight << ", " << 
+        //"edge_level = " << h.edge_level << 
+        " }";
+    } 
+    
+  private:
+    
+    /*! calculates the bounds of the face associated with the half-edge */
+    __forceinline BBox3fa getFaceBounds(const BufferView<Vec3fa>& vertices) const 
+    {
+      BBox3fa b = vertices[getStartVertexIndex()];
+      for (const HalfEdge* p = next(); p!=this; p=p->next()) {
+        b.extend(vertices[p->getStartVertexIndex()]);
+      }
+      return b;
+    }
+    
+    /*! calculates the bounds of the 1-ring associated with the vertex of the half-edge */
+    __forceinline BBox3fa get1RingBounds(const BufferView<Vec3fa>& vertices) const 
+    {
+      BBox3fa bounds = empty;
+      const HalfEdge* p = this;
+      do 
+      {
+        /* calculate bounds of current face */
+        bounds.extend(p->getFaceBounds(vertices));
+        p = p->prev();
+        
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else {
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+        
+      } while (p != this); 
+      
+      return bounds;
+    }
+    
+    /*! tests if this is a valid face */
+    __forceinline bool validFace(const BufferView<Vec3fa>& vertices, size_t& N) const 
+    {
+      const Vec3fa v = vertices[getStartVertexIndex()];
+      if (!isvalid(v)) return false;
+      size_t n = 1;
+      for (const HalfEdge* p = next(); p!=this; p=p->next(), n++) {
+        const Vec3fa v = vertices[p->getStartVertexIndex()];
+        if (!isvalid(v)) return false;
+      }
+      N += n-2;
+      return n >= 3 && n <= MAX_PATCH_VALENCE;
+    }
+    
+    /*! tests if this is a valid ring */
+    __forceinline bool validRing(const BufferView<Vec3fa>& vertices) const 
+    {
+      size_t faceValence = 0;
+      size_t edgeValence = 0;
+      
+      const HalfEdge* p = this;
+      do 
+      {
+        /* calculate bounds of current face */
+        if (!p->validFace(vertices,edgeValence)) 
+          return false;
+        
+        faceValence++;
+        p = p->prev();
+        
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else {
+          faceValence++;
+          edgeValence++;
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+        
+      } while (p != this); 
+      
+      return faceValence <= MAX_RING_FACE_VALENCE && edgeValence <= MAX_RING_EDGE_VALENCE;
+    }
+    
+  private:
+    unsigned int vtx_index;         //!< index of edge start vertex
+    int next_half_edge_ofs;         //!< relative offset to next half edge of face
+    int prev_half_edge_ofs;         //!< relative offset to previous half edge of face
+    int opposite_half_edge_ofs;     //!< relative offset to opposite half edge
+    
+  public:
+    float edge_crease_weight;       //!< crease weight attached to edge
+    float vertex_crease_weight;     //!< crease weight attached to start vertex
+    float edge_level;               //!< subdivision factor for edge
+    PatchType patch_type;           //!< stores type of subdiv patch
+    VertexType vertex_type;         //!< stores type of the start vertex
+    char align[2];
+  };
+}
diff --git a/thirdparty/embree/kernels/subdiv/hermite_curve.h b/thirdparty/embree/kernels/subdiv/hermite_curve.h
new file mode 100644
index 0000000000..ffef5a4315
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/hermite_curve.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  template<typename Vertex>
+    struct HermiteCurveT : BezierCurveT<Vertex>
+    {
+      __forceinline HermiteCurveT() {}
+
+      __forceinline HermiteCurveT(const BezierCurveT<Vertex>& curve)
+        : BezierCurveT<Vertex>(curve) {}
+      
+      __forceinline HermiteCurveT(const Vertex& v0, const Vertex& t0, const Vertex& v1, const Vertex& t1)
+        : BezierCurveT<Vertex>(v0,madd(1.0f/3.0f,t0,v0),nmadd(1.0f/3.0f,t1,v1),v1) {}
+
+      __forceinline HermiteCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,this->v0-p), this->v0.w);
+        const Vec3ff q1(xfmVector(space,this->v1-p), this->v1.w);
+        const Vec3ff q2(xfmVector(space,this->v2-p), this->v2.w);
+        const Vec3ff q3(xfmVector(space,this->v3-p), this->v3.w);
+        return BezierCurveT<Vec3ff>(q0,q1,q2,q3);
+      }
+    };
+
+  template<typename CurveGeometry>
+  __forceinline HermiteCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const HermiteCurveT<Vec3ff>& curve) {
+    return HermiteCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,BezierCurveT<Vec3ff>(curve)));
+  }
+  
+  typedef HermiteCurveT<Vec3fa> HermiteCurve3fa;
+}
+
diff --git a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
new file mode 100644
index 0000000000..f8e8a25f35
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
@@ -0,0 +1,403 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bezier_curve.h"
+
+namespace embree
+{
+  namespace isa
+  {   
+    template<typename V>
+      struct TensorLinearQuadraticBezierSurface
+      {
+        QuadraticBezierCurve<V> L;
+        QuadraticBezierCurve<V> R;
+        
+        __forceinline TensorLinearQuadraticBezierSurface() {}
+        
+        __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<V>& curve)
+          : L(curve.L), R(curve.R) {}
+        
+        __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) {
+          L = other.L; R = other.R; return *this;
+        }
+          
+          __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<V>& L, const QuadraticBezierCurve<V>& R)
+            : L(L), R(R) {}
+        
+        __forceinline BBox<V> bounds() const {
+          return merge(L.bounds(),R.bounds());
+        }
+      };
+    
+    template<>
+      struct TensorLinearQuadraticBezierSurface<Vec2fa>
+    {
+      QuadraticBezierCurve<vfloat4> LR;
+      
+      __forceinline TensorLinearQuadraticBezierSurface() {}
+      
+      __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<Vec2fa>& curve)
+        : LR(curve.LR) {}
+      
+      __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) {
+        LR = other.LR; return *this;
+      }
+      
+      __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<vfloat4>& LR)
+        : LR(LR) {}
+      
+      __forceinline BBox<Vec2fa> bounds() const
+      {
+        const BBox<vfloat4> b = LR.bounds();
+        const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper));
+        const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper)));
+        return merge(bl,br);
+      }
+    };
+    
+    template<typename V>
+      struct TensorLinearCubicBezierSurface
+      {
+        CubicBezierCurve<V> L;
+        CubicBezierCurve<V> R;
+        
+        __forceinline TensorLinearCubicBezierSurface() {}
+        
+        __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve)
+          : L(curve.L), R(curve.R) {}
+        
+        __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) {
+          L = other.L; R = other.R; return *this;
+        }
+          
+        __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<V>& L, const CubicBezierCurve<V>& R)
+          : L(L), R(R) {}
+
+        template<template<typename T> class SourceCurve>
+        __forceinline static TensorLinearCubicBezierSurface fromCenterAndNormalCurve(const SourceCurve<Vec3ff>& center, const SourceCurve<Vec3fa>& normal)
+        {
+          SourceCurve<Vec3ff> vcurve = center;
+          SourceCurve<Vec3fa> ncurve = normal;
+          
+          /* here we construct a patch which follows the curve l(t) =
+           * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */
+          
+          const Vec3ff p0   = vcurve.eval(0.0f);
+          const Vec3ff dp0  = vcurve.eval_du(0.0f);
+          const Vec3ff ddp0 = vcurve.eval_dudu(0.0f);
+
+          const Vec3fa n0   = ncurve.eval(0.0f);
+          const Vec3fa dn0  = ncurve.eval_du(0.0f);
+
+          const Vec3ff p1   = vcurve.eval(1.0f);
+          const Vec3ff dp1  = vcurve.eval_du(1.0f);
+          const Vec3ff ddp1 = vcurve.eval_dudu(1.0f);
+
+          const Vec3fa n1   = ncurve.eval(1.0f);
+          const Vec3fa dn1  = ncurve.eval_du(1.0f);
+
+          const Vec3fa bt0  = cross(n0,dp0);
+          const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0);
+
+          const Vec3fa bt1  = cross(n1,dp1);
+          const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1);
+            
+          const Vec3fa k0  = normalize(bt0);
+          const Vec3fa dk0 = dnormalize(bt0,dbt0);
+          
+          const Vec3fa k1 = normalize(bt1);
+          const Vec3fa dk1 = dnormalize(bt1,dbt1);
+                    
+          const Vec3fa l0 = p0 - p0.w*k0;
+          const Vec3fa dl0 = dp0 - (dp0.w*k0 + p0.w*dk0);
+
+          const Vec3fa r0 = p0 + p0.w*k0;
+          const Vec3fa dr0 = dp0 + (dp0.w*k0 + p0.w*dk0);
+
+          const Vec3fa l1 = p1 - p1.w*k1;
+          const Vec3fa dl1 = dp1 - (dp1.w*k1 + p1.w*dk1);
+
+          const Vec3fa r1 = p1 + p1.w*k1;
+          const Vec3fa dr1 = dp1 + (dp1.w*k1 + p1.w*dk1);
+
+          const float scale = 1.0f/3.0f;
+          CubicBezierCurve<V> L(l0,l0+scale*dl0,l1-scale*dl1,l1);
+          CubicBezierCurve<V> R(r0,r0+scale*dr0,r1-scale*dr1,r1);
+          return TensorLinearCubicBezierSurface(L,R);
+        }
+
+        __forceinline BBox<V> bounds() const {
+          return merge(L.bounds(),R.bounds());
+        }
+
+        __forceinline BBox3fa accurateBounds() const {
+          return merge(L.accurateBounds(),R.accurateBounds());
+        }
+        
+        __forceinline CubicBezierCurve<Interval1f> reduce_v() const {
+          return merge(CubicBezierCurve<Interval<V>>(L),CubicBezierCurve<Interval<V>>(R));
+        }
+        
+        __forceinline LinearBezierCurve<Interval1f> reduce_u() const {
+          return LinearBezierCurve<Interval1f>(L.bounds(),R.bounds());
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx) const {
+          return TensorLinearCubicBezierSurface<float>(L.xfm(dx),R.xfm(dx));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<vfloatx> vxfm(const V& dx) const {
+          return TensorLinearCubicBezierSurface<vfloatx>(L.vxfm(dx),R.vxfm(dx));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx, const V& p) const {
+          return TensorLinearCubicBezierSurface<float>(L.xfm(dx,p),R.xfm(dx,p));
+        }
+
+        __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space) const {
+          return TensorLinearCubicBezierSurface(L.xfm(space),R.xfm(space));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const {
+          return TensorLinearCubicBezierSurface(L.xfm(space,p),R.xfm(space,p));
+        }
+
+        __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const {
+          return TensorLinearCubicBezierSurface(L.xfm(space,p,s),R.xfm(space,p,s));
+        }
+
+        __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const {
+          return TensorLinearCubicBezierSurface(L.clip(u),R.clip(u));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const {
+          return TensorLinearCubicBezierSurface(clerp(L,R,V(v.lower)),clerp(L,R,V(v.upper)));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const {
+          return clip_v(v).clip_u(u);
+        }
+        
+        __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const
+        {
+          CubicBezierCurve<V> L0,L1; L.split(L0,L1,u);
+          CubicBezierCurve<V> R0,R1; R.split(R0,R1,u);
+          new (&left ) TensorLinearCubicBezierSurface(L0,R0);
+          new (&right) TensorLinearCubicBezierSurface(L1,R1);
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const {
+          valid = true; clear(valid,VSIZEX-1);
+          return TensorLinearCubicBezierSurface<Vec2vfx>(L.split(u),R.split(u));
+        }
+        
+        __forceinline V eval(const float u, const float v) const {
+          return clerp(L,R,V(v)).eval(u);
+        }
+        
+        __forceinline V eval_du(const float u, const float v) const {
+          return clerp(L,R,V(v)).eval_dt(u);
+        }
+        
+        __forceinline V eval_dv(const float u, const float v) const {
+          return (R-L).eval(u);
+        }
+        
+        __forceinline void eval(const float u, const float v, V& p, V& dpdu, V& dpdv) const
+        {
+          V p0, dp0du; L.eval(u,p0,dp0du);
+          V p1, dp1du; R.eval(u,p1,dp1du);
+          p = lerp(p0,p1,v);
+          dpdu = lerp(dp0du,dp1du,v);
+          dpdv = p1-p0;
+        }
+        
+        __forceinline TensorLinearQuadraticBezierSurface<V> derivative_u() const {
+          return TensorLinearQuadraticBezierSurface<V>(L.derivative(),R.derivative());
+        }
+        
+        __forceinline CubicBezierCurve<V> derivative_v() const {
+          return R-L;
+        }
+        
+        __forceinline V axis_u() const {
+          return (L.end()-L.begin())+(R.end()-R.begin());
+        }
+        
+        __forceinline V axis_v() const {
+          return (R.begin()-L.begin())+(R.end()-L.end());
+        }
+        
+        friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a)
+        {
+          return cout << "TensorLinearCubicBezierSurface" << embree_endl
+                      << "{" << embree_endl
+                      << "  L = " << a.L << ", " << embree_endl
+                      << "  R = " << a.R << embree_endl
+                      << "}";
+        }
+
+        friend __forceinline TensorLinearCubicBezierSurface clerp(const TensorLinearCubicBezierSurface& a, const TensorLinearCubicBezierSurface& b, const float t) {
+          return TensorLinearCubicBezierSurface(clerp(a.L,b.L,V(t)), clerp(a.R,b.R,V(t)));
+        }
+      };
+    
+    template<>
+      struct TensorLinearCubicBezierSurface<Vec2fa>
+    {
+      CubicBezierCurve<vfloat4> LR;
+      
+      __forceinline TensorLinearCubicBezierSurface() {}
+      
+      __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve)
+        : LR(curve.LR) {}
+      
+      __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) {
+        LR = other.LR; return *this;
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<vfloat4>& LR)
+        : LR(LR) {}
+      
+      __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<Vec2fa>& L, const CubicBezierCurve<Vec2fa>& R)
+        : LR(shuffle<0,1,0,1>(vfloat4(L.v0),vfloat4(R.v0)),shuffle<0,1,0,1>(vfloat4(L.v1),vfloat4(R.v1)),shuffle<0,1,0,1>(vfloat4(L.v2),vfloat4(R.v2)),shuffle<0,1,0,1>(vfloat4(L.v3),vfloat4(R.v3))) {}
+      
+      __forceinline CubicBezierCurve<Vec2fa> getL() const {
+        return CubicBezierCurve<Vec2fa>(Vec2fa(LR.v0),Vec2fa(LR.v1),Vec2fa(LR.v2),Vec2fa(LR.v3));
+      }
+      
+      __forceinline CubicBezierCurve<Vec2fa> getR() const {
+        return CubicBezierCurve<Vec2fa>(Vec2fa(shuffle<2,3,2,3>(LR.v0)),Vec2fa(shuffle<2,3,2,3>(LR.v1)),Vec2fa(shuffle<2,3,2,3>(LR.v2)),Vec2fa(shuffle<2,3,2,3>(LR.v3)));
+      }
+      
+      __forceinline BBox<Vec2fa> bounds() const
+      {
+        const BBox<vfloat4> b = LR.bounds();
+        const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper));
+        const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper)));
+        return merge(bl,br);
+      }
+      
+      __forceinline BBox1f bounds(const Vec2fa& axis) const
+      {
+        const CubicBezierCurve<vfloat4> LRx = LR;
+        const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3));
+        const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(axis)),LRx,shuffle<1>(vfloat4(axis))*LRy);
+        const BBox<vfloat4> Lb = LRa.bounds();
+        const BBox<vfloat4> Rb(shuffle<3>(Lb.lower),shuffle<3>(Lb.upper));
+        const BBox<vfloat4> b = merge(Lb,Rb);
+        return BBox1f(b.lower[0],b.upper[0]);
+      }
+
+      __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx) const
+      {
+        const CubicBezierCurve<vfloat4> LRx = LR;
+        const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3));
+        const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy);
+        return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]),
+                                                     CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2]));
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx, const Vec2fa& p) const
+      {
+        const vfloat4 pxyxy = shuffle<0,1,0,1>(vfloat4(p));
+        const CubicBezierCurve<vfloat4> LRx = LR-pxyxy;
+        const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3));
+        const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy);
+        return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]),
+                                                     CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2]));
+      }
+
+      __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const {
+        return TensorLinearCubicBezierSurface(LR.clip(u));
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const
+      {
+        const CubicBezierCurve<vfloat4> LL(shuffle<0,1,0,1>(LR.v0),shuffle<0,1,0,1>(LR.v1),shuffle<0,1,0,1>(LR.v2),shuffle<0,1,0,1>(LR.v3));
+        const CubicBezierCurve<vfloat4> RR(shuffle<2,3,2,3>(LR.v0),shuffle<2,3,2,3>(LR.v1),shuffle<2,3,2,3>(LR.v2),shuffle<2,3,2,3>(LR.v3));
+        return TensorLinearCubicBezierSurface(clerp(LL,RR,vfloat4(v.lower,v.lower,v.upper,v.upper)));
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const {
+        return clip_v(v).clip_u(u);
+      }
+      
+      __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const
+      {
+        CubicBezierCurve<vfloat4> LR0,LR1; LR.split(LR0,LR1,u);
+        new (&left ) TensorLinearCubicBezierSurface(LR0);
+        new (&right) TensorLinearCubicBezierSurface(LR1);
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const {
+        valid = true; clear(valid,VSIZEX-1);
+        return TensorLinearCubicBezierSurface<Vec2vfx>(getL().split(u),getR().split(u));
+      }
+      
+      __forceinline Vec2fa eval(const float u, const float v) const
+      {
+        const vfloat4 p = LR.eval(u);
+        return Vec2fa(lerp(shuffle<0,1,0,1>(p),shuffle<2,3,2,3>(p),v));
+      }
+      
+      __forceinline Vec2fa eval_du(const float u, const float v) const
+      {
+        const vfloat4 dpdu = LR.eval_dt(u);
+        return Vec2fa(lerp(shuffle<0,1,0,1>(dpdu),shuffle<2,3,2,3>(dpdu),v));
+      }
+      
+      __forceinline Vec2fa eval_dv(const float u, const float v) const
+      {
+        const vfloat4 p = LR.eval(u);
+        return Vec2fa(shuffle<2,3,2,3>(p)-shuffle<0,1,0,1>(p));
+      }
+      
+      __forceinline void eval(const float u, const float v, Vec2fa& p, Vec2fa& dpdu, Vec2fa& dpdv) const
+      {
+        vfloat4 p0, dp0du; LR.eval(u,p0,dp0du);
+        p = Vec2fa(lerp(shuffle<0,1,0,1>(p0),shuffle<2,3,2,3>(p0),v));
+        dpdu = Vec2fa(lerp(shuffle<0,1,0,1>(dp0du),shuffle<2,3,2,3>(dp0du),v));
+        dpdv = Vec2fa(shuffle<2,3,2,3>(p0)-shuffle<0,1,0,1>(p0));
+      }
+      
+      __forceinline TensorLinearQuadraticBezierSurface<Vec2fa> derivative_u() const {
+        return TensorLinearQuadraticBezierSurface<Vec2fa>(LR.derivative());
+      }
+      
+      __forceinline CubicBezierCurve<Vec2fa> derivative_v() const {
+        return getR()-getL();
+      }
+      
+      __forceinline Vec2fa axis_u() const
+      {
+        const CubicBezierCurve<Vec2fa> L = getL();
+        const CubicBezierCurve<Vec2fa> R = getR();
+        return (L.end()-L.begin())+(R.end()-R.begin());
+      }
+      
+      __forceinline Vec2fa axis_v() const
+      {
+        const CubicBezierCurve<Vec2fa> L = getL();
+        const CubicBezierCurve<Vec2fa> R = getR();
+        return (R.begin()-L.begin())+(R.end()-L.end());
+      }
+      
+      friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a)
+      {
+        return cout << "TensorLinearCubicBezierSurface" << embree_endl
+                    << "{" << embree_endl
+                    << "  L = " << a.getL() << ", " << embree_endl
+                    << "  R = " << a.getR() << embree_endl
+                    << "}";
+      }
+    };
+
+    typedef TensorLinearCubicBezierSurface<float> TensorLinearCubicBezierSurface1f;
+    typedef TensorLinearCubicBezierSurface<Vec2fa> TensorLinearCubicBezierSurface2fa;
+    typedef TensorLinearCubicBezierSurface<Vec3fa> TensorLinearCubicBezierSurface3fa;
+  }
+}
diff --git a/thirdparty/embree/kernels/subdiv/patch.h b/thirdparty/embree/kernels/subdiv/patch.h
new file mode 100644
index 0000000000..c4340ea9b6
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/patch.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bilinear_patch.h"
+#include "bspline_patch.h"
+#include "bezier_patch.h"
+#include "gregory_patch.h"
+#include "tessellation_cache.h"
+
+#if 1
+#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z)
+#else
+#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z)            \
+  {                                                   \
+    size_t hex = (size_t)ptr;                          \
+    for (size_t i=0; i<4; i++) hex = hex ^ (hex >> 8);  \
+    const float c = (float)(((hex >> 0) ^ (hex >> 4) ^ (hex >> 8) ^ (hex >> 12) ^ (hex >> 16))&0xf)/15.0f; \
+    if (P) *P = Vertex(0.5f+0.5f*x,0.5f+0.5f*y,0.5f+0.5f*z,0.0f);         \
+    }               
+#endif
+
+#define PATCH_MAX_CACHE_DEPTH 2
+//#define PATCH_MIN_RESOLUTION 1     // FIXME: not yet completely implemented
+#define PATCH_MAX_EVAL_DEPTH_IRREGULAR 10     // maximum evaluation depth at irregular vertices (has to be larger or equal than PATCH_MAX_CACHE_DEPTH)
+#define PATCH_MAX_EVAL_DEPTH_CREASE 10       // maximum evaluation depth at crease features (has to be larger or equal than PATCH_MAX_CACHE_DEPTH)
+#define PATCH_USE_GREGORY 1        // 0 = no gregory, 1 = fill, 2 = as early as possible
+
+#if PATCH_USE_GREGORY==2
+#define PATCH_USE_BEZIER_PATCH 1   // enable use of bezier instead of b-spline patches
+#else
+#define PATCH_USE_BEZIER_PATCH 0   // enable use of bezier instead of b-spline patches
+#endif
+
+#if PATCH_USE_BEZIER_PATCH
+#  define RegularPatch  BezierPatch
+#  define RegularPatchT BezierPatchT<Vertex,Vertex_t>
+#else
+#  define RegularPatch  BSplinePatch
+#  define RegularPatchT BSplinePatchT<Vertex,Vertex_t>
+#endif
+
+#if PATCH_USE_GREGORY
+#define IrregularFillPatch GregoryPatch
+#define IrregularFillPatchT GregoryPatchT<Vertex,Vertex_t>
+#else
+#define IrregularFillPatch BilinearPatch
+#define IrregularFillPatchT BilinearPatchT<Vertex,Vertex_t>
+#endif
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    struct __aligned(64) PatchT
+    {
+    public:
+    
+    typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+    typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+    typedef BezierCurveT<Vertex> BezierCurve;
+    
+    enum Type {
+      INVALID_PATCH = 0,
+      BILINEAR_PATCH = 1,
+      BSPLINE_PATCH = 2,  
+      BEZIER_PATCH = 3,  
+      GREGORY_PATCH = 4,
+      SUBDIVIDED_GENERAL_PATCH = 7,
+      SUBDIVIDED_QUAD_PATCH = 8,
+      EVAL_PATCH = 9,
+    };
+    
+    struct Ref
+    {
+      __forceinline Ref(void* p = nullptr) 
+        : ptr((size_t)p) {}
+
+      __forceinline operator bool() const { return ptr != 0; }
+      __forceinline operator size_t() const { return ptr; }
+
+      __forceinline Ref (Type ty, void* in) 
+        : ptr(((size_t)in)+ty) { assert((((size_t)in) & 0xF) == 0); }
+
+      __forceinline Type  type  () const { return (Type)(ptr & 0xF); }
+      __forceinline void* object() const { return (void*) (ptr & ~0xF); }
+
+      size_t ptr;
+    };
+
+    struct EvalPatch 
+    {
+      /* creates EvalPatch from a CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch) 
+      {
+        size_t ofs = 0, bytes = patch.bytes();
+        void* ptr = alloc(bytes);
+        patch.serialize(ptr,ofs);
+        assert(ofs == bytes);
+        return Ref(EVAL_PATCH, ptr);
+      }
+    };
+
+    struct BilinearPatch 
+    {
+      /* creates BilinearPatch from a CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(patch));
+      }
+
+      __forceinline BilinearPatch (const CatmullClarkPatch& patch) 
+        : patch(patch) {}
+
+      /* creates BilinearPatch from 4 vertices */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(edge,vertices,stride));
+      }
+      
+      __forceinline BilinearPatch (const HalfEdge* edge, const char* vertices, size_t stride) 
+        : patch(edge,vertices,stride) {}
+      
+    public:
+      BilinearPatchT<Vertex,Vertex_t> patch;
+    };
+    
+    struct BSplinePatch 
+    {
+      /* creates BSplinePatch from a half edge */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(edge,vertices,stride));
+      }
+      
+      __forceinline BSplinePatch (const HalfEdge* edge, const char* vertices, size_t stride) 
+        : patch(edge,vertices,stride) {}
+      
+      /* creates BSplinePatch from a CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(patch,border0,border1,border2,border3));
+      }
+      
+      __forceinline BSplinePatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) 
+        : patch(patch,border0,border1,border2,border3) {}
+      
+    public:
+      BSplinePatchT<Vertex,Vertex_t> patch;
+    };
+
+    struct BezierPatch
+    {
+      /* creates BezierPatch from a half edge */
+      template<typename Allocator>
+        __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(edge,vertices,stride));
+      }
+      
+      __forceinline BezierPatch (const HalfEdge* edge, const char* vertices, size_t stride) 
+        : patch(edge,vertices,stride) {}
+      
+      /* creates Bezier from a CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(patch,border0,border1,border2,border3));
+      }
+      
+      __forceinline BezierPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) 
+        : patch(patch,border0,border1,border2,border3) {}
+      
+    public:
+      BezierPatchT<Vertex,Vertex_t> patch;
+    };
+    
+    struct GregoryPatch
+    {
+      /* creates GregoryPatch from half edge */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(edge,vertices,stride));
+      }
+      
+      __forceinline GregoryPatch (const HalfEdge* edge, const char* vertices, size_t stride) 
+        : patch(CatmullClarkPatch(edge,vertices,stride)) {}
+       
+      /* creates GregoryPatch from CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(patch,border0,border1,border2,border3));
+      }
+      
+      __forceinline GregoryPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) 
+        : patch(patch,border0,border1,border2,border3) {}
+      
+    public:
+      GregoryPatchT<Vertex,Vertex_t> patch;
+    };
+    
+    struct SubdividedQuadPatch
+    {
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, Ref children[4]) {
+        return Ref(SUBDIVIDED_QUAD_PATCH, new (alloc(sizeof(SubdividedQuadPatch))) SubdividedQuadPatch(children));
+      }
+      
+      __forceinline SubdividedQuadPatch(Ref children[4]) {
+        for (size_t i=0; i<4; i++) child[i] = children[i];
+      }
+      
+    public:
+      Ref child[4];
+    };
+    
+    struct SubdividedGeneralPatch
+    {
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, Ref* children, const unsigned N) {
+        return Ref(SUBDIVIDED_GENERAL_PATCH, new (alloc(sizeof(SubdividedGeneralPatch))) SubdividedGeneralPatch(children,N));
+      }
+      
+      __forceinline SubdividedGeneralPatch(Ref* children, const unsigned N) : N(N) {
+        for (unsigned i=0; i<N; i++) child[i] = children[i];
+      }
+      
+      unsigned N;
+      Ref child[MAX_PATCH_VALENCE];
+    };
+    
+    /*! Default constructor. */
+    __forceinline PatchT () {}
+    
+    template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride)
+    {
+      if (PATCH_MAX_CACHE_DEPTH == 0) 
+        return nullptr;
+
+      Ref child(0);
+      switch (edge->patch_type) {
+      case HalfEdge::BILINEAR_PATCH:       child = BilinearPatch::create(alloc,edge,vertices,stride); break; 
+      case HalfEdge::REGULAR_QUAD_PATCH:   child = RegularPatch::create(alloc,edge,vertices,stride); break;
+#if PATCH_USE_GREGORY == 2
+      case HalfEdge::IRREGULAR_QUAD_PATCH: child = GregoryPatch::create(alloc,edge,vertices,stride); break;
+#endif
+      default: {
+        GeneralCatmullClarkPatch patch(edge,vertices,stride);
+        child = PatchT::create(alloc,patch,edge,vertices,stride,0);
+      }
+      }
+      return child;
+    }
+
+    template<typename Allocator>
+    __noinline static Ref create(const Allocator& alloc, GeneralCatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth)
+    {  
+      /* convert into standard quad patch if possible */
+      if (likely(patch.isQuadPatch())) 
+      {
+        CatmullClarkPatch qpatch; patch.init(qpatch);
+        return PatchT::create(alloc,qpatch,edge,vertices,stride,depth);
+      }
+   
+      /* do only cache up to some depth */
+      if (depth >= PATCH_MAX_CACHE_DEPTH)
+        return nullptr;
+         
+      /* subdivide patch */
+      unsigned N;
+      array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; 
+      patch.subdivide(patches,N);
+      
+      if (N == 4) 
+      {
+        Ref child[4];
+#if PATCH_USE_GREGORY == 2
+        BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders);
+        BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+        BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r);
+        BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r);
+        BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r);
+        GeneralCatmullClarkPatch::fix_quad_ring_order(patches);
+        child[0] = PatchT::create(alloc,patches[0],edge,vertices,stride,depth+1,&border0l,nullptr,nullptr,&border3r);
+        child[1] = PatchT::create(alloc,patches[1],edge,vertices,stride,depth+1,&border0r,&border1l,nullptr,nullptr);
+        child[2] = PatchT::create(alloc,patches[2],edge,vertices,stride,depth+1,nullptr,&border1r,&border2l,nullptr);
+        child[3] = PatchT::create(alloc,patches[3],edge,vertices,stride,depth+1,nullptr,nullptr,&border2r,&border3l);
+#else
+        GeneralCatmullClarkPatch::fix_quad_ring_order(patches);
+        for (size_t i=0; i<4; i++)
+          child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1);
+#endif
+        return SubdividedQuadPatch::create(alloc,child);
+      }
+      else 
+      {
+        assert(N<MAX_PATCH_VALENCE);
+        Ref child[MAX_PATCH_VALENCE];
+        
+#if PATCH_USE_GREGORY == 2
+        BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; 
+        patch.getLimitBorder(borders);
+
+        for (size_t i0=0; i0<N; i0++) {
+          const size_t i2 = i0==0 ? N-1 : i0-1; 
+          BezierCurve border0l,border0r; borders[i0].subdivide(border0l,border0r);
+          BezierCurve border2l,border2r; borders[i2].subdivide(border2l,border2r);
+          child[i0] = PatchT::create(alloc,patches[i0],edge,vertices,stride,depth+1, &border0l, nullptr, nullptr, &border2r);
+        }
+#else
+        for (size_t i=0; i<N; i++)
+          child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1);
+#endif
+        return SubdividedGeneralPatch::create(alloc,child,N);
+      }
+      
+      return nullptr;
+    }
+
+    static __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) 
+    {
+      const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR;
+//#if PATCH_MIN_RESOLUTION
+//      return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth;
+//#else
+      return depth>=max_eval_depth;
+//#endif
+    }
+
+    template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, CatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth,
+                                   const BezierCurve* border0 = nullptr, const BezierCurve* border1 = nullptr, const BezierCurve* border2 = nullptr, const BezierCurve* border3 = nullptr)
+    {
+      const typename CatmullClarkPatch::Type ty = patch.type();
+      if (unlikely(final(patch,ty,depth))) {
+        if (ty & CatmullClarkRing::TYPE_REGULAR) return RegularPatch::create(alloc,patch,border0,border1,border2,border3); 
+        else                                     return IrregularFillPatch::create(alloc,patch,border0,border1,border2,border3); 
+      }
+      else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+        assert(depth > 0); return RegularPatch::create(alloc,patch,border0,border1,border2,border3); 
+      }
+#if PATCH_USE_GREGORY == 2
+      else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+        assert(depth > 0); return GregoryPatch::create(alloc,patch,border0,border1,border2,border3); 
+      }
+#endif
+      else if (depth >= PATCH_MAX_CACHE_DEPTH) {
+        return EvalPatch::create(alloc,patch); 
+      }
+      
+      else 
+      {
+        Ref child[4];
+        array_t<CatmullClarkPatch,4> patches; 
+        patch.subdivide(patches);
+        
+        for (size_t i=0; i<4; i++)
+          child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1);
+        return SubdividedQuadPatch::create(alloc,child);
+      }
+    }
+  };
+
+  typedef PatchT<Vec3fa,Vec3fa_t> Patch3fa;
+}
diff --git a/thirdparty/embree/kernels/subdiv/patch_eval.h b/thirdparty/embree/kernels/subdiv/patch_eval.h
new file mode 100644
index 0000000000..a3fafa72f4
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/patch_eval.h
@@ -0,0 +1,129 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "feature_adaptive_eval.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Vertex, typename Vertex_t = Vertex>
+      struct PatchEval
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+        
+        PatchEval (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, 
+                   const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, 
+                   Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv)
+        {
+          /* conservative time for the very first allocation */
+          auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+
+          Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () {
+              auto alloc = [&](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); };
+              return Patch::create(alloc,edge,vertices,stride);
+            },true);
+
+          auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+          const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime);
+
+          if (patch && allAllocationsValid &&  eval(patch,u,v,1.0f,0)) {
+            SharedLazyTessellationCache::unlock();
+            return;
+          }
+          SharedLazyTessellationCache::unlock();
+          FeatureAdaptiveEval<Vertex,Vertex_t>(edge,vertices,stride,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv);
+          PATCH_DEBUG_SUBDIVISION(edge,c,-1,-1);
+        }
+        
+        __forceinline bool eval_quad(const typename Patch::SubdividedQuadPatch* This, const float u, const float v, const float dscale, const size_t depth)
+        {
+          if (v < 0.5f) {
+            if (u < 0.5f) return eval(This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1);
+            else          return eval(This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1);
+          } else {
+            if (u > 0.5f) return eval(This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1);
+            else          return eval(This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1);
+          }
+        }
+        
+        bool eval_general(const typename Patch::SubdividedGeneralPatch* This, const float U, const float V, const size_t depth)
+        {
+          const unsigned l = (unsigned) floor(0.5f*U); const float u = 2.0f*frac(0.5f*U)-0.5f; 
+          const unsigned h = (unsigned) floor(0.5f*V); const float v = 2.0f*frac(0.5f*V)-0.5f; 
+          const unsigned i = 4*h+l; assert(i<This->N);
+          return eval(This->child[i],u,v,1.0f,depth+1);
+        }
+        
+        bool eval(Ref This, const float& u, const float& v, const float dscale, const size_t depth) 
+        {
+          if (!This) return false;
+          //PRINT(depth);
+          //PRINT2(u,v);
+          
+          switch (This.type()) 
+          {
+          case Patch::BILINEAR_PATCH: {
+            //PRINT("bilinear");
+            ((typename Patch::BilinearPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+            PATCH_DEBUG_SUBDIVISION(This,-1,c,c);
+            return true;
+          }
+          case Patch::BSPLINE_PATCH: {
+            //PRINT("bspline");
+            ((typename Patch::BSplinePatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale);
+            PATCH_DEBUG_SUBDIVISION(This,-1,c,-1);
+            return true;
+          }
+          case Patch::BEZIER_PATCH: {
+            //PRINT("bezier");
+            ((typename Patch::BezierPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale);
+            PATCH_DEBUG_SUBDIVISION(This,-1,c,-1);
+            return true;
+          }
+          case Patch::GREGORY_PATCH: {
+            //PRINT("gregory");
+            ((typename Patch::GregoryPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+            PATCH_DEBUG_SUBDIVISION(This,-1,-1,c);
+            return true;
+          }
+          case Patch::SUBDIVIDED_QUAD_PATCH: {
+            //PRINT("subdivided quad");
+            return eval_quad(((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth);
+          }
+          case Patch::SUBDIVIDED_GENERAL_PATCH: { 
+            //PRINT("general_patch");
+            assert(dscale == 1.0f); 
+            return eval_general(((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); 
+          }
+          case Patch::EVAL_PATCH: { 
+            //PRINT("eval_patch");
+            CatmullClarkPatch patch; patch.deserialize(This.object());
+            FeatureAdaptiveEval<Vertex,Vertex_t>(patch,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv);
+            return true;
+          }
+          default: 
+            assert(false); 
+            return false;
+          }
+        }
+        
+      private:
+        Vertex* const P;
+        Vertex* const dPdu;
+        Vertex* const dPdv;
+        Vertex* const ddPdudu;
+        Vertex* const ddPdvdv;
+        Vertex* const ddPdudv;
+      };
+  }
+}
+  
diff --git a/thirdparty/embree/kernels/subdiv/patch_eval_grid.h b/thirdparty/embree/kernels/subdiv/patch_eval_grid.h
new file mode 100644
index 0000000000..167e1ebe1c
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/patch_eval_grid.h
@@ -0,0 +1,245 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "feature_adaptive_eval_grid.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    struct PatchEvalGrid
+    {
+      typedef Patch3fa Patch;
+      typedef Patch::Ref Ref;
+      typedef GeneralCatmullClarkPatch3fa GeneralCatmullClarkPatch;
+      typedef CatmullClarkPatch3fa CatmullClarkPatch;
+      typedef BSplinePatch3fa BSplinePatch;
+      typedef BezierPatch3fa BezierPatch;
+      typedef GregoryPatch3fa GregoryPatch;
+      typedef BilinearPatch3fa BilinearPatch;
+
+    private:
+      const unsigned x0,x1;
+      const unsigned y0,y1;
+      const unsigned swidth,sheight;
+      const float rcp_swidth, rcp_sheight;
+      float* const Px;
+      float* const Py;
+      float* const Pz;
+      float* const U;
+      float* const V;
+      float* const Nx;
+      float* const Ny;
+      float* const Nz;
+      const unsigned dwidth,dheight;
+      unsigned count;
+
+    public:      
+
+      PatchEvalGrid (Ref patch, unsigned subPatch,
+                     const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                     float* Px, float* Py, float* Pz, float* U, float* V, 
+                     float* Nx, float* Ny, float* Nz,
+                     const unsigned dwidth, const unsigned dheight)
+      : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), 
+        Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), dheight(dheight), count(0)
+      {
+        assert(swidth < (2<<20) && sheight < (2<<20));
+        const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1)));
+        const BBox2f erange(Vec2f(float(x0),float(y0)),Vec2f((float)x1,(float)y1));
+        bool done MAYBE_UNUSED = eval(patch,subPatch,srange,erange);
+        assert(done);
+        assert(count == (x1-x0+1)*(y1-y0+1));
+      }
+
+      template<typename Patch>
+      __forceinline void evalLocalGrid(const Patch* patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1)
+      {
+        const float scale_x = rcp(srange.upper.x-srange.lower.x);
+        const float scale_y = rcp(srange.upper.y-srange.lower.y);
+        count += (lx1-lx0)*(ly1-ly0);
+        
+#if 0
+        for (unsigned iy=ly0; iy<ly1; iy++) {
+          for (unsigned ix=lx0; ix<lx1; ix++) {
+            const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x);
+            const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y);
+            const Vec3fa p = patch->patch.eval(lu,lv);
+            const float u = float(ix)*rcp_swidth;
+            const float v = float(iy)*rcp_sheight;
+            const int ofs = (iy-y0)*dwidth+(ix-x0);
+            Px[ofs] = p.x;
+            Py[ofs] = p.y;
+            Pz[ofs] = p.z;
+            U[ofs] = u;
+            V[ofs] = v;
+          }
+        }
+#else
+        foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) {
+            const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x);
+            const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y);
+            const Vec3vfx p = patch->patch.eval(lu,lv);
+            Vec3vfx n = zero;
+            if (unlikely(Nx != nullptr)) n = normalize_safe(patch->patch.normal(lu,lv));
+            const vfloatx u = vfloatx(ix)*rcp_swidth;
+            const vfloatx v = vfloatx(iy)*rcp_sheight;
+            const vintx ofs = (iy-y0)*dwidth+(ix-x0);
+            if (likely(all(valid)) && all(iy==iy[0])) {
+              const unsigned ofs2 = ofs[0];
+              vfloatx::storeu(Px+ofs2,p.x);
+              vfloatx::storeu(Py+ofs2,p.y);
+              vfloatx::storeu(Pz+ofs2,p.z);
+              vfloatx::storeu(U+ofs2,u);
+              vfloatx::storeu(V+ofs2,v);
+              if (unlikely(Nx != nullptr)) {
+                vfloatx::storeu(Nx+ofs2,n.x);
+                vfloatx::storeu(Ny+ofs2,n.y);
+                vfloatx::storeu(Nz+ofs2,n.z);
+              }
+            } else {
+              foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) {
+                  const unsigned ofs2 = ofs[j]-j;
+                  vfloatx::storeu(valid,Px+ofs2,p.x);
+                  vfloatx::storeu(valid,Py+ofs2,p.y);
+                  vfloatx::storeu(valid,Pz+ofs2,p.z);
+                  vfloatx::storeu(valid,U+ofs2,u);
+                  vfloatx::storeu(valid,V+ofs2,v);
+                  if (unlikely(Nx != nullptr)) {
+                    vfloatx::storeu(valid,Nx+ofs2,n.x);
+                    vfloatx::storeu(valid,Ny+ofs2,n.y);
+                    vfloatx::storeu(valid,Nz+ofs2,n.z);
+                  }
+                });
+            }
+          });
+#endif
+      }
+
+      bool eval(Ref This, const BBox2f& srange, const BBox2f& erange, const unsigned depth) 
+      {
+        if (erange.empty())
+          return true;
+        
+        const int lx0 = (int) ceilf(erange.lower.x);
+        const int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0));
+        const int ly0 = (int) ceilf(erange.lower.y);
+        const int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0));
+        if (lx0 >= lx1 || ly0 >= ly1) 
+          return true;
+
+        if (!This) 
+          return false;
+        
+        switch (This.type()) 
+        {
+        case Patch::BILINEAR_PATCH: {
+          evalLocalGrid((Patch::BilinearPatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::BSPLINE_PATCH: {
+          evalLocalGrid((Patch::BSplinePatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::BEZIER_PATCH: {
+          evalLocalGrid((Patch::BezierPatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::GREGORY_PATCH: {
+          evalLocalGrid((Patch::GregoryPatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::SUBDIVIDED_QUAD_PATCH: 
+        {
+          const Vec2f c = srange.center();
+          const BBox2f srange0(srange.lower,c);
+          const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y));
+          const BBox2f srange2(c,srange.upper);
+          const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y));
+          
+          Patch::SubdividedQuadPatch* patch = (Patch::SubdividedQuadPatch*)This.object();
+          eval(patch->child[0],srange0,intersect(srange0,erange),depth+1);
+          eval(patch->child[1],srange1,intersect(srange1,erange),depth+1);
+          eval(patch->child[2],srange2,intersect(srange2,erange),depth+1);
+          eval(patch->child[3],srange3,intersect(srange3,erange),depth+1);
+          return true;
+        }
+        case Patch::EVAL_PATCH: { 
+          CatmullClarkPatch patch; patch.deserialize(This.object());
+          FeatureAdaptiveEvalGrid(patch,srange,erange,depth,x0,x1,y0,y1,swidth,sheight,Px,Py,Pz,U,V,Nx,Ny,Nz,dwidth,dheight);
+          count += (lx1-lx0)*(ly1-ly0);
+          return true;
+        }
+        default: 
+          assert(false); 
+          return false;
+        }
+      }
+
+      bool eval(Ref This, unsigned subPatch, const BBox2f& srange, const BBox2f& erange) 
+      {
+        if (!This) 
+          return false;
+
+        switch (This.type()) 
+        {
+        case Patch::SUBDIVIDED_GENERAL_PATCH: { 
+          Patch::SubdividedGeneralPatch* patch = (Patch::SubdividedGeneralPatch*)This.object();
+          assert(subPatch < patch->N);
+          return eval(patch->child[subPatch],srange,erange,1);
+        }
+        default: 
+          assert(subPatch == 0);
+          return eval(This,srange,erange,0);
+        }
+      }
+    };
+
+    __forceinline unsigned patch_eval_subdivision_count (const HalfEdge* h)
+    {
+      const unsigned N = h->numEdges();
+      if (N == 4) return 1;
+      else return N;
+    }
+    
+    template<typename Tessellator>
+      inline void patch_eval_subdivision (const HalfEdge* h, Tessellator tessellator)
+    {
+      const unsigned N = h->numEdges();
+      int neighborSubdiv[GeneralCatmullClarkPatch3fa::SIZE]; // FIXME: use array_t
+      float levels[GeneralCatmullClarkPatch3fa::SIZE];
+      for (unsigned i=0; i<N; i++) {
+        assert(i<GeneralCatmullClarkPatch3fa::SIZE);
+        neighborSubdiv[i] = h->hasOpposite() ? h->opposite()->numEdges() != 4 : 0; 
+        levels[i] = h->edge_level;
+        h = h->next();
+      }      
+      if (N == 4)
+      {
+        const Vec2f uv[4] = { Vec2f(0.0f,0.0f), Vec2f(1.0f,0.0f), Vec2f(1.0f,1.0f), Vec2f(0.0f,1.0f) };
+        tessellator(uv,neighborSubdiv,levels,0);
+      }
+      else
+      {
+        for (unsigned i=0; i<N; i++) 
+        {
+          assert(i<MAX_PATCH_VALENCE);
+          static_assert(MAX_PATCH_VALENCE <= 16, "MAX_PATCH_VALENCE > 16");
+          const int h = (i >> 2) & 3, l = i & 3;
+          const Vec2f subPatchID((float)l,(float)h);
+          const Vec2f uv[4] = { 2.0f*subPatchID + (0.5f+Vec2f(0.0f,0.0f)),
+                                2.0f*subPatchID + (0.5f+Vec2f(1.0f,0.0f)),
+                                2.0f*subPatchID + (0.5f+Vec2f(1.0f,1.0f)),
+                                2.0f*subPatchID + (0.5f+Vec2f(0.0f,1.0f)) };
+          const int neighborSubdiv1[4] = { 0,0,0,0 }; 
+          const float levels1[4] = { 0.5f*levels[(i+0)%N], 0.5f*levels[(i+0)%N], 0.5f*levels[(i+N-1)%N], 0.5f*levels[(i+N-1)%N] };
+          tessellator(uv,neighborSubdiv1,levels1,i);
+        }
+      }
+    }
+  }
+}
+
diff --git a/thirdparty/embree/kernels/subdiv/patch_eval_simd.h b/thirdparty/embree/kernels/subdiv/patch_eval_simd.h
new file mode 100644
index 0000000000..fef88a4492
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/patch_eval_simd.h
@@ -0,0 +1,127 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "feature_adaptive_eval_simd.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex>
+      struct PatchEvalSimd
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+
+        PatchEvalSimd (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, 
+                       const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid0, const vfloat& u, const vfloat& v, 
+                       float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N)
+        {
+          /* conservative time for the very first allocation */
+          auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+
+          Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () {
+              auto alloc = [](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); };
+              return Patch::create(alloc,edge,vertices,stride);
+            }, true);
+
+          auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+          const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime);
+          
+          patch = allAllocationsValid ? patch : nullptr;
+
+          /* use cached data structure for calculations */
+          const vbool valid1 = patch ? eval(valid0,patch,u,v,1.0f,0) : vbool(false);
+          SharedLazyTessellationCache::unlock();
+          const vbool valid2 = valid0 & !valid1;
+          if (any(valid2)) {
+            FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(edge,vertices,stride,valid2,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N);
+          }
+        }
+        
+        vbool eval_quad(const vbool& valid, const typename Patch::SubdividedQuadPatch* This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth)
+        {
+          vbool ret = false;
+          const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f;
+          const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f;
+          const vbool u0v0_mask = valid & u0_mask & v0_mask;
+          const vbool u0v1_mask = valid & u0_mask & v1_mask;
+          const vbool u1v0_mask = valid & u1_mask & v0_mask;
+          const vbool u1v1_mask = valid & u1_mask & v1_mask;
+          if (any(u0v0_mask)) ret |= eval(u0v0_mask,This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1);
+          if (any(u1v0_mask)) ret |= eval(u1v0_mask,This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1);
+          if (any(u1v1_mask)) ret |= eval(u1v1_mask,This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1);
+          if (any(u0v1_mask)) ret |= eval(u0v1_mask,This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1);
+          return ret;
+        }
+        
+        vbool eval_general(const vbool& valid, const typename Patch::SubdividedGeneralPatch* patch, const vfloat& U, const vfloat& V, const size_t depth)
+        {
+          vbool ret = false;
+          const vint l = (vint)floor(0.5f*U); const vfloat u = 2.0f*frac(0.5f*U)-0.5f; 
+          const vint h = (vint)floor(0.5f*V); const vfloat v = 2.0f*frac(0.5f*V)-0.5f; 
+          const vint i = (h<<2)+l; assert(all(valid,i<patch->N));
+          foreach_unique(valid,i,[&](const vbool& valid, const int i) {
+              ret |= eval(valid,patch->child[i],u,v,1.0f,depth+1);
+            });
+          return ret;
+        }
+        
+        vbool eval(const vbool& valid, Ref This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) 
+        {
+          if (!This) return false;
+          switch (This.type()) 
+          {
+          case Patch::BILINEAR_PATCH: {
+            ((typename Patch::BilinearPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); 
+            return valid;
+          }
+          case Patch::BSPLINE_PATCH: {
+            ((typename Patch::BSplinePatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            return valid;
+          }
+          case Patch::BEZIER_PATCH: {
+            ((typename Patch::BezierPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            return valid;
+          }
+          case Patch::GREGORY_PATCH: {
+            ((typename Patch::GregoryPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); 
+            return valid;
+          }
+          case Patch::SUBDIVIDED_QUAD_PATCH: {
+            return eval_quad(valid,((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth);
+          }
+          case Patch::SUBDIVIDED_GENERAL_PATCH: { 
+            assert(dscale == 1.0f); 
+            return eval_general(valid,((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); 
+          }
+          case Patch::EVAL_PATCH: { 
+            CatmullClarkPatch patch; patch.deserialize(This.object());
+            FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(patch,valid,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N);
+            return valid;
+          }
+          default: 
+            assert(false); 
+            return false;
+          }
+        }
+
+      private:
+        float* const P;
+        float* const dPdu;
+        float* const dPdv;
+        float* const ddPdudu;
+        float* const ddPdvdv;
+        float* const ddPdudv;
+        const size_t dstride;
+        const size_t N;
+      };
+  }
+}
diff --git a/thirdparty/embree/kernels/subdiv/subdivpatch1base.h b/thirdparty/embree/kernels/subdiv/subdivpatch1base.h
new file mode 100644
index 0000000000..c3069dadee
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/subdivpatch1base.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../geometry/primitive.h"
+#include "bspline_patch.h"
+#include "bezier_patch.h"
+#include "gregory_patch.h"
+#include "gregory_patch_dense.h"
+#include "tessellation.h"
+#include "tessellation_cache.h"
+#include "gridrange.h"
+#include "patch_eval_grid.h"
+#include "feature_adaptive_eval_grid.h"
+#include "../common/scene_subdiv_mesh.h"
+
+namespace embree
+{
+  struct __aligned(64) SubdivPatch1Base
+  {
+  public:
+
+    enum Type {
+      INVALID_PATCH          = 0,
+      BSPLINE_PATCH          = 1,  
+      BEZIER_PATCH           = 2,  
+      GREGORY_PATCH          = 3,
+      EVAL_PATCH             = 5,
+      BILINEAR_PATCH         = 6,
+    };
+
+    enum Flags {
+      TRANSITION_PATCH       = 16, 
+    };
+
+    /*! Default constructor. */
+    __forceinline SubdivPatch1Base () {}
+
+    SubdivPatch1Base (const unsigned int gID,
+                      const unsigned int pID,
+                      const unsigned int subPatch,
+                      const SubdivMesh *const mesh,
+                      const size_t time,
+                      const Vec2f uv[4],
+                      const float edge_level[4],
+                      const int subdiv[4],
+                      const int simd_width);
+
+    __forceinline bool needsStitching() const {
+      return flags & TRANSITION_PATCH;      
+    }
+
+    __forceinline Vec2f getUV(const size_t i) const {
+      return Vec2f((float)u[i],(float)v[i]) * (8.0f/0x10000);
+    }
+
+    static void computeEdgeLevels(const float edge_level[4], const int subdiv[4], float level[4]);
+    static Vec2i computeGridSize(const float level[4]);
+    bool updateEdgeLevels(const float edge_level[4], const int subdiv[4], const SubdivMesh *const mesh, const int simd_width);
+
+  public:
+
+    __forceinline size_t getGridBytes() const {
+      const size_t grid_size_xyzuv = (grid_size_simd_blocks * VSIZEX) * 4;
+      return 64*((grid_size_xyzuv+15) / 16);
+    }
+
+    __forceinline void write_lock()     { mtx.lock();   }
+    __forceinline void write_unlock()   { mtx.unlock(); }
+    __forceinline bool try_write_lock() { return mtx.try_lock(); }
+    //__forceinline bool try_read_lock()  { return mtx.try_read_lock(); }
+
+    __forceinline void resetRootRef() {
+      //assert( mtx.hasInitialState() );
+      root_ref = SharedLazyTessellationCache::Tag();
+    }
+
+    __forceinline SharedLazyTessellationCache::CacheEntry& entry() {
+      return (SharedLazyTessellationCache::CacheEntry&) root_ref;
+    }
+
+  public:    
+    __forceinline unsigned int geomID() const  {
+      return geom;
+    } 
+
+    __forceinline unsigned int primID() const  {
+      return prim;
+    } 
+
+  public:
+    SharedLazyTessellationCache::Tag root_ref;
+    SpinLock mtx;
+
+    unsigned short u[4];                        //!< 16bit discretized u,v coordinates
+    unsigned short v[4];
+    float level[4];
+
+    unsigned char flags;
+    unsigned char type;
+    unsigned short grid_u_res;
+    unsigned int geom;                          //!< geometry ID of the subdivision mesh this patch belongs to
+    unsigned int prim;                          //!< primitive ID of this subdivision patch
+    unsigned short grid_v_res;
+
+    unsigned short grid_size_simd_blocks;
+    unsigned int time_;
+
+    struct PatchHalfEdge {
+      const HalfEdge* edge;
+      unsigned subPatch;
+    };
+
+    Vec3fa patch_v[4][4];
+
+    const HalfEdge *edge() const { return ((PatchHalfEdge*)patch_v)->edge; }
+    unsigned time() const { return time_; }
+    unsigned subPatch() const { return ((PatchHalfEdge*)patch_v)->subPatch; }
+
+    void set_edge(const HalfEdge *h) const { ((PatchHalfEdge*)patch_v)->edge = h; }
+    void set_subPatch(const unsigned s) const { ((PatchHalfEdge*)patch_v)->subPatch = s; }
+  };
+
+  namespace isa
+  {
+    Vec3fa patchEval(const SubdivPatch1Base& patch, const float uu, const float vv);
+    Vec3fa patchNormal(const SubdivPatch1Base& patch, const float uu, const float vv);
+    
+    template<typename simdf>
+      Vec3<simdf> patchEval(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); 
+
+    template<typename simdf>
+      Vec3<simdf> patchNormal(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); 
+   
+
+    /* eval grid over patch and stich edges when required */      
+    void evalGrid(const SubdivPatch1Base& patch,
+                  const unsigned x0, const unsigned x1,
+                  const unsigned y0, const unsigned y1,
+                  const unsigned swidth, const unsigned sheight,
+                  float *__restrict__ const grid_x,
+                  float *__restrict__ const grid_y,
+                  float *__restrict__ const grid_z,
+                  float *__restrict__ const grid_u,
+                  float *__restrict__ const grid_v,
+                  const SubdivMesh* const geom);
+
+    /* eval grid over patch and stich edges when required */      
+    BBox3fa evalGridBounds(const SubdivPatch1Base& patch,
+                           const unsigned x0, const unsigned x1,
+                           const unsigned y0, const unsigned y1,
+                           const unsigned swidth, const unsigned sheight,
+                           const SubdivMesh* const geom);
+  }
+}
diff --git a/thirdparty/embree/kernels/subdiv/tessellation.h b/thirdparty/embree/kernels/subdiv/tessellation.h
new file mode 100644
index 0000000000..abde4f2bde
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/tessellation.h
@@ -0,0 +1,161 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* adjust discret tessellation level for feature-adaptive pre-subdivision */
+  __forceinline float adjustTessellationLevel(float l, const size_t sublevel)
+  {
+    for (size_t i=0; i<sublevel; i++) l *= 0.5f;
+    float r = ceilf(l);      
+    for (size_t i=0; i<sublevel; i++) r *= 2.0f;
+    return r;
+  }
+  
+  __forceinline int stitch(const int x, const int fine, const int coarse) {
+    return (2*x+1)*coarse/(2*fine);
+  }
+
+  __forceinline void stitchGridEdges(const unsigned int low_rate,
+                                     const unsigned int high_rate,
+                                     const unsigned int x0,
+                                     const unsigned int x1,
+				    float * __restrict__ const uv_array,
+				    const unsigned int uv_array_step)
+  {
+#if 1
+    const float inv_low_rate = rcp((float)(low_rate-1));
+    for (unsigned x=x0; x<=x1; x++) {
+      uv_array[(x-x0)*uv_array_step] = float(stitch(x,high_rate-1,low_rate-1))*inv_low_rate;
+    }
+    if (unlikely(x1 == high_rate-1))
+      uv_array[(x1-x0)*uv_array_step] = 1.0f;
+#else
+    assert(low_rate < high_rate);
+    assert(high_rate >= 2);
+    
+    const float inv_low_rate = rcp((float)(low_rate-1));
+    const unsigned int dy = low_rate  - 1; 
+    const unsigned int dx = high_rate - 1;
+    
+    int p = 2*dy-dx;  
+    
+    unsigned int offset = 0;
+    unsigned int y = 0;
+    float value = 0.0f;
+    for(unsigned int x=0;x<high_rate-1; x++) // '<=' would be correct but we will leave the 1.0f at the end
+    {
+      uv_array[offset] = value;
+      
+      offset += uv_array_step;      
+      if (unlikely(p > 0))
+      {
+	y++;
+	value = (float)y * inv_low_rate;
+	p -= 2*dx;
+      }
+      p += 2*dy;
+    }
+#endif
+  }
+  
+  __forceinline void stitchUVGrid(const float edge_levels[4],
+                                  const unsigned int swidth,
+                                  const unsigned int sheight,
+                                  const unsigned int x0,
+                                  const unsigned int y0,
+				  const unsigned int grid_u_res,
+				  const unsigned int grid_v_res,
+				  float * __restrict__ const u_array,
+				  float * __restrict__ const v_array)
+  {
+    const unsigned int x1 = x0+grid_u_res-1;
+    const unsigned int y1 = y0+grid_v_res-1;
+    const unsigned int int_edge_points0 = (unsigned int)edge_levels[0] + 1;
+    const unsigned int int_edge_points1 = (unsigned int)edge_levels[1] + 1;
+    const unsigned int int_edge_points2 = (unsigned int)edge_levels[2] + 1;
+    const unsigned int int_edge_points3 = (unsigned int)edge_levels[3] + 1;
+    
+    if (unlikely(y0 == 0 && int_edge_points0 < swidth))
+      stitchGridEdges(int_edge_points0,swidth,x0,x1,u_array,1);
+    
+    if (unlikely(y1 == sheight-1 && int_edge_points2 < swidth))
+      stitchGridEdges(int_edge_points2,swidth,x0,x1,&u_array[(grid_v_res-1)*grid_u_res],1);
+    
+    if (unlikely(x0 == 0 && int_edge_points1 < sheight))
+      stitchGridEdges(int_edge_points1,sheight,y0,y1,&v_array[grid_u_res-1],grid_u_res);
+    
+    if (unlikely(x1 == swidth-1 && int_edge_points3 < sheight))
+      stitchGridEdges(int_edge_points3,sheight,y0,y1,v_array,grid_u_res);  
+  }
+  
+  __forceinline void gridUVTessellator(const float edge_levels[4],  
+                                       const unsigned int swidth,
+                                       const unsigned int sheight,
+                                       const unsigned int x0,
+                                       const unsigned int y0,
+				       const unsigned int grid_u_res,
+				       const unsigned int grid_v_res,
+				       float * __restrict__ const u_array,
+				       float * __restrict__ const v_array)
+  {
+    assert( grid_u_res >= 1);
+    assert( grid_v_res >= 1);
+    assert( edge_levels[0] >= 1.0f );
+    assert( edge_levels[1] >= 1.0f );
+    assert( edge_levels[2] >= 1.0f );
+    assert( edge_levels[3] >= 1.0f );
+    
+#if defined(__AVX__)
+    const vint8 grid_u_segments = vint8(swidth)-1;
+    const vint8 grid_v_segments = vint8(sheight)-1;
+    
+    const vfloat8 inv_grid_u_segments = rcp(vfloat8(grid_u_segments));
+    const vfloat8 inv_grid_v_segments = rcp(vfloat8(grid_v_segments));
+    
+    unsigned int index = 0;
+    vint8 v_i( zero );
+    for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1)
+    {
+      vint8 u_i ( step );
+      
+      const vbool8 m_v = v_i < grid_v_segments;
+      
+      for (unsigned int x=0;x<grid_u_res;x+=8, u_i += 8)
+      {
+        const vbool8 m_u = u_i < grid_u_segments;
+	const vfloat8 u = select(m_u, vfloat8(x0+u_i) * inv_grid_u_segments, 1.0f);
+	const vfloat8 v = select(m_v, vfloat8(y0+v_i) * inv_grid_v_segments, 1.0f);
+	vfloat8::storeu(&u_array[index + x],u);
+	vfloat8::storeu(&v_array[index + x],v);	   
+      }
+    }       
+ #else   
+    const vint4 grid_u_segments = vint4(swidth)-1;
+    const vint4 grid_v_segments = vint4(sheight)-1;
+    
+    const vfloat4 inv_grid_u_segments = rcp(vfloat4(grid_u_segments));
+    const vfloat4 inv_grid_v_segments = rcp(vfloat4(grid_v_segments));
+    
+    unsigned int index = 0;
+    vint4 v_i( zero );
+    for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1)
+    {
+      vint4 u_i ( step );
+      
+      const vbool4 m_v = v_i < grid_v_segments;
+      
+      for (unsigned int x=0;x<grid_u_res;x+=4, u_i += 4)
+      {
+        const vbool4 m_u = u_i < grid_u_segments;
+	const vfloat4 u = select(m_u, vfloat4(x0+u_i) * inv_grid_u_segments, 1.0f);
+	const vfloat4 v = select(m_v, vfloat4(y0+v_i) * inv_grid_v_segments, 1.0f);
+        vfloat4::storeu(&u_array[index + x],u);
+	vfloat4::storeu(&v_array[index + x],v);	   
+      }
+    }       
+#endif
+  } 
+}
diff --git a/thirdparty/embree/kernels/subdiv/tessellation_cache.h b/thirdparty/embree/kernels/subdiv/tessellation_cache.h
new file mode 100644
index 0000000000..99edf49be4
--- /dev/null
+++ b/thirdparty/embree/kernels/subdiv/tessellation_cache.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+
+/* force a complete cache invalidation when running out of allocation space */
+#define FORCE_SIMPLE_FLUSH 0
+
+#define THREAD_BLOCK_ATOMIC_ADD 4
+
+#if defined(DEBUG)
+#define CACHE_STATS(x) 
+#else
+#define CACHE_STATS(x) 
+#endif
+
+namespace embree
+{
+  class SharedTessellationCacheStats
+  {
+  public:
+    /* stats */
+    static std::atomic<size_t> cache_accesses;
+    static std::atomic<size_t> cache_hits;
+    static std::atomic<size_t> cache_misses;
+    static std::atomic<size_t> cache_flushes;                
+    static size_t        cache_num_patches;
+    __aligned(64) static SpinLock mtx;
+    
+    /* print stats for debugging */                 
+    static void printStats();
+    static void clearStats();
+  };
+  
+  void resizeTessellationCache(size_t new_size);
+  void resetTessellationCache();
+  
+ ////////////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct __aligned(64) ThreadWorkState 
+ {
+   ALIGNED_STRUCT_(64);
+
+   std::atomic<size_t> counter;
+   ThreadWorkState* next;
+   bool allocated;
+
+   __forceinline ThreadWorkState(bool allocated = false) 
+     : counter(0), next(nullptr), allocated(allocated) 
+   {
+     assert( ((size_t)this % 64) == 0 ); 
+   }   
+ };
+
+ class __aligned(64) SharedLazyTessellationCache 
+ {
+ public:
+   
+   static const size_t NUM_CACHE_SEGMENTS              = 8;
+   static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
+   static const size_t COMMIT_INDEX_SHIFT              = 32+8;
+#if defined(__64BIT__)
+   static const size_t REF_TAG_MASK                    = 0xffffffffff;
+#else
+   static const size_t REF_TAG_MASK                    = 0x7FFFFFFF;
+#endif
+   static const size_t MAX_TESSELLATION_CACHE_SIZE     = REF_TAG_MASK+1;
+   static const size_t BLOCK_SIZE                      = 64;
+   
+
+    /*! Per thread tessellation ref cache */
+   static __thread ThreadWorkState* init_t_state;
+   static ThreadWorkState* current_t_state;
+   
+   static __forceinline ThreadWorkState *threadState() 
+   {
+     if (unlikely(!init_t_state))
+       /* sets init_t_state, can't return pointer due to macosx icc bug*/
+       SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
+     return init_t_state;
+   }
+
+   struct Tag
+   {
+     __forceinline Tag() : data(0) {}
+
+     __forceinline Tag(void* ptr, size_t combinedTime) { 
+       init(ptr,combinedTime);
+     }
+
+     __forceinline Tag(size_t ptr, size_t combinedTime) {
+       init((void*)ptr,combinedTime); 
+     }
+
+     __forceinline void init(void* ptr, size_t combinedTime)
+     {
+       if (ptr == nullptr) {
+         data = 0;
+         return;
+       }
+       int64_t new_root_ref = (int64_t) ptr;
+       new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();                                
+       assert( new_root_ref <= (int64_t)REF_TAG_MASK );
+       new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT; 
+       data = new_root_ref;
+     }
+
+     __forceinline int64_t get() const { return data.load(); }
+     __forceinline void set( int64_t v ) { data.store(v); }
+     __forceinline void reset() { data.store(0); }
+
+   private:
+     atomic<int64_t> data;
+   };
+
+   static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }
+
+   struct CacheEntry
+   {
+     Tag tag;
+     SpinLock mutex;
+   };
+
+ private:
+
+   float *data;
+   bool hugepages;
+   size_t size;
+   size_t maxBlocks;
+   ThreadWorkState *threadWorkState;
+      
+   __aligned(64) std::atomic<size_t> localTime;
+   __aligned(64) std::atomic<size_t> next_block;
+   __aligned(64) SpinLock   reset_state;
+   __aligned(64) SpinLock   linkedlist_mtx;
+   __aligned(64) std::atomic<size_t> switch_block_threshold;
+   __aligned(64) std::atomic<size_t> numRenderThreads;
+
+
+ public:
+
+      
+   SharedLazyTessellationCache();
+   ~SharedLazyTessellationCache();
+
+   void getNextRenderThreadWorkState();
+
+   __forceinline size_t maxAllocSize() const {
+     return switch_block_threshold;
+   }
+
+   __forceinline size_t getCurrentIndex() { return localTime.load(); }
+   __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }
+
+   __forceinline size_t getTime(const size_t globalTime) {
+     return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
+   }
+
+
+   __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus);  }
+   __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }
+
+   __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }
+
+   static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
+   static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
+   static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
+   static __forceinline size_t getState() { return threadState()->counter.load(); }
+   static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }
+
+   static __forceinline size_t getTCacheTime(const size_t globalTime) {
+     return sharedLazyTessellationCache.getTime(globalTime);
+   }
+
+   /* per thread lock */
+   __forceinline void lockThreadLoop (ThreadWorkState *const t_state) 
+   { 
+     while(1)
+     {
+       size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
+       if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
+       {
+         /* lock failed wait until sync phase is over */
+         sharedLazyTessellationCache.unlockThread(t_state,-1);	       
+         sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
+       }
+       else
+         break;
+     }
+   }
+
+   static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
+   {   
+     const int64_t subdiv_patch_root_ref = entry.tag.get(); 
+     CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);
+     
+     if (likely(subdiv_patch_root_ref != 0)) 
+     {
+       const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
+       const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
+       
+       if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
+       {
+         CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
+         return (void*) subdiv_patch_root;
+       }
+     }
+     CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
+     return nullptr;
+   }
+
+   template<typename Constructor>
+     static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
+   {
+     ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();
+
+     while (true)
+     {
+       sharedLazyTessellationCache.lockThreadLoop(t_state);
+       void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
+       if (patch) return (decltype(constructor())) patch;
+       
+       if (entry.mutex.try_lock())
+       {
+         if (!validTag(entry.tag,globalTime)) 
+         {
+           auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
+           auto ret = constructor(); // thread is locked here!
+           assert(ret);
+           /* this should never return nullptr */
+           auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
+           auto time = before ? timeBefore : timeAfter;
+           __memory_barrier();
+           entry.tag = SharedLazyTessellationCache::Tag(ret,time);
+           __memory_barrier();
+           entry.mutex.unlock();
+           return ret;
+         }
+         entry.mutex.unlock();
+       }
+       SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
+     }
+   }
+   
+   __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
+   {
+#if FORCE_SIMPLE_FLUSH == 1
+     return i == getTime(globalTime);
+#else
+     return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
+#endif
+   }
+
+   static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
+   {
+     return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
+   }
+
+
+    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
+    {
+      const int64_t subdiv_patch_root_ref = tag.get(); 
+      if (subdiv_patch_root_ref == 0) return false;
+      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
+      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
+    }
+
+   void waitForUsersLessEqual(ThreadWorkState *const t_state,
+			      const unsigned int users);
+    
+   __forceinline size_t alloc(const size_t blocks)
+   {
+     if (unlikely(blocks >= switch_block_threshold))
+       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");
+
+     assert(blocks < switch_block_threshold);
+     size_t index = next_block.fetch_add(blocks);
+     if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
+     return index;
+   }
+
+   static __forceinline void* malloc(const size_t bytes)
+   {
+     size_t block_index = -1;
+     ThreadWorkState *const t_state = threadState();
+     while (true)
+     {
+       block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
+       if (block_index == (size_t)-1)
+       {
+         sharedLazyTessellationCache.unlockThread(t_state);		  
+         sharedLazyTessellationCache.allocNextSegment();
+         sharedLazyTessellationCache.lockThread(t_state);
+         continue; 
+       }
+       break;
+     }
+     return sharedLazyTessellationCache.getBlockPtr(block_index);
+   }
+
+   __forceinline void *getBlockPtr(const size_t block_index)
+   {
+     assert(block_index < maxBlocks);
+     assert(data);
+     assert(block_index*16 <= size);
+     return (void*)&data[block_index*16];
+   }
+
+   __forceinline void*  getDataPtr()      { return data; }
+   __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
+   __forceinline size_t getMaxBlocks()    { return maxBlocks; }
+   __forceinline size_t getSize()         { return size; }
+
+   void allocNextSegment();
+   void realloc(const size_t newSize);
+
+   void reset();
+
+   static SharedLazyTessellationCache sharedLazyTessellationCache;
+ };
+}
diff --git a/thirdparty/embree/patches/godot-changes-android.patch b/thirdparty/embree/patches/godot-changes-android.patch
new file mode 100644
index 0000000000..a27f924bde
--- /dev/null
+++ b/thirdparty/embree/patches/godot-changes-android.patch
@@ -0,0 +1,103 @@
+diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp
+index ba97dc227b..1679599608 100644
+--- a/thirdparty/embree/common/sys/sysinfo.cpp
++++ b/thirdparty/embree/common/sys/sysinfo.cpp
+@@ -618,7 +618,10 @@ namespace embree
+     static int nThreads = -1;
+     if (nThreads != -1) return nThreads;
+ 
+-#if defined(__MACOSX__)
++// -- GODOT start --
++// #if defined(__MACOSX__)
++#if defined(__MACOSX__) || defined(__ANDROID__)
++// -- GODOT end --
+     nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container
+     assert(nThreads);
+ #else
+diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp
+index a7827e18f7..f4014be89b 100644
+--- a/thirdparty/embree/common/sys/thread.cpp
++++ b/thirdparty/embree/common/sys/thread.cpp
+@@ -158,7 +158,9 @@ namespace embree
+ /// Linux Platform
+ ////////////////////////////////////////////////////////////////////////////////
+ 
+-#if defined(__LINUX__)
++// -- GODOT start --
++#if defined(__LINUX__) && !defined(__ANDROID__)
++// -- GODOT end --
+ 
+ #include <fstream>
+ #include <sstream>
+@@ -247,6 +249,28 @@ namespace embree
+ }
+ #endif
+ 
++// -- GODOT start --
++////////////////////////////////////////////////////////////////////////////////
++/// Android Platform
++////////////////////////////////////////////////////////////////////////////////
++
++#if defined(__ANDROID__)
++
++namespace embree
++{
++  /*! set affinity of the calling thread */
++  void setAffinity(ssize_t affinity)
++  {
++    cpu_set_t cset;
++    CPU_ZERO(&cset);
++    CPU_SET(affinity, &cset);
++
++    sched_setaffinity(0, sizeof(cset), &cset);
++  }
++}
++#endif
++// -- GODOT end --
++
+ ////////////////////////////////////////////////////////////////////////////////
+ /// FreeBSD Platform
+ ////////////////////////////////////////////////////////////////////////////////
+@@ -355,7 +379,9 @@ namespace embree
+     pthread_attr_destroy(&attr);
+ 
+     /* set affinity */
+-#if defined(__LINUX__)
++// -- GODOT start --
++#if defined(__LINUX__) && !defined(__ANDROID__)
++// -- GODOT end --
+     if (threadID >= 0) {
+       cpu_set_t cset;
+       CPU_ZERO(&cset);
+@@ -370,7 +396,16 @@ namespace embree
+       CPU_SET(threadID, &cset);
+       pthread_setaffinity_np(*tid, sizeof(cset), &cset);
+     }
++// -- GODOT start --
++#elif defined(__ANDROID__)
++    if (threadID >= 0) {
++      cpu_set_t cset;
++      CPU_ZERO(&cset);
++      CPU_SET(threadID, &cset);
++      sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset);
++    }
+ #endif
++// -- GODOT end --
+ 
+     return thread_t(tid);
+   }
+@@ -389,8 +424,14 @@ namespace embree
+ 
+   /*! destroy a hardware thread by its handle */
+   void destroyThread(thread_t tid) {
++// -- GODOT start --
++#if defined(__ANDROID__)
++    FATAL("Can't destroy threads on Android.");
++#else
+     pthread_cancel(*(pthread_t*)tid);
+     delete (pthread_t*)tid;
++#endif
++// -- GODOT end --
+   }
+ 
+   /*! creates thread local storage */
diff --git a/thirdparty/embree/patches/godot-changes-misc.patch b/thirdparty/embree/patches/godot-changes-misc.patch
new file mode 100644
index 0000000000..8bf0d9fa97
--- /dev/null
+++ b/thirdparty/embree/patches/godot-changes-misc.patch
@@ -0,0 +1,105 @@
+diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h
+index 79729c87ab..ed8dd7d40a 100644
+--- a/thirdparty/embree/common/sys/intrinsics.h
++++ b/thirdparty/embree/common/sys/intrinsics.h
+@@ -34,8 +34,14 @@
+ #endif
+ 
+ #if defined(__WIN32__)
+-#  define NOMINMAX
+-#  include <windows.h>
++// -- GODOT start --
++#if !defined(NOMINMAX)
++// -- GODOT end --
++#define NOMINMAX
++// -- GODOT start --
++#endif
++#include "windows.h"
++// -- GODOT end --
+ #endif
+ 
+ /* normally defined in pmmintrin.h, but we always need this */
+diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
+index 3fc5e99b8d..697e07bb86 100644
+--- a/thirdparty/embree/common/sys/platform.h
++++ b/thirdparty/embree/common/sys/platform.h
+@@ -99,7 +99,9 @@
+ #define dll_import 
+ #endif
+ 
+-#ifdef __WIN32__
++// -- GODOT start --
++#if defined(__WIN32__) && !defined(__MINGW32__)
++// -- GODOT end --
+ #if !defined(__noinline)
+ #define __noinline             __declspec(noinline)
+ #endif
+@@ -149,6 +151,9 @@
+   #define DELETED  = delete
+ #endif
+ 
++// -- GODOT start --
++#if !defined(likely)
++// -- GODOT end --
+ #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+ #define   likely(expr) (expr)
+ #define unlikely(expr) (expr)
+@@ -156,6 +161,9 @@
+ #define   likely(expr) __builtin_expect((bool)(expr),true )
+ #define unlikely(expr) __builtin_expect((bool)(expr),false)
+ #endif
++// -- GODOT start --
++#endif
++// -- GODOT end --
+ 
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Error handling and debugging
+diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp
+index ba97dc227b..f1a59e511e 100644
+--- a/thirdparty/embree/common/sys/sysinfo.cpp
++++ b/thirdparty/embree/common/sys/sysinfo.cpp
+@@ -248,7 +248,9 @@ namespace embree
+ #if defined(__X86_ASM__)
+   __noinline int64_t get_xcr0() 
+   {
+-#if defined (__WIN32__)
++// -- GODOT start --
++#if defined (__WIN32__) && !defined (__MINGW32__)
++// -- GODOT end --
+     int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
+     xcr0 = _xgetbv(0);
+     return xcr0;
+diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h
+index 9c14b28745..4857e1e05e 100644
+--- a/thirdparty/embree/include/embree3/rtcore_common.h
++++ b/thirdparty/embree/include/embree3/rtcore_common.h
+@@ -19,7 +19,9 @@ typedef int ssize_t;
+ #endif
+ #endif
+ 
+-#ifdef _WIN32
++// -- GODOT start --
++#if defined(_WIN32) && defined(_MSC_VER)
++// -- GODOT end --
+ #  define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
+ #else
+ #  define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
+diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h
+index 3fd15816e9..35bd49849f 100644
+--- a/thirdparty/embree/common/tasking/taskschedulertbb.h
++++ b/thirdparty/embree/common/tasking/taskschedulertbb.h
+@@ -12,7 +12,13 @@
+ #include "../sys/ref.h"
+ 
+ #if defined(__WIN32__)
++// -- GODOT start --
++#if !defined(NOMINMAX)
++// -- GODOT end --
+ #  define NOMINMAX
++// -- GODOT start --
++#endif
++// -- GODOT end --
+ #endif
+ 
+ // We need to define these to avoid implicit linkage against
+ 
+\ No newline at end of file
diff --git a/thirdparty/embree/patches/godot-changes-noexcept.patch b/thirdparty/embree/patches/godot-changes-noexcept.patch
new file mode 100644
index 0000000000..c587a0e2be
--- /dev/null
+++ b/thirdparty/embree/patches/godot-changes-noexcept.patch
@@ -0,0 +1,630 @@
+diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
+index f052d8b468..645681ac63 100644
+--- a/thirdparty/embree/common/algorithms/parallel_for.h
++++ b/thirdparty/embree/common/algorithms/parallel_for.h
+@@ -21,7 +21,10 @@ namespace embree
+           func(r.begin());
+         });
+       if (!TaskScheduler::wait())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort();
++        // -- GODOT end --
+     }
+     
+ #elif defined(TASKING_TBB)
+@@ -31,13 +34,19 @@ namespace embree
+         func(i);
+       },context);
+     if (context.is_group_execution_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort();
++      // -- GODOT end --
+   #else
+     tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+         func(i);
+       });
+     if (tbb::task::self().is_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort();
++      // -- GODOT end --
+   #endif
+ 
+ #elif defined(TASKING_PPL)
+@@ -57,7 +66,10 @@ namespace embree
+ #if defined(TASKING_INTERNAL)
+     TaskScheduler::spawn(first,last,minStepSize,func);
+     if (!TaskScheduler::wait())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort();
++      // -- GODOT end --
+ 
+ #elif defined(TASKING_TBB)
+   #if TBB_INTERFACE_VERSION >= 12002
+@@ -66,13 +78,19 @@ namespace embree
+         func(range<Index>(r.begin(),r.end()));
+       },context);
+     if (context.is_group_execution_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort();
++      // -- GODOT end --
+   #else
+     tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+         func(range<Index>(r.begin(),r.end()));
+       });
+     if (tbb::task::self().is_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort();
++      // -- GODOT end --
+   #endif
+ 
+ #elif defined(TASKING_PPL)
+@@ -104,13 +122,19 @@ namespace embree
+           func(i);
+         },tbb::simple_partitioner(),context);
+       if (context.is_group_execution_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort();
++        // -- GODOT end --
+     #else
+       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+           func(i);
+         },tbb::simple_partitioner());
+       if (tbb::task::self().is_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort();
++        // -- GODOT end --
+     #endif
+   }
+ 
+@@ -125,13 +149,19 @@ namespace embree
+           func(i);
+         },ap,context);
+       if (context.is_group_execution_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort();
++        // -- GODOT end --
+     #else
+       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+           func(i);
+         },ap);
+       if (tbb::task::self().is_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort();
++        // -- GODOT end --
+     #endif
+   }
+ 
+diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
+index f42ae2ec50..8271372ea4 100644
+--- a/thirdparty/embree/common/algorithms/parallel_reduce.h
++++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
+@@ -58,15 +58,19 @@ namespace embree
+     const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+       [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+       reduction,context);
+-    if (context.is_group_execution_cancelled())
+-      throw std::runtime_error("task cancelled");
++    // -- GODOT start --
++    // if (context.is_group_execution_cancelled())
++    //   throw std::runtime_error("task cancelled");
++    // -- GODOT end --
+     return v;
+   #else
+     const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+       [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+       reduction);
+-    if (tbb::task::self().is_cancelled())
+-      throw std::runtime_error("task cancelled");
++    // -- GODOT start --
++    // if (tbb::task::self().is_cancelled())
++    //   throw std::runtime_error("task cancelled");
++    // -- GODOT end --
+     return v;
+   #endif
+ #else // TASKING_PPL
+diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp
+index 42ffb10176..a037869506 100644
+--- a/thirdparty/embree/common/lexers/stringstream.cpp
++++ b/thirdparty/embree/common/lexers/stringstream.cpp
+@@ -39,7 +39,10 @@ namespace embree
+     std::vector<char> str; str.reserve(64);
+     while (cin->peek() != EOF && !isSeparator(cin->peek())) {
+       int c = cin->get();
+-      if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
++      // -- GODOT start --
++      // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
++      if (!isValidChar(c)) abort();
++      // -- GODOT end --
+       str.push_back((char)c);
+     }
+     str.push_back(0);
+diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp
+index 1bc30fe9a5..abdd269069 100644
+--- a/thirdparty/embree/common/sys/alloc.cpp
++++ b/thirdparty/embree/common/sys/alloc.cpp
+@@ -21,7 +21,10 @@ namespace embree
+     void* ptr = _mm_malloc(size,align);
+ 
+     if (size != 0 && ptr == nullptr)
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+     
+     return ptr;
+   }
+@@ -128,7 +131,10 @@ namespace embree
+     /* fall back to 4k pages */
+     int flags = MEM_COMMIT | MEM_RESERVE;
+     char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
+-    if (ptr == nullptr) throw std::bad_alloc();
++    // -- GODOT start --
++    // if (ptr == nullptr) throw std::bad_alloc();
++    if (ptr == nullptr) abort();
++    // -- GODOT end --
+     hugepages = false;
+     return ptr;
+   }
+@@ -145,7 +151,10 @@ namespace embree
+       return bytesOld;
+ 
+     if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+ 
+     return bytesNew;
+   }
+@@ -156,7 +165,10 @@ namespace embree
+       return;
+ 
+     if (!VirtualFree(ptr,0,MEM_RELEASE))
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+   }
+ 
+   void os_advise(void *ptr, size_t bytes)
+@@ -260,7 +272,10 @@ namespace embree
+ 
+     /* fallback to 4k pages */
+     void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+-    if (ptr == MAP_FAILED) throw std::bad_alloc();
++    // -- GODOT start --
++    // if (ptr == MAP_FAILED) throw std::bad_alloc();
++    if (ptr == MAP_FAILED) abort();
++    // -- GODOT end --
+     hugepages = false;
+ 
+     /* advise huge page hint for THP */
+@@ -277,7 +292,10 @@ namespace embree
+       return bytesOld;
+ 
+     if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+ 
+     return bytesNew;
+   }
+@@ -291,7 +309,10 @@ namespace embree
+     const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+     bytes = (bytes+pageSize-1) & ~(pageSize-1);
+     if (munmap(ptr,bytes) == -1)
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+   }
+ 
+   /* hint for transparent huge pages (THP) */
+diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
+index 8a6d9fa0a9..697e07bb86 100644
+--- a/thirdparty/embree/common/sys/platform.h
++++ b/thirdparty/embree/common/sys/platform.h
+@@ -179,11 +179,19 @@
+ #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+ 
+ #if defined(DEBUG) // only report file and line in debug mode
++  // -- GODOT start --
++  // #define THROW_RUNTIME_ERROR(str)
++  //   throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+   #define THROW_RUNTIME_ERROR(str) \
+-    throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
++    printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
++  // -- GODOT end --
+ #else
++  // -- GODOT start --
++  // #define THROW_RUNTIME_ERROR(str)
++  //   throw std::runtime_error(str);
+   #define THROW_RUNTIME_ERROR(str) \
+-    throw std::runtime_error(str);
++    abort();
++  // -- GODOT end --
+ #endif
+ 
+ #define FATAL(x)   THROW_RUNTIME_ERROR(x)
+diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
+index dca835a716..ad438588a3 100644
+--- a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
++++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
+@@ -48,13 +48,15 @@ namespace embree
+     {
+       Task* prevTask = thread.task;
+       thread.task = this;
+-      try {
+-        if (thread.scheduler->cancellingException == nullptr)
++      // -- GODOT start --
++      // try {
++      // if (thread.scheduler->cancellingException == nullptr)
+           closure->execute();
+-      } catch (...) {
+-        if (thread.scheduler->cancellingException == nullptr)
+-          thread.scheduler->cancellingException = std::current_exception();
+-      }
++      // } catch (...) {
++      //   if (thread.scheduler->cancellingException == nullptr)
++      //     thread.scheduler->cancellingException = std::current_exception();
++      // }
++      // -- GODOT end --
+       thread.task = prevTask;
+       add_dependencies(-1);
+     }
+@@ -291,8 +293,11 @@ namespace embree
+     size_t threadIndex = allocThreadIndex();
+     condition.wait(mutex, [&] () { return hasRootTask.load(); });
+     mutex.unlock();
+-    std::exception_ptr except = thread_loop(threadIndex);
+-    if (except != nullptr) std::rethrow_exception(except);
++    // -- GODOT start --
++    // std::exception_ptr except = thread_loop(threadIndex);
++    // if (except != nullptr) std::rethrow_exception(except);
++    thread_loop(threadIndex);
++    // -- GODOT end --
+   }
+ 
+   void TaskScheduler::reset() {
+@@ -324,7 +329,10 @@ namespace embree
+     return thread->scheduler->cancellingException == nullptr;
+   }
+ 
+-  std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
++// -- GODOT start --
++//   std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
++  void TaskScheduler::thread_loop(size_t threadIndex)
++// -- GODOT end --
+   {
+     /* allocate thread structure */
+     std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+@@ -347,9 +355,10 @@ namespace embree
+     swapThread(oldThread);
+ 
+     /* remember exception to throw */
+-    std::exception_ptr except = nullptr;
+-    if (cancellingException != nullptr) except = cancellingException;
+-
++    // -- GODOT start --
++    // std::exception_ptr except = nullptr;
++    // if (cancellingException != nullptr) except = cancellingException;
++    // -- GODOT end --
+     /* wait for all threads to terminate */
+     threadCounter--;
+ #if defined(__WIN32__)
+@@ -367,7 +376,10 @@ namespace embree
+           yield();
+ #endif
+ 	}
+-    return except;
++     // -- GODOT start --
++     // return except;
++     return;
++     // -- GODOT end --
+   }
+ 
+   bool TaskScheduler::steal_from_other_threads(Thread& thread)
+diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h
+index c766a0bb6a..8fa6bb12fa 100644
+--- a/thirdparty/embree/common/tasking/taskschedulerinternal.h
++++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h
+@@ -123,7 +123,10 @@ namespace embree
+       {
+         size_t ofs = bytes + ((align - stackPtr) & (align-1));
+         if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+-          throw std::runtime_error("closure stack overflow");
++          // -- GODOT start --
++          // throw std::runtime_error("closure stack overflow");
++          abort();
++          // -- GODOT end --
+         stackPtr += ofs;
+         return &stack[stackPtr-bytes];
+       }
+@@ -132,7 +135,10 @@ namespace embree
+       __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
+       {
+         if (right >= TASK_STACK_SIZE)
+-          throw std::runtime_error("task stack overflow");
++           // -- GODOT start --
++           // throw std::runtime_error("task stack overflow");
++           abort();
++           // -- GODOT end --
+ 
+ 	/* allocate new task on right side of stack */
+         size_t oldStackPtr = stackPtr;
+@@ -238,7 +244,10 @@ namespace embree
+     void wait_for_threads(size_t threadCount);
+ 
+     /*! thread loop for all worker threads */
+-    std::exception_ptr thread_loop(size_t threadIndex);
++    // -- GODOT start --
++    // std::exception_ptr thread_loop(size_t threadIndex);
++    void thread_loop(size_t threadIndex);
++    // -- GODOT end --
+ 
+     /*! steals a task from a different thread */
+     bool steal_from_other_threads(Thread& thread);
+diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
+index d8da78eed7..d857ff7d95 100644
+--- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
++++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
+@@ -150,7 +150,10 @@ namespace embree
+       }
+     }
+     else {
+-      throw std::runtime_error("not supported node type in bvh_statistics");
++      // -- GODOT start --
++      // throw std::runtime_error("not supported node type in bvh_statistics");
++      abort();
++      // -- GODOT end --
+     }
+     return s;
+   } 
+diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp
+index 74e9fb335c..94b3819e42 100644
+--- a/thirdparty/embree/kernels/common/rtcore.cpp
++++ b/thirdparty/embree/kernels/common/rtcore.cpp
+@@ -197,7 +197,10 @@ RTC_NAMESPACE_BEGIN;
+     if (quality != RTC_BUILD_QUALITY_LOW &&
+         quality != RTC_BUILD_QUALITY_MEDIUM &&
+         quality != RTC_BUILD_QUALITY_HIGH)
+-      throw std::runtime_error("invalid build quality");
++      // -- GODOT start --
++      // throw std::runtime_error("invalid build quality");
++      abort();
++      // -- GODOT end --
+     scene->setBuildQuality(quality);
+     RTC_CATCH_END2(scene);
+   }
+@@ -1350,7 +1353,10 @@ RTC_NAMESPACE_BEGIN;
+         quality != RTC_BUILD_QUALITY_MEDIUM &&
+         quality != RTC_BUILD_QUALITY_HIGH &&
+         quality != RTC_BUILD_QUALITY_REFIT)
+-      throw std::runtime_error("invalid build quality");
++      // -- GODOT start --
++      // throw std::runtime_error("invalid build quality");
++      abort();
++      // -- GODOT end --
+     geometry->setBuildQuality(quality);
+     RTC_CATCH_END2(geometry);
+   }
+diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h
+index 4e4b24e9c2..373e49a689 100644
+--- a/thirdparty/embree/kernels/common/rtcore.h
++++ b/thirdparty/embree/kernels/common/rtcore.h
+@@ -25,52 +25,58 @@ namespace embree
+ #endif
+ 
+ /*! Macros used in the rtcore API implementation */
+-#define RTC_CATCH_BEGIN try {
++// -- GODOT start --
++// #define RTC_CATCH_BEGIN try {
++#define RTC_CATCH_BEGIN
+   
+-#define RTC_CATCH_END(device)                                                \
+-  } catch (std::bad_alloc&) {                                                   \
+-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+-  } catch (rtcore_error& e) {                                                   \
+-    Device::process_error(device,e.error,e.what());                             \
+-  } catch (std::exception& e) {                                                 \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+-  } catch (...) {                                                               \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+-  }
++// #define RTC_CATCH_END(device)                                                \
++//   } catch (std::bad_alloc&) {                                                   \
++//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
++//   } catch (rtcore_error& e) {                                                   \
++//     Device::process_error(device,e.error,e.what());                             \
++//   } catch (std::exception& e) {                                                 \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
++//   } catch (...) {                                                               \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
++//   }
++#define RTC_CATCH_END(device)
+   
+-#define RTC_CATCH_END2(scene)                                                \
+-  } catch (std::bad_alloc&) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+-  } catch (rtcore_error& e) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,e.error,e.what());                             \
+-  } catch (std::exception& e) {                                                 \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+-  } catch (...) {                                                               \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+-  }
++// #define RTC_CATCH_END2(scene)                                                \
++//   } catch (std::bad_alloc&) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
++//   } catch (rtcore_error& e) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,e.error,e.what());                             \
++//   } catch (std::exception& e) {                                                 \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
++//   } catch (...) {                                                               \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
++//   }
++#define RTC_CATCH_END2(scene)
+ 
+-#define RTC_CATCH_END2_FALSE(scene)                                             \
+-  } catch (std::bad_alloc&) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+-    return false;                                                               \
+-  } catch (rtcore_error& e) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,e.error,e.what());                             \
+-    return false;                                                               \
+-  } catch (std::exception& e) {                                                 \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+-    return false;                                                               \
+-  } catch (...) {                                                               \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+-    return false;                                                               \
+-  }
++// #define RTC_CATCH_END2_FALSE(scene)                                             \
++//   } catch (std::bad_alloc&) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
++//     return false;                                                               \
++//   } catch (rtcore_error& e) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,e.error,e.what());                             \
++//     return false;                                                               \
++//   } catch (std::exception& e) {                                                 \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
++//     return false;                                                               \
++//   } catch (...) {                                                               \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
++//     return false;                                                               \
++//   }
++#define RTC_CATCH_END2_FALSE(scene) return false;
++// -- GODOT end --
+ 
+ #define RTC_VERIFY_HANDLE(handle)                               \
+   if (handle == nullptr) {                                         \
+@@ -97,28 +103,38 @@ namespace embree
+ #define RTC_TRACE(x) 
+ #endif
+ 
+-  /*! used to throw embree API errors */
+-  struct rtcore_error : public std::exception
+-  {
+-    __forceinline rtcore_error(RTCError error, const std::string& str)
+-      : error(error), str(str) {}
+-    
+-    ~rtcore_error() throw() {}
+-    
+-    const char* what () const throw () {
+-      return str.c_str();
+-    }
+-    
+-    RTCError error;
+-    std::string str;
+-  };
++// -- GODOT begin --
++//   /*! used to throw embree API errors */
++//   struct rtcore_error : public std::exception
++//   {
++//     __forceinline rtcore_error(RTCError error, const std::string& str)
++//       : error(error), str(str) {}
++//     
++//     ~rtcore_error() throw() {}
++//     
++//     const char* what () const throw () {
++//       return str.c_str();
++//     }
++//     
++//     RTCError error;
++//     std::string str;
++//   };
++// -- GODOT end --
+ 
+ #if defined(DEBUG) // only report file and line in debug mode
++  // -- GODOT begin --
++  // #define throw_RTCError(error,str) \
++  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+   #define throw_RTCError(error,str) \
+-    throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
++    printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
++  // -- GODOT end --
+ #else
++  // -- GODOT begin --
++  // #define throw_RTCError(error,str) \
++  //   throw rtcore_error(error,str);
+   #define throw_RTCError(error,str) \
+-    throw rtcore_error(error,str);
++    abort();
++  // -- GODOT end --
+ #endif
+ 
+ #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
+diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp
+index 0149055f2c..408d7eae6f 100644
+--- a/thirdparty/embree/kernels/common/scene.cpp
++++ b/thirdparty/embree/kernels/common/scene.cpp
+@@ -792,16 +792,18 @@ namespace embree
+     }
+ 
+     /* initiate build */
+-    try {
++    // -- GODOT start --
++    // try {
+       scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+-    }
+-    catch (...) {
+-      accels_clear();
+-      updateInterface();
+-      Lock<MutexSys> lock(schedulerMutex);
+-      this->scheduler = nullptr;
+-      throw;
+-    }
++    // }
++    // catch (...) {
++    //   accels_clear();
++    //   updateInterface();
++    //   Lock<MutexSys> lock(schedulerMutex);
++    //   this->scheduler = nullptr;
++    //   throw;
++    // }
++    // -- GODOT end --
+   }
+ 
+ #endif
diff --git a/thirdparty/embree/patches/godot-changes-ubsan.patch b/thirdparty/embree/patches/godot-changes-ubsan.patch
new file mode 100644
index 0000000000..1336246f0d
--- /dev/null
+++ b/thirdparty/embree/patches/godot-changes-ubsan.patch
@@ -0,0 +1,24 @@
+diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp
+index bb4fc81dfe..d279dc4993 100644
+--- a/thirdparty/embree/kernels/builders/primrefgen.cpp
++++ b/thirdparty/embree/kernels/builders/primrefgen.cpp
+@@ -184,6 +184,9 @@ namespace embree
+ 
+     // special variants for grid meshes
+ 
++// -- GODOT start --
++#if defined(EMBREE_GEOMETRY_GRID)
++// -- GODOT end --
+     PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids)
+     {
+       PrimInfo pinfo(empty);
+@@ -293,6 +296,9 @@ namespace embree
+ 
+       return pinfo;
+     }
++// -- GODOT start --
++#endif
++// -- GODOT end --
+     
+     // ====================================================================================================
+     // ====================================================================================================