6 files changed, 991 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/common/tasking/taskscheduler.h b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h
new file mode 100644
index 0000000000..9940e068d0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h
@@ -0,0 +1,17 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(TASKING_INTERNAL)
+#  include "taskschedulerinternal.h"
+#elif defined(TASKING_GCD) && defined(BUILD_IOS)
+#  include "taskschedulergcd.h"
+#elif defined(TASKING_TBB)
+#  include "taskschedulertbb.h"
+#elif defined(TASKING_PPL)
+#  include "taskschedulerppl.h"
+#else
+#  error "no tasking system enabled"
+#endif
+
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h
new file mode 100644
index 0000000000..d31f8bb478
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#include <dispatch/dispatch.h>
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler (only declared here; the definition is not part of this header) */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again; this backend has nothing to tear down */
+    static void destroy() {}
+
+    /* returns the ID of the current thread; simply forwards to threadIndex() */
+    static __forceinline size_t threadID()
+    {
+      return threadIndex();
+    }
+
+    /* returns an index in 0..threadCount-1; every call advances a shared round-robin counter, so the value is not stable per thread */
+    static __forceinline size_t threadIndex()
+    {
+        currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads; // NOTE(review): plain static updated without synchronization, and a zero GCDNumThreads would divide by zero -- confirm create() always runs first
+        return currentThreadIndex;
+    }
+
+    /* returns the total number of threads */
+    static __forceinline size_t threadCount()
+    {
+        return GCDNumThreads;
+    }
+
+    private:
+      static size_t GCDNumThreads;      // modulus for threadIndex() and result of threadCount(); presumably set by create() -- TODO confirm
+      static size_t currentThreadIndex; // last value handed out by threadIndex()
+
+  };
+
+};
+
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
new file mode 100644
index 0000000000..ebf656d1a0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
@@ -0,0 +1,426 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "taskschedulerinternal.h"
+#include "../math/math.h"
+#include "../sys/sysinfo.h"
+#include <algorithm>
+
+namespace embree
+{
+  RTC_NAMESPACE_BEGIN
+  
+  static MutexSys g_mutex; // taken by ThreadPool::setNumThreads() and while instance() creates a scheduler
+  size_t TaskScheduler::g_numThreads = 0; // not referenced by any other code visible in this file
+  __thread TaskScheduler* TaskScheduler::g_instance = nullptr; // per-thread scheduler, created lazily by instance()
+  std::vector<Ref<TaskScheduler>> g_instance_vector; // holds a reference to every scheduler created by instance()
+  __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr; // read by thread(), replaced by swapThread()
+  TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr; // allocated in create(), deleted in destroy()
+
+  template<typename Predicate, typename Body>
+  __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body)
+  { // keeps trying to steal work for `thread` while pred() holds; body() is invoked after every successful steal
+    while (true) // the only way out is pred() turning false below
+    {
+      /*! some rounds that yield */
+      for (size_t i=0; i<32; i++)
+      {
+        /*! some spinning rounds */
+        const size_t threadCount = thread.threadCount();
+        for (size_t j=0; j<1024; j+=threadCount) // the step grows with the registered thread count, so each thread spins fewer rounds
+        {
+          if (!pred()) return;
+          if (thread.scheduler->steal_from_other_threads(thread)) {
+            i=j=0; // a successful steal resets both the spin and the yield counters
+            body();
+          }
+        }
+        yield(); // spinning found nothing: yield before the next round
+      }
+    }
+  }
+
+  /*! run this task: execute its closure unless it was already taken, keep stealing until all dependencies are gone, then notify the parent */
+  void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible
+  {
+    /* try to run if not already stolen */
+    if (try_switch_state(INITIALIZED,DONE))
+    {
+      Task* prevTask = thread.task;
+      thread.task = this; // tasks pushed while the closure runs get thread.task, i.e. this task, as their parent
+      // -- GODOT start --
+      // try {
+      // if (thread.scheduler->cancellingException == nullptr)
+          closure->execute();
+      // } catch (...) {
+      //   if (thread.scheduler->cancellingException == nullptr)
+      //     thread.scheduler->cancellingException = std::current_exception();
+      // }
+      // -- GODOT end --
+      thread.task = prevTask;
+      add_dependencies(-1); // drops the dependency of 1 that the Task constructors start with
+    }
+
+    /* steal until all dependencies have completed */
+    steal_loop(thread,
+               [&] () { return dependencies>0; },
+               [&] () { while (thread.tasks.execute_local_internal(thread,this)); });
+
+    /* now signal our parent task that we are finished */
+    if (parent)
+      parent->add_dependencies(-1);
+  }
+
+    /*! run this task; exported (dll_export) wrapper that forwards to run_internal() */
+  dll_export void TaskScheduler::Task::run (Thread& thread) {
+    run_internal(thread);
+  }
+
+  bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent)
+  { // runs and pops the task at the right end of this queue; returns false if nothing was run or the queue is empty afterwards
+    /* stop if we run out of local tasks or reach the waiting task */
+    if (right == 0 || &tasks[right-1] == parent)
+      return false;
+
+    /* execute task */
+    size_t oldRight = right;
+    tasks[right-1].run_internal(thread);
+    if (right != oldRight) { // the task left entries on the stack, i.e. it spawned subtasks without waiting for them
+      THROW_RUNTIME_ERROR("you have to wait for spawned subtasks");
+    }
+
+    /* pop task and closure from stack */
+    right--;
+    if (tasks[right].stackPtr != size_t(-1)) // size_t(-1) is what the stolen-task constructor stores: no closure memory to release
+      stackPtr = tasks[right].stackPtr;
+
+    /* also move left pointer */
+    if (left >= right) left.store(right.load());
+
+    return right != 0;
+  }
+
+  dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) { // exported wrapper; same result as execute_local_internal()
+    return execute_local_internal(thread,parent);
+  }
+
+  bool TaskScheduler::TaskQueue::steal(Thread& thread) // `thread` is the thief; the queue this is called on is the victim's
+  {
+    size_t l = left;
+    size_t r = right;
+    if (l < r)
+    {
+      l = left++; // claim a slot: l receives the value of left before the increment
+       if (l >= r) // the claimed slot is not below the value of right read above
+         return false;
+    }
+    else
+      return false;
+
+    if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right])) // on success the stolen task is constructed at the top of the thief's own stack
+      return false;
+
+    thread.tasks.right++; // account for the task that try_steal() just constructed
+    return true;
+  }
+
+  /* size hint (Task::N) of the task at the left end, which is where thieves steal from; 0 when left >= right */
+  size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft()
+  {
+    const bool hasTask = left < right;
+    return hasTask ? tasks[left].N : 0;
+  }
+
+  void threadPoolFunction(std::pair<TaskScheduler::ThreadPool*,size_t>* pair)
+  {
+    const size_t workerIndex = pair->second;
+    TaskScheduler::ThreadPool* const owner = pair->first;
+    delete pair; /* the argument pair was heap-allocated by the launcher; free it before entering the worker loop */
+    owner->thread_loop(workerIndex);
+  }
+
+  TaskScheduler::ThreadPool::ThreadPool(bool set_affinity) // creates an idle pool: no threads exist until setNumThreads() starts them
+    : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {}
+
+  dll_export void TaskScheduler::ThreadPool::startThreads()
+  {
+    /* start the workers on first use; once the pool is running this is a no-op */
+    if (!running) setNumThreads(numThreads,true);
+  }
+
+  void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads)
+  {
+    /* Resizes the pool to min(newNumThreads, logical threads). Worker threads are only
+       created or joined once the pool is running, i.e. when startThreads is true or an
+       earlier call already started it; otherwise only the requested size is recorded. */
+    Lock<MutexSys> lock(g_mutex);
+    assert(newNumThreads);
+    newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads());
+
+    /* NOTE: an aarch64/iOS branch used to double numThreads here, but its result was
+       always overwritten by this very assignment, and workers whose index is not below
+       numThreadsRunning leave thread_loop() at once, so the dead branch was removed. */
+    numThreads = newNumThreads;
+    if (!startThreads && !running) return;
+    running = true;
+    size_t numThreadsActive = numThreadsRunning; // worker count before this call
+
+    mutex.lock();
+    numThreadsRunning = newNumThreads;
+    mutex.unlock();
+    condition.notify_all(); // wake sleeping workers so they re-check their index against the new count
+
+    /* start new threads */
+    for (size_t t=numThreadsActive; t<numThreads; t++)
+    {
+      if (t == 0) continue; // index 0 never gets a pool thread
+      auto pair = new std::pair<TaskScheduler::ThreadPool*,size_t>(this,t); // deleted by threadPoolFunction()
+      threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1));
+    }
+
+    /* stop some threads if we reduce the number of threads */
+    for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) {
+      if (t == 0) continue; // no pool thread was created for index 0
+      embree::join(threads.back());
+      threads.pop_back();
+    }
+  }
+
+  TaskScheduler::ThreadPool::~ThreadPool()
+  {
+    /* a running count of zero makes every worker leave its loop */
+    {
+      Lock<MutexSys> guard(mutex);
+      numThreadsRunning = 0;
+    }
+    condition.notify_all();
+
+    /* block until each worker thread has terminated */
+    for (thread_t worker : threads) embree::join(worker);
+  }
+
+  dll_export void TaskScheduler::ThreadPool::add(const Ref<TaskScheduler>& scheduler)
+  {
+    { /* append the scheduler while holding the pool mutex ... */
+      Lock<MutexSys> guard(mutex);
+      schedulers.push_back(scheduler);
+    }
+    condition.notify_all(); /* ... then wake the workers waiting for one */
+  }
+
+  dll_export void TaskScheduler::ThreadPool::remove(const Ref<TaskScheduler>& scheduler)
+  {
+    Lock<MutexSys> guard(mutex);
+    /* unregister only the first entry that compares equal; nothing happens if there is none */
+    auto pos = schedulers.begin();
+    while (pos != schedulers.end() && !(scheduler == *pos))
+      ++pos;
+    if (pos != schedulers.end())
+      schedulers.erase(pos);
+  }
+
+  void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex)
+  { // body of every pool thread: repeatedly picks a registered scheduler and works for it
+    while (globalThreadIndex < numThreadsRunning) // the worker exits once the running count no longer covers its index
+    {
+      Ref<TaskScheduler> scheduler = NULL;
+      ssize_t threadIndex = -1;
+      {
+        Lock<MutexSys> lock(mutex);
+        condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); }); // sleep until asked to exit or a scheduler is registered
+        if (globalThreadIndex >= numThreadsRunning) break;
+        scheduler = schedulers.front(); // always the first entry of the list
+        threadIndex = scheduler->allocThreadIndex();
+      }
+      scheduler->thread_loop(threadIndex); // called after the lock scope above has ended
+    }
+  }
+
+  TaskScheduler::TaskScheduler()
+    : threadCounter(0), anyTasksRunning(0), hasRootTask(false)
+  {
+    /* FIXME: 2x slots are needed because in the compatibility join mode with rtcCommitScene the worker threads also join; once rtcCommitScene may no longer join a build the 2x can be removed. */
+    threadLocal.resize(2*getNumberOfLogicalThreads());
+    for (auto& slot : threadLocal) slot.store(nullptr);
+  }
+
+  TaskScheduler::~TaskScheduler()
+  {
+    assert(threadCounter.load() == 0); /* no thread may still hold an index of this scheduler */
+  }
+
+  dll_export size_t TaskScheduler::threadID()
+  {
+    /* index of the calling thread's Thread structure, or 0 if it has none */
+    const Thread* const current = TaskScheduler::thread();
+    return current ? current->threadIndex : 0;
+  }
+
+  dll_export size_t TaskScheduler::threadIndex()
+  {
+    /* index of the calling thread's Thread structure, or 0 if it has none */
+    const Thread* const current = TaskScheduler::thread();
+    return current ? current->threadIndex : 0;
+  }
+
+  dll_export size_t TaskScheduler::threadCount() { // size of the global pool; dereferences threadPool, so create() must have been called
+    return threadPool->size();
+  }
+
+  dll_export TaskScheduler* TaskScheduler::instance()
+  {
+    if (g_instance != nullptr) return g_instance;
+    /* first call on this thread: create its scheduler and keep it referenced through the global vector, guarded by g_mutex */
+    Lock<MutexSys> guard(g_mutex);
+    g_instance = new TaskScheduler;
+    g_instance_vector.push_back(g_instance);
+    return g_instance;
+  }
+
+  void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads)
+  {
+    if (threadPool == nullptr) threadPool = new ThreadPool(set_affinity); /* the pool is built only once; set_affinity is ignored when it already exists */
+    threadPool->setNumThreads(numThreads,start_threads);
+  }
+
+  void TaskScheduler::destroy() { // deletes the global pool (its destructor joins the worker threads) and clears the pointer
+    delete threadPool; threadPool = nullptr;
+  }
+
+  dll_export ssize_t TaskScheduler::allocThreadIndex()
+  {
+    const size_t slot = threadCounter.fetch_add(1); /* atomically take the next index; the counter is the number of registered threads */
+    assert(slot < threadLocal.size());
+    return slot;
+  }
+
+  void TaskScheduler::join()
+  { // registers the calling thread with this scheduler, waits for a root task, then runs the worker loop
+    mutex.lock();
+    size_t threadIndex = allocThreadIndex();
+    condition.wait(mutex, [&] () { return hasRootTask.load(); }); // hasRootTask is set and signalled by spawn_root()
+    mutex.unlock();
+    // -- GODOT start --
+    // std::exception_ptr except = thread_loop(threadIndex);
+    // if (except != nullptr) std::rethrow_exception(except);
+    thread_loop(threadIndex);
+    // -- GODOT end --
+  }
+
+  void TaskScheduler::reset() { // clears the flag join() waits on, so later join() calls block until the next spawn_root()
+    hasRootTask = false;
+  }
+
+  void TaskScheduler::wait_for_threads(size_t threadCount)
+  {
+    const size_t required = threadCount-1; /* busy-wait until the registered-thread counter reaches threadCount-1 */
+    while (threadCounter < required) pause_cpu();
+  }
+
+  dll_export TaskScheduler::Thread* TaskScheduler::thread() { // thread-local Thread of the caller; nullptr unless one was installed via swapThread()
+    return thread_local_thread;
+  }
+
+  dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread)
+  {
+    Thread* const previous = thread_local_thread; /* remembered so the caller can restore it later */
+    thread_local_thread = thread;
+    return previous;
+  }
+
+  dll_export bool TaskScheduler::wait()
+  {
+    Thread* const self = TaskScheduler::thread();
+    if (!self) return true; /* the caller has no Thread structure: nothing to wait for */
+    while (self->tasks.execute_local_internal(*self,self->task)) { /* run local tasks until the current task is reached or the stack is empty */ }
+    return self->scheduler->cancellingException == nullptr;
+  }
+
+// -- GODOT start --
+//   std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
+  void TaskScheduler::thread_loop(size_t threadIndex)
+// -- GODOT end --
+  {
+    /* allocate thread structure and publish it in slot threadIndex so that other threads can steal from it */
+    std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+    Thread& thread = *mthread;
+    threadLocal[threadIndex].store(&thread);
+    Thread* oldThread = swapThread(&thread);
+
+    /* main thread loop: keep stealing while any task of this scheduler is running */
+    while (anyTasksRunning)
+    {
+      steal_loop(thread,
+                 [&] () { return anyTasksRunning > 0; },
+                 [&] () {
+                   anyTasksRunning++;
+                   while (thread.tasks.execute_local_internal(thread,nullptr));
+                   anyTasksRunning--;
+                 });
+    }
+    threadLocal[threadIndex].store(nullptr); // unpublish before mthread is destroyed at function exit
+    swapThread(oldThread);
+
+    /* remember exception to throw (disabled in the Godot build) */
+    // -- GODOT start --
+    // std::exception_ptr except = nullptr;
+    // if (cancellingException != nullptr) except = cancellingException;
+    // -- GODOT end --
+    /* wait for all threads to terminate: give up our own registration, then wait for the counter to reach zero */
+    threadCounter--;
+#if defined(__WIN32__)
+	size_t loopIndex = 1;
+#endif
+#define LOOP_YIELD_THRESHOLD (4096)
+	while (threadCounter > 0) {
+#if defined(__WIN32__)
+          if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0) // on Windows: yield only every LOOP_YIELD_THRESHOLD-th iteration, pause otherwise
+            yield();
+          else
+            _mm_pause();
+	  loopIndex++;
+#else
+          yield();
+#endif
+	}
+    // -- GODOT start --
+    // return except;
+    return;
+    // -- GODOT end --
+  }
+
+  bool TaskScheduler::steal_from_other_threads(Thread& thread)
+  { // visits the other registered threads once, starting after the caller's own index; true as soon as one steal succeeds
+    const size_t threadIndex = thread.threadIndex;
+    const size_t threadCount = this->threadCounter; // snapshot of the number of registered threads
+
+    for (size_t i=1; i<threadCount; i++)
+    {
+      pause_cpu(32);
+      size_t otherThreadIndex = threadIndex+i;
+      if (otherThreadIndex >= threadCount) otherThreadIndex -= threadCount; // wrap around past the last index
+
+      Thread* othread = threadLocal[otherThreadIndex].load();
+      if (!othread) // slot is currently empty
+        continue;
+
+      if (othread->tasks.steal(thread))
+        return true;
+    }
+
+    return false;
+  }
+
+  dll_export void TaskScheduler::startThreads() { // forwards to the global pool; threadPool must already exist
+    threadPool->startThreads();
+  }
+
+  dll_export void TaskScheduler::addScheduler(const Ref<TaskScheduler>& scheduler) { // registers the scheduler with the global pool
+    threadPool->add(scheduler);
+  }
+
+  dll_export void TaskScheduler::removeScheduler(const Ref<TaskScheduler>& scheduler) { // unregisters the scheduler from the global pool
+    threadPool->remove(scheduler);
+  }
+
+  RTC_NAMESPACE_END
+}
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
new file mode 100644
index 0000000000..8bd70b2b8c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
@@ -0,0 +1,386 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+#include "../sys/atomic.h"
+#include "../math/range.h"
+#include "../../include/embree3/rtcore.h"
+
+#include <list>
+
+namespace embree
+{
+
+  /* The tasking system exports some symbols to be used by the tutorials. Thus we
+     also hide it in the API namespace when requested. */
+  RTC_NAMESPACE_BEGIN
+
+  struct TaskScheduler : public RefCount
+  {
+    ALIGNED_STRUCT_(64);
+    friend class Device;
+
+    static const size_t TASK_STACK_SIZE = 4*1024;           //!< capacity of TaskQueue::tasks, in Task entries
+    static const size_t CLOSURE_STACK_SIZE = 512*1024;    //!< capacity of TaskQueue::stack, in bytes
+
+    struct Thread;
+
+    /*! virtual interface for all tasks; no virtual destructor is declared, and the visible code only placement-constructs these and never deletes them */
+    struct TaskFunction {
+      virtual void execute() = 0;
+    };
+
+    /*! builds a task interface from a closure: stores a copy of the callable and invokes it from execute() */
+    template<typename Closure>
+    struct ClosureTaskFunction : public TaskFunction
+    {
+      Closure closure; // copy taken by the constructor below
+      __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {}
+      void execute() { closure(); };
+    };
+
+    struct __aligned(64) Task
+    {
+      /*! states a task can be in: INITIALIZED = waiting to be run or stolen, DONE = taken (or an unused slot) */
+      enum { DONE, INITIALIZED };
+
+      /*! switch from one state to another; the switch is expected to succeed (checked by assert only) */
+      __forceinline void switch_state(int from, int to)
+      {
+	__memory_barrier();
+        MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to);
+	assert(success);
+      }
+
+      /*! try to switch from one state to another; returns whether the compare-exchange succeeded */
+      __forceinline bool try_switch_state(int from, int to) {
+	__memory_barrier();
+	return state.compare_exchange_strong(from,to);
+      }
+
+       /*! increment/decrement dependency counter */
+      void add_dependencies(int n) {
+	dependencies+=n;
+      }
+
+      /*! initialize all tasks to DONE state by default */
+      __forceinline Task()
+	: state(DONE) {}
+
+      /*! construction of new task; registers itself as one more dependency of the parent */
+      __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N)
+        : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N)
+      {
+        if (parent) parent->add_dependencies(+1);
+	switch_state(DONE,INITIALIZED); // NOTE(review): state is not in the initializer list; this seems to rely on the slot still holding DONE -- confirm
+      }
+
+      /*! construction of stolen task, stealing thread will decrement initial dependency */
+      __forceinline Task (TaskFunction* closure, Task* parent)
+        : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1)
+      {
+	switch_state(DONE,INITIALIZED); // unlike the constructor above, the parent's dependency counter is not incremented here
+      }
+
+      /*! try to steal this task; on success `child` is overwritten with a non-stealable task that runs the same closure and has this task as parent */
+      bool try_steal(Task& child)
+      {
+        if (!stealable) return false;
+	if (!try_switch_state(INITIALIZED,DONE)) return false;
+	new (&child) Task(closure, this);
+        return true;
+      }
+
+      /*! run this task (exported wrapper around run_internal) */
+      dll_export void run(Thread& thread);
+
+      void run_internal(Thread& thread);
+
+    public:
+      std::atomic<int> state;            //!< state this task is in
+      std::atomic<int> dependencies;     //!< dependencies to wait for
+      std::atomic<bool> stealable;       //!< true if task can be stolen
+      TaskFunction* closure;             //!< the closure to execute
+      Task* parent;                      //!< parent task to signal when we are finished
+      size_t stackPtr;                   //!< stack location where closure is stored
+      size_t N;                          //!< approximate size of task
+    };
+
+    struct TaskQueue
+    {
+      TaskQueue ()
+      : left(0), right(0), stackPtr(0) {}
+
+      __forceinline void* alloc(size_t bytes, size_t align = 64) // carves `bytes` out of the closure stack; aborts when the stack would overflow
+      {
+        size_t ofs = bytes + ((align - stackPtr) & (align-1)); // requested size plus the padding that brings stackPtr up to a multiple of align
+        if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+          // -- GODOT start --
+          // throw std::runtime_error("closure stack overflow");
+          abort();
+          // -- GODOT end --
+        stackPtr += ofs;
+        return &stack[stackPtr-bytes];
+      }
+
+      template<typename Closure>
+      __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) // appends a task for `closure` at the right end; aborts when the task stack is full
+      {
+        if (right >= TASK_STACK_SIZE)
+          // -- GODOT start --
+          // throw std::runtime_error("task stack overflow");
+          abort();
+          // -- GODOT end --
+
+	/* allocate new task on right side of stack */
+        size_t oldStackPtr = stackPtr; // stored in the task so that popping it can rewind the closure stack
+        TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
+        /* gcc 8 or later fails to compile without explicit .load() */
+        new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size);
+        right++;
+
+	/* also move left pointer */
+	if (left >= right-1) left = right-1;
+      }
+
+      dll_export bool execute_local(Thread& thread, Task* parent);
+      bool execute_local_internal(Thread& thread, Task* parent);
+      bool steal(Thread& thread);
+      size_t getTaskSizeAtLeft();
+
+      bool empty() { return right == 0; }
+
+    public:
+
+      /* task stack */
+      Task tasks[TASK_STACK_SIZE];
+      __aligned(64) std::atomic<size_t> left;   //!< threads steal from left
+      __aligned(64) std::atomic<size_t> right;  //!< new tasks are added to the right
+
+      /* closure stack */
+      __aligned(64) char stack[CLOSURE_STACK_SIZE];
+      size_t stackPtr; // number of bytes of `stack` currently in use
+    };
+
+    /*! thread local structure for each thread: its index, its task queue, the task it is currently running and its scheduler */
+    struct Thread
+    {
+      ALIGNED_STRUCT_(64);
+
+      Thread (size_t threadIndex, const Ref<TaskScheduler>& scheduler)
+      : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {}
+
+      __forceinline size_t threadCount() { // current value of the scheduler's registered-thread counter
+        return scheduler->threadCounter;
+      }
+
+      size_t threadIndex;              //!< ID of this thread
+      TaskQueue tasks;                 //!< local task queue
+      Task* task;                      //!< current active task
+      Ref<TaskScheduler> scheduler;     //!< pointer to task scheduler
+    };
+
+    /*! pool of worker threads that serve the schedulers registered through add() */
+    struct ThreadPool
+    {
+      ThreadPool (bool set_affinity);
+      ~ThreadPool ();
+
+      /*! starts the threads */
+      dll_export void startThreads();
+
+      /*! sets number of threads to use */
+      void setNumThreads(size_t numThreads, bool startThreads = false);
+
+      /*! adds a task scheduler object for scheduling */
+      dll_export void add(const Ref<TaskScheduler>& scheduler);
+
+      /*! remove the task scheduler object again */
+      dll_export void remove(const Ref<TaskScheduler>& scheduler);
+
+      /*! returns number of threads of the thread pool */
+      size_t size() const { return numThreads; }
+
+      /*! main loop for all threads */
+      void thread_loop(size_t threadIndex);
+
+    private:
+      std::atomic<size_t> numThreads;        // configured pool size, as returned by size()
+      std::atomic<size_t> numThreadsRunning; // workers with an index >= this value leave thread_loop()
+      bool set_affinity;
+      std::atomic<bool> running;
+      std::vector<thread_t> threads;
+
+    private:
+      MutexSys mutex;         // protects schedulers and updates of numThreadsRunning
+      ConditionSys condition; // signalled when a scheduler is added or numThreadsRunning changes
+      std::list<Ref<TaskScheduler> > schedulers;
+    };
+
+    TaskScheduler ();
+    ~TaskScheduler ();
+
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again */
+    static void destroy();
+
+    /*! lets new worker threads join the tasking system; reset() clears the root-task flag that join() waits on */
+    void join();
+    void reset();
+
+    /*! let a worker thread allocate a thread index */
+    dll_export ssize_t allocThreadIndex();
+
+    /*! wait for some number of threads available (threadCount includes main thread) */
+    void wait_for_threads(size_t threadCount);
+
+    /*! thread loop for all worker threads (the Godot build returns void instead of an exception pointer) */
+    // -- GODOT start --
+    // std::exception_ptr thread_loop(size_t threadIndex);
+    void thread_loop(size_t threadIndex);
+    // -- GODOT end --
+
+    /*! steals a task from a different thread */
+    bool steal_from_other_threads(Thread& thread);
+
+    template<typename Predicate, typename Body>
+      static void steal_loop(Thread& thread, const Predicate& pred, const Body& body);
+
+    /* runs `closure` as the root task on the calling thread and returns only after this scheduler's thread counter is back at zero */
+    template<typename Closure>
+      void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true)
+    {
+      if (useThreadPool) startThreads();
+
+      size_t threadIndex = allocThreadIndex();
+      std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+      Thread& thread = *mthread;
+      assert(threadLocal[threadIndex].load() == nullptr);
+      threadLocal[threadIndex] = &thread;
+      Thread* oldThread = swapThread(&thread);
+      thread.tasks.push_right(thread,size,closure);
+      {
+        Lock<MutexSys> lock(mutex);
+	anyTasksRunning++;
+        hasRootTask = true;
+        condition.notify_all(); // wakes threads blocked in join()
+      }
+
+      if (useThreadPool) addScheduler(this);
+
+      while (thread.tasks.execute_local(thread,nullptr));
+      anyTasksRunning--;
+      if (useThreadPool) removeScheduler(this);
+
+      threadLocal[threadIndex] = nullptr;
+      swapThread(oldThread);
+
+      /* remember exception to throw; NOTE(review): the only visible assignment of a caught exception is commented out, so this looks always null here -- confirm */
+      std::exception_ptr except = nullptr;
+      if (cancellingException != nullptr) except = cancellingException;
+
+      /* wait for all threads to terminate */
+      threadCounter--;
+      while (threadCounter > 0) yield();
+      cancellingException = nullptr;
+
+      /* re-throw proper exception */
+      if (except != nullptr)
+        std::rethrow_exception(except);
+    }
+
+    /* queue `closure` on the calling thread's task stack; a thread without a Thread structure runs it as a root task instead */
+    template<typename Closure>
+    static __forceinline void spawn(size_t size, const Closure& closure)
+    {
+      Thread* const self = TaskScheduler::thread();
+      if (likely(self != nullptr)) self->tasks.push_right(*self,size,closure);
+      else                         instance()->spawn_root(closure,size);
+    }
+
+    /* spawn a new task at the top of the threads task stack, using a task size of 1 */
+    template<typename Closure>
+    static __forceinline void spawn(const Closure& closure) {
+      spawn(1,closure);
+    }
+
+    /* spawn a new task set: halves [begin,end) recursively and calls closure(range<Index>) on pieces of at most blockSize elements */
+    template<typename Index, typename Closure>
+    static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure)
+    {
+      spawn(end-begin, [=]()
+        {
+	  if (end-begin <= blockSize) {
+	    return closure(range<Index>(begin,end));
+	  }
+	  const Index center = (begin+end)/2;
+	  spawn(begin,center,blockSize,closure);
+	  spawn(center,end  ,blockSize,closure);
+	  wait(); // both halves must finish before this task returns
+	});
+    }
+
+    /* work on spawned subtasks and wait until all have finished; returns false only if a cancelling exception is recorded */
+    dll_export static bool wait();
+
+    /* returns the ID of the current thread */
+    dll_export static size_t threadID();
+
+    /* returns the index (0..threadCount-1) of the current thread */
+    dll_export static size_t threadIndex();
+
+    /* returns the total number of threads */
+    dll_export static size_t threadCount();
+
+  private:
+
+    /* returns the thread local task list of this worker thread */
+    dll_export static Thread* thread();
+
+    /* sets the thread local task list of this worker thread and returns the previous one */
+    dll_export static Thread* swapThread(Thread* thread);
+
+    /*! returns the taskscheduler object to be used by the master thread */
+    dll_export static TaskScheduler* instance();
+
+    /*! starts the threads */
+    dll_export static void startThreads();
+
+    /*! adds a task scheduler object for scheduling */
+    dll_export static void addScheduler(const Ref<TaskScheduler>& scheduler);
+
+    /*! remove the task scheduler object again */
+    dll_export static void removeScheduler(const Ref<TaskScheduler>& scheduler);
+
+  private:
+    std::vector<atomic<Thread*>> threadLocal; // one slot per thread index; nullptr while the slot is unused
+    std::atomic<size_t> threadCounter;        // number of threads currently registered via allocThreadIndex()
+    std::atomic<size_t> anyTasksRunning;      // non-zero while the root task or stolen work is executing
+    std::atomic<bool> hasRootTask;            // set by spawn_root(), cleared by reset()
+    std::exception_ptr cancellingException;
+    MutexSys mutex;
+    ConditionSys condition;
+
+  private:
+    static size_t g_numThreads;
+    static __thread TaskScheduler* g_instance;
+    static __thread Thread* thread_local_thread;
+    static ThreadPool* threadPool;
+  };
+
+  RTC_NAMESPACE_END
+
+#if defined(RTC_NAMESPACE)
+    using RTC_NAMESPACE::TaskScheduler;
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h
new file mode 100644
index 0000000000..776f98cdac
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h
@@ -0,0 +1,46 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if !defined(__WIN32__)
+#error PPL tasking system only available under windows
+#endif
+
+#include <ppl.h>
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler (only declared here) */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again (only declared here) */
+    static void destroy();
+
+    /* returns the ID of the current thread, as reported by GetCurrentThreadId() */
+    static __forceinline size_t threadID() {
+      return GetCurrentThreadId();
+    }
+
+    /* returns the index (0..threadCount-1) of the current thread; this backend always reports 0 */
+    /* FIXME: threadIndex is NOT supported by PPL! */
+    static __forceinline size_t threadIndex() {
+      return 0;
+    }
+
+    /* returns the total number of threads: GetMaximumProcessorCount over all processor groups, plus one */
+    static __forceinline size_t threadCount() {
+      return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1;
+    }
+  };
+};
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h
new file mode 100644
index 0000000000..98dba26871
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h
@@ -0,0 +1,67 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if defined(__WIN32__)
+#  define NOMINMAX
+#endif
+
+// We need to define these to avoid implicit linkage against
+// tbb_debug.lib under Windows. When removing these lines debug build
+// under Windows fails.
+#define __TBB_NO_IMPLICIT_LINKAGE 1
+#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1
+#include "tbb/tbb.h"
+#include "tbb/parallel_sort.h"
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! sets up the tasking backend (only declared here) */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! shuts the tasking backend down again (only declared here) */
+    static void destroy();
+
+    /* ID of the calling thread; identical to threadIndex() */
+    static __forceinline size_t threadID()
+    {
+      return threadIndex();
+    }
+
+    /* index of the calling thread as reported by TBB; always 0 for TBB interface versions below 9000 */
+    static __forceinline size_t threadIndex()
+    {
+#if TBB_INTERFACE_VERSION < 9000
+      return 0;
+#elif TBB_INTERFACE_VERSION < 9100
+      return tbb::task_arena::current_thread_index();
+#else
+      return tbb::this_task_arena::current_thread_index();
+#endif
+    }
+
+    /* total number of threads: arena concurrency from interface version 9100 on, TBB's default thread count before that */
+    static __forceinline size_t threadCount() {
+#if TBB_INTERFACE_VERSION < 9100
+      return tbb::task_scheduler_init::default_num_threads();
+#else
+      return tbb::this_task_arena::max_concurrency();
+#endif
+    }
+
+  };
+
+};