Diffstat (limited to 'thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h')
-rw-r--r-- thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h | 325
1 file changed, 325 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h b/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h
new file mode 100644
index 0000000000..5c215288b6
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+
+/* force a complete cache invalidation when running out of allocation space */
+#define FORCE_SIMPLE_FLUSH 0
+
+#define THREAD_BLOCK_ATOMIC_ADD 4
+
+/* cache statistics are compiled out in both debug and release builds;
+   change a branch to `#define CACHE_STATS(x) x` to collect them */
+#if defined(DEBUG)
+#define CACHE_STATS(x)
+#else
+#define CACHE_STATS(x)
+#endif
+
+namespace embree
+{
+ class SharedTessellationCacheStats
+ {
+ public:
+ /* stats */
+ static std::atomic<size_t> cache_accesses;
+ static std::atomic<size_t> cache_hits;
+ static std::atomic<size_t> cache_misses;
+ static std::atomic<size_t> cache_flushes;
+ static size_t cache_num_patches;
+ __aligned(64) static SpinLock mtx;
+
+ /* print stats for debugging */
+ static void printStats();
+ static void clearStats();
+ };
+
+ void resizeTessellationCache(size_t new_size);
+ void resetTessellationCache();
+
+ ////////////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////////////
+
+ struct __aligned(64) ThreadWorkState
+ {
+ ALIGNED_STRUCT_(64);
+
+ std::atomic<size_t> counter;
+ ThreadWorkState* next;
+ bool allocated;
+
+ __forceinline ThreadWorkState(bool allocated = false)
+ : counter(0), next(nullptr), allocated(allocated)
+ {
+ assert( ((size_t)this % 64) == 0 );
+ }
+ };
+
+ class __aligned(64) SharedLazyTessellationCache
+ {
+ public:
+
+ static const size_t NUM_CACHE_SEGMENTS = 8;
+ static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
+ static const size_t COMMIT_INDEX_SHIFT = 32+8; /* the time stamp is stored above the offset bits of a tag */
+#if defined(__X86_64__) || defined(__aarch64__)
+ static const size_t REF_TAG_MASK = 0xffffffffff; /* 40-bit byte offset into the cache */
+#else
+ static const size_t REF_TAG_MASK = 0x7FFFFFFF; /* 31-bit byte offset on 32-bit targets */
+#endif
+ static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1;
+ static const size_t BLOCK_SIZE = 64;
+
+
+ /*! per-thread work state used to lock the shared tessellation cache */
+ static __thread ThreadWorkState* init_t_state;
+ static ThreadWorkState* current_t_state;
+
+ static __forceinline ThreadWorkState *threadState()
+ {
+ if (unlikely(!init_t_state))
+ /* sets init_t_state; cannot return the pointer directly due to a macOS ICC bug */
+ SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
+ return init_t_state;
+ }
+
+ struct Tag
+ {
+ __forceinline Tag() : data(0) {}
+
+ __forceinline Tag(void* ptr, size_t combinedTime) {
+ init(ptr,combinedTime);
+ }
+
+ __forceinline Tag(size_t ptr, size_t combinedTime) {
+ init((void*)ptr,combinedTime);
+ }
+
+ __forceinline void init(void* ptr, size_t combinedTime)
+ {
+ if (ptr == nullptr) {
+ data = 0;
+ return;
+ }
+ int64_t new_root_ref = (int64_t) ptr;
+ new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
+ assert( new_root_ref <= (int64_t)REF_TAG_MASK );
+ new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
+ data = new_root_ref;
+ }
+
+ __forceinline int64_t get() const { return data.load(); }
+ __forceinline void set( int64_t v ) { data.store(v); }
+ __forceinline void reset() { data.store(0); }
+
+ private:
+ atomic<int64_t> data;
+ };
+
+ static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }
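+
+ /* a minimal sketch of the tag encoding, assuming the 64-bit layout above
+    (40-bit offset, combined time in the bits above it):
+
+      void* base    = sharedLazyTessellationCache.getDataPtr();
+      Tag tag((void*)((char*)base + 0x1234), 3); // offset 0x1234, time 3
+      int64_t v     = tag.get();                 // ((int64_t)3 << 40) | 0x1234
+      size_t offset = v & REF_TAG_MASK;          // -> 0x1234
+      size_t stamp  = extractCommitIndex(v);     // -> 3
+ */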
+
+ struct CacheEntry
+ {
+ Tag tag;
+ SpinLock mutex;
+ };
+
+ private:
+
+ float *data; /* backing memory of the cache */
+ bool hugepages; /* true if the memory is backed by huge pages */
+ size_t size; /* size of the cache in bytes */
+ size_t maxBlocks; /* number of 64-byte blocks in the cache */
+ ThreadWorkState *threadWorkState; /* pre-allocated per-thread work states */
+
+ __aligned(64) std::atomic<size_t> localTime; /* cache-local time, advanced on segment switches */
+ __aligned(64) std::atomic<size_t> next_block; /* next free block index */
+ __aligned(64) SpinLock reset_state;
+ __aligned(64) SpinLock linkedlist_mtx; /* protects the list of thread work states */
+ __aligned(64) std::atomic<size_t> switch_block_threshold; /* block index at which the current segment ends */
+ __aligned(64) std::atomic<size_t> numRenderThreads;
+
+
+ public:
+
+
+ SharedLazyTessellationCache();
+ ~SharedLazyTessellationCache();
+
+ void getNextRenderThreadWorkState();
+
+ __forceinline size_t maxAllocSize() const {
+ return switch_block_threshold;
+ }
+
+ __forceinline size_t getCurrentIndex() { return localTime.load(); }
+ __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }
+
+ __forceinline size_t getTime(const size_t globalTime) {
+ return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
+ }
+
+
+ __forceinline size_t lockThread (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); }
+ __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }
+
+ __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }
+
+ static __forceinline void lock () { sharedLazyTessellationCache.lockThread(threadState()); }
+ static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
+ static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
+ static __forceinline size_t getState() { return threadState()->counter.load(); }
+ static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }
+
+ static __forceinline size_t getTCacheTime(const size_t globalTime) {
+ return sharedLazyTessellationCache.getTime(globalTime);
+ }
+
+ /* per thread lock */
+ __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
+ {
+ while(1)
+ {
+ size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
+ if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
+ {
+ /* lock failed, wait until the synchronization phase is over */
+ sharedLazyTessellationCache.unlockThread(t_state,-1);
+ sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
+ }
+ else
+ break;
+ }
+ }
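+
+ /* sketch of the intended discipline (hypothetical caller): the per-thread
+    counter stays raised while the thread references cache memory, so a
+    flush can wait until every counter has dropped back to zero:
+
+      SharedLazyTessellationCache::lockThreadLoop();
+      // ... read or write tessellation data in the cache ...
+      SharedLazyTessellationCache::unlock();
+ */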
+
+ static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
+ {
+ const int64_t subdiv_patch_root_ref = entry.tag.get();
+ CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);
+
+ if (likely(subdiv_patch_root_ref != 0))
+ {
+ const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
+ const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
+
+ if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
+ {
+ CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
+ return (void*) subdiv_patch_root;
+ }
+ }
+ CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
+ return nullptr;
+ }
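+
+ /* lookup() takes no lock itself; a hypothetical read path keeps the
+    per-thread lock held around it so the entry cannot be flushed while
+    its data is in use:
+
+      SharedLazyTessellationCache::lockThreadLoop();
+      if (void* ptr = SharedLazyTessellationCache::lookup(entry,globalTime)) {
+        // ... consume the cached data at ptr ...
+      }
+      SharedLazyTessellationCache::unlock();
+ */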
+
+ template<typename Constructor>
+ static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
+ {
+ ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();
+
+ while (true)
+ {
+ sharedLazyTessellationCache.lockThreadLoop(t_state);
+ void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
+ if (patch) return (decltype(constructor())) patch;
+
+ if (entry.mutex.try_lock())
+ {
+ if (!validTag(entry.tag,globalTime))
+ {
+ auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
+ auto ret = constructor(); // thread is locked here!
+ assert(ret); /* the constructor must never return nullptr */
+ auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
+ auto time = before ? timeBefore : timeAfter;
+ __memory_barrier();
+ entry.tag = SharedLazyTessellationCache::Tag(ret,time);
+ __memory_barrier();
+ entry.mutex.unlock();
+ return ret;
+ }
+ entry.mutex.unlock();
+ }
+ SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
+ }
+ }
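+
+ /* a minimal usage sketch, assuming a hypothetical fillGrid() helper and
+    byte count nBytes; the constructor builds its result in cache memory:
+
+      float* grid = SharedLazyTessellationCache::lookup(entry, globalTime,
+        [&] () -> float* {
+          float* ptr = (float*) SharedLazyTessellationCache::malloc(nBytes);
+          fillGrid(ptr);
+          return ptr;
+        });
+      // ... use grid ...
+      SharedLazyTessellationCache::unlock(); // lookup returns with the lock held
+ */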
+
+ /* an index is valid while it lies within the last NUM_CACHE_SEGMENTS time
+    steps; e.g. with 8 segments, an entry stamped at time 10 stays valid up
+    to and including time 17 */
+ __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
+ {
+#if FORCE_SIMPLE_FLUSH == 1
+ return i == getTime(globalTime);
+#else
+ return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
+#endif
+ }
+
+ static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
+ {
+ return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
+ }
+
+
+ static __forceinline bool validTag(const Tag& tag, size_t globalTime)
+ {
+ const int64_t subdiv_patch_root_ref = tag.get();
+ if (subdiv_patch_root_ref == 0) return false;
+ const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
+ return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
+ }
+
+ void waitForUsersLessEqual(ThreadWorkState *const t_state,
+ const unsigned int users);
+
+ __forceinline size_t alloc(const size_t blocks)
+ {
+ if (unlikely(blocks >= switch_block_threshold))
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");
+
+ assert(blocks < switch_block_threshold);
+ size_t index = next_block.fetch_add(blocks);
+ if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
+ return index;
+ }
+
+ static __forceinline void* malloc(const size_t bytes)
+ {
+ size_t block_index = -1;
+ ThreadWorkState *const t_state = threadState();
+ while (true)
+ {
+ block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
+ if (block_index == (size_t)-1)
+ {
+ sharedLazyTessellationCache.unlockThread(t_state);
+ sharedLazyTessellationCache.allocNextSegment();
+ sharedLazyTessellationCache.lockThread(t_state);
+ continue;
+ }
+ break;
+ }
+ return sharedLazyTessellationCache.getBlockPtr(block_index);
+ }
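+
+ /* e.g. a hypothetical request for a 17x17 grid of 3 floats; the byte count
+    is rounded up to whole 64-byte blocks, and the caller must already hold
+    its per-thread lock (see lockThreadLoop):
+
+      void* ptr = SharedLazyTessellationCache::malloc(17*17*3*sizeof(float));
+ */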
+
+ __forceinline void *getBlockPtr(const size_t block_index)
+ {
+ assert(block_index < maxBlocks);
+ assert(data);
+ assert(block_index*16 <= size);
+ /* data is a float array, so one 64-byte block spans 16 floats */
+ return (void*)&data[block_index*16];
+ }
+
+ __forceinline void* getDataPtr() { return data; }
+ __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
+ __forceinline size_t getMaxBlocks() { return maxBlocks; }
+ __forceinline size_t getSize() { return size; }
+
+ void allocNextSegment();
+ void realloc(const size_t newSize);
+
+ void reset();
+
+ static SharedLazyTessellationCache sharedLazyTessellationCache;
+ };
+}