Diffstat (limited to 'thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h')
-rw-r--r-- | thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h | 325
1 file changed, 325 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h b/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h
new file mode 100644
index 0000000000..5c215288b6
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+
+/* force a complete cache invalidation when running out of allocation space */
+#define FORCE_SIMPLE_FLUSH 0
+
+#define THREAD_BLOCK_ATOMIC_ADD 4
+
+#if defined(DEBUG)
+#define CACHE_STATS(x)
+#else
+#define CACHE_STATS(x)
+#endif
+
+namespace embree
+{
+  class SharedTessellationCacheStats
+  {
+  public:
+    /* stats */
+    static std::atomic<size_t> cache_accesses;
+    static std::atomic<size_t> cache_hits;
+    static std::atomic<size_t> cache_misses;
+    static std::atomic<size_t> cache_flushes;
+    static size_t cache_num_patches;
+    __aligned(64) static SpinLock mtx;
+
+    /* print stats for debugging */
+    static void printStats();
+    static void clearStats();
+  };
+
+  void resizeTessellationCache(size_t new_size);
+  void resetTessellationCache();
+
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(64) ThreadWorkState
+  {
+    ALIGNED_STRUCT_(64);
+
+    std::atomic<size_t> counter;
+    ThreadWorkState* next;
+    bool allocated;
+
+    __forceinline ThreadWorkState(bool allocated = false)
+      : counter(0), next(nullptr), allocated(allocated)
+    {
+      assert( ((size_t)this % 64) == 0 );
+    }
+  };
+
+  class __aligned(64) SharedLazyTessellationCache
+  {
+  public:
+
+    static const size_t NUM_CACHE_SEGMENTS              = 8;
+    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
+    static const size_t COMMIT_INDEX_SHIFT              = 32+8;
+#if defined(__X86_64__) || defined(__aarch64__)
+    static const size_t REF_TAG_MASK                    = 0xffffffffff;
+#else
+    static const size_t REF_TAG_MASK                    = 0x7FFFFFFF;
+#endif
+    static const size_t MAX_TESSELLATION_CACHE_SIZE     = REF_TAG_MASK+1;
+    static const size_t BLOCK_SIZE                      = 64;
+
+
+    /*! Per thread tessellation ref cache */
+    static __thread ThreadWorkState* init_t_state;
+    static ThreadWorkState* current_t_state;
+
+    static __forceinline ThreadWorkState *threadState()
+    {
+      if (unlikely(!init_t_state))
+        /* sets init_t_state, can't return pointer due to macosx icc bug*/
+        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
+      return init_t_state;
+    }
+
+    struct Tag
+    {
+      __forceinline Tag() : data(0) {}
+
+      __forceinline Tag(void* ptr, size_t combinedTime) {
+        init(ptr,combinedTime);
+      }
+
+      __forceinline Tag(size_t ptr, size_t combinedTime) {
+        init((void*)ptr,combinedTime);
+      }
+
+      __forceinline void init(void* ptr, size_t combinedTime)
+      {
+        if (ptr == nullptr) {
+          data = 0;
+          return;
+        }
+        int64_t new_root_ref = (int64_t) ptr;
+        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
+        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
+        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
+        data = new_root_ref;
+      }
+
+      __forceinline int64_t get() const { return data.load(); }
+      __forceinline void set( int64_t v ) { data.store(v); }
+      __forceinline void reset() { data.store(0); }
+
+    private:
+      atomic<int64_t> data;
+    };
+
+    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }
+
+    struct CacheEntry
+    {
+      Tag tag;
+      SpinLock mutex;
+    };
+
+  private:
+
+    float *data;
+    bool hugepages;
+    size_t size;
+    size_t maxBlocks;
+    ThreadWorkState *threadWorkState;
+
+    __aligned(64) std::atomic<size_t> localTime;
+    __aligned(64) std::atomic<size_t> next_block;
+    __aligned(64) SpinLock reset_state;
+    __aligned(64) SpinLock linkedlist_mtx;
+    __aligned(64) std::atomic<size_t> switch_block_threshold;
+    __aligned(64) std::atomic<size_t> numRenderThreads;
+
+
+  public:
+
+
+    SharedLazyTessellationCache();
+    ~SharedLazyTessellationCache();
+
+    void getNextRenderThreadWorkState();
+
+    __forceinline size_t maxAllocSize() const {
+      return switch_block_threshold;
+    }
+
+    __forceinline size_t getCurrentIndex() { return localTime.load(); }
+    __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }
+
+    __forceinline size_t getTime(const size_t globalTime) {
+      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
+    }
+
+
+    __forceinline size_t lockThread (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); }
+    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }
+
+    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }
+
+    static __forceinline void lock () { sharedLazyTessellationCache.lockThread(threadState()); }
+    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
+    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
+    static __forceinline size_t getState() { return threadState()->counter.load(); }
+    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }
+
+    static __forceinline size_t getTCacheTime(const size_t globalTime) {
+      return sharedLazyTessellationCache.getTime(globalTime);
+    }
+
+    /* per thread lock */
+    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
+    {
+      while(1)
+      {
+        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
+        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
+        {
+          /* lock failed wait until sync phase is over */
+          sharedLazyTessellationCache.unlockThread(t_state,-1);
+          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
+        }
+        else
+          break;
+      }
+    }
+
+    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
+    {
+      const int64_t subdiv_patch_root_ref = entry.tag.get();
+      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);
+
+      if (likely(subdiv_patch_root_ref != 0))
+      {
+        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
+        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
+
+        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
+        {
+          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
+          return (void*) subdiv_patch_root;
+        }
+      }
+      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
+      return nullptr;
+    }
+
+    template<typename Constructor>
+    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
+    {
+      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();
+
+      while (true)
+      {
+        sharedLazyTessellationCache.lockThreadLoop(t_state);
+        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
+        if (patch) return (decltype(constructor())) patch;
+
+        if (entry.mutex.try_lock())
+        {
+          if (!validTag(entry.tag,globalTime))
+          {
+            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
+            auto ret = constructor(); // thread is locked here!
+            assert(ret);
+            /* this should never return nullptr */
+            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
+            auto time = before ? timeBefore : timeAfter;
+            __memory_barrier();
+            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
+            __memory_barrier();
+            entry.mutex.unlock();
+            return ret;
+          }
+          entry.mutex.unlock();
+        }
+        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
+      }
+    }
+
+    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
+    {
+#if FORCE_SIMPLE_FLUSH == 1
+      return i == getTime(globalTime);
+#else
+      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
+#endif
+    }
+
+    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
+    {
+      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
+    }
+
+
+    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
+    {
+      const int64_t subdiv_patch_root_ref = tag.get();
+      if (subdiv_patch_root_ref == 0) return false;
+      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
+      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
+    }
+
+    void waitForUsersLessEqual(ThreadWorkState *const t_state,
+                               const unsigned int users);
+
+    __forceinline size_t alloc(const size_t blocks)
+    {
+      if (unlikely(blocks >= switch_block_threshold))
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");
+
+      assert(blocks < switch_block_threshold);
+      size_t index = next_block.fetch_add(blocks);
+      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
+      return index;
+    }
+
+    static __forceinline void* malloc(const size_t bytes)
+    {
+      size_t block_index = -1;
+      ThreadWorkState *const t_state = threadState();
+      while (true)
+      {
+        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
+        if (block_index == (size_t)-1)
+        {
+          sharedLazyTessellationCache.unlockThread(t_state);
+          sharedLazyTessellationCache.allocNextSegment();
+          sharedLazyTessellationCache.lockThread(t_state);
+          continue;
+        }
+        break;
+      }
+      return sharedLazyTessellationCache.getBlockPtr(block_index);
+    }
+
+    __forceinline void *getBlockPtr(const size_t block_index)
+    {
+      assert(block_index < maxBlocks);
+      assert(data);
+      assert(block_index*16 <= size);
+      return (void*)&data[block_index*16];
+    }
+
+    __forceinline void* getDataPtr() { return data; }
+    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
+    __forceinline size_t getMaxBlocks() { return maxBlocks; }
+    __forceinline size_t getSize() { return size; }
+
+    void allocNextSegment();
+    void realloc(const size_t newSize);
+
+    void reset();
+
+    static SharedLazyTessellationCache sharedLazyTessellationCache;
+  };
+}
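
For reviewers unfamiliar with this cache, the calling pattern can be read off the templated lookup() above: the constructor callback runs with the per-thread work state and the entry mutex held and allocates its output through SharedLazyTessellationCache::malloc(), and the per-thread lock stays held when lookup() returns, so the caller releases it once it is done with the returned pointer. The snippet below is a minimal caller-side sketch of that pattern, not code from the patch; the buildGrid() helper and the gridBytes size are hypothetical placeholders.

    // Caller-side usage sketch (illustration only; not part of the patch above).
    #include "tessellation_cache.h"

    namespace embree
    {
      void useTessellatedPatch(SharedLazyTessellationCache::CacheEntry& entry,
                               size_t globalTime, size_t gridBytes)
      {
        /* lookup() locks this thread's work state via lockThreadLoop() and keeps
           it locked on return, so the returned pointer stays valid until unlock() */
        void* grid = SharedLazyTessellationCache::lookup(entry, globalTime, [&] () -> void* {
          /* the constructor runs with entry.mutex held and must not return nullptr;
             it allocates its output from the cache itself */
          void* block = SharedLazyTessellationCache::malloc(gridBytes);
          /* buildGrid(block);  // hypothetical: fill the block with tessellated data */
          return block;
        });

        /* ... traverse/intersect the grid while the per-thread lock is held ... */
        (void)grid;

        SharedLazyTessellationCache::unlock(); /* grid must not be dereferenced past this point */
      }
    }

With the default before=false, the tag is stamped with the local time taken after construction; per validCacheIndex(), an entry then stays valid for NUM_CACHE_SEGMENTS-1 increments of the cache's local time before it is treated as stale and rebuilt.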