// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

#define THREAD_BLOCK_ATOMIC_ADD 4

#if defined(DEBUG)
#define CACHE_STATS(x)
#else
#define CACHE_STATS(x)
#endif

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////

  struct __aligned(64) ThreadWorkState
  {
    ALIGNED_STRUCT_(64);

    std::atomic<size_t> counter;
    ThreadWorkState* next;
    bool allocated;

    __forceinline ThreadWorkState(bool allocated = false)
      : counter(0), next(nullptr), allocated(allocated)
    {
      assert( ((size_t)this % 64) == 0 );
    }
  };

  class __aligned(64) SharedLazyTessellationCache
  {
  public:
    static const size_t NUM_CACHE_SEGMENTS              = 8;
    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
    static const size_t COMMIT_INDEX_SHIFT              = 32+8;
#if defined(__64BIT__)
    static const size_t REF_TAG_MASK                    = 0xffffffffff;
#else
    static const size_t REF_TAG_MASK                    = 0x7FFFFFFF;
#endif
    static const size_t MAX_TESSELLATION_CACHE_SIZE     = REF_TAG_MASK+1;
    static const size_t BLOCK_SIZE                      = 64;

    /*! Per thread tessellation ref cache */
    static __thread ThreadWorkState* init_t_state;
    static ThreadWorkState* current_t_state;

    static __forceinline ThreadWorkState* threadState()
    {
      if (unlikely(!init_t_state))
        /* sets init_t_state, can't return pointer due to macosx icc bug */
        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
      return init_t_state;
    }

    struct Tag
    {
      __forceinline Tag() : data(0) {}

      __forceinline Tag(void* ptr, size_t combinedTime) {
        init(ptr,combinedTime);
      }

      __forceinline Tag(size_t ptr, size_t combinedTime) {
        init((void*)ptr,combinedTime);
      }

      __forceinline void init(void* ptr, size_t combinedTime)
      {
        if (ptr == nullptr) {
          data = 0;
          return;
        }
        int64_t new_root_ref = (int64_t) ptr;
        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
        data = new_root_ref;
      }

      __forceinline int64_t get() const { return data.load(); }
      __forceinline void set( int64_t v ) { data.store(v); }
      __forceinline void reset() { data.store(0); }

    private:
      atomic<int64_t> data;
    };

    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

    struct CacheEntry
    {
      Tag tag;
      SpinLock mutex;
    };

  private:

    float *data;
    bool hugepages;
    size_t size;
    size_t maxBlocks;
    ThreadWorkState *threadWorkState;

    __aligned(64) std::atomic<size_t> localTime;
    __aligned(64) std::atomic<size_t> next_block;
    __aligned(64) SpinLock reset_state;
    __aligned(64) SpinLock linkedlist_mtx;
    __aligned(64) std::atomic<size_t> switch_block_threshold;
    __aligned(64) std::atomic<size_t> numRenderThreads;

  public:

    SharedLazyTessellationCache();
    ~SharedLazyTessellationCache();

    void getNextRenderThreadWorkState();
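    /* The cache is recycled in NUM_CACHE_SEGMENTS segments. localTime counts the
       segments handed out so far, and getTime() combines it with the per-frame
       globalTime; an entry with commit index i remains valid while
       i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime). For example, with
       NUM_CACHE_SEGMENTS == 8, an entry committed at time 10 becomes stale once
       the current time reaches 18. */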
    __forceinline size_t maxAllocSize() const {
      return switch_block_threshold;
    }

    __forceinline size_t getCurrentIndex() { return localTime.load(); }
    __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

    __forceinline size_t getTime(const size_t globalTime) {
      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
    }

    __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1)  { return t_state->counter.fetch_add(plus); }
    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

    static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
    static __forceinline size_t getState() { return threadState()->counter.load(); }
    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

    static __forceinline size_t getTCacheTime(const size_t globalTime) {
      return sharedLazyTessellationCache.getTime(globalTime);
    }

    /* per thread lock */
    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
    {
      while(1)
      {
        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
        {
          /* lock failed, wait until sync phase is over */
          sharedLazyTessellationCache.unlockThread(t_state,-1);
          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
        }
        else
          break;
      }
    }

    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = entry.tag.get();
      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

      if (likely(subdiv_patch_root_ref != 0))
      {
        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
        {
          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
          return (void*) subdiv_patch_root;
        }
      }
      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
      return nullptr;
    }

    template<typename Constructor>
    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
    {
      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

      while (true)
      {
        sharedLazyTessellationCache.lockThreadLoop(t_state);
        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
        if (patch) return (decltype(constructor())) patch;

        if (entry.mutex.try_lock())
        {
          if (!validTag(entry.tag,globalTime))
          {
            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
            auto ret = constructor(); // thread is locked here!
            assert(ret); /* this should never return nullptr */
            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
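            /* 'before' selects whether the tag is stamped with the time sampled
               before or after running the constructor; stamping with the earlier
               time makes the entry expire sooner if the cache time advances while
               the constructor is still building the data. The __memory_barrier()
               calls below keep the compiler from reordering the tag update with
               the construction of the data it publishes. */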
            auto time = before ? timeBefore : timeAfter;
            __memory_barrier();
            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
            __memory_barrier();
            entry.mutex.unlock();
            return ret;
          }
          entry.mutex.unlock();
        }
        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
      }
    }

    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
    {
#if FORCE_SIMPLE_FLUSH == 1
      return i == getTime(globalTime);
#else
      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
    }

    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
    {
      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
    }

    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = tag.get();
      if (subdiv_patch_root_ref == 0) return false;
      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
    }

    void waitForUsersLessEqual(ThreadWorkState *const t_state, const unsigned int users);

    __forceinline size_t alloc(const size_t blocks)
    {
      if (unlikely(blocks >= switch_block_threshold))
        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

      assert(blocks < switch_block_threshold);
      size_t index = next_block.fetch_add(blocks);
      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
      return index;
    }

    static __forceinline void* malloc(const size_t bytes)
    {
      size_t block_index = -1;
      ThreadWorkState *const t_state = threadState();
      while (true)
      {
        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
        if (block_index == (size_t)-1)
        {
          sharedLazyTessellationCache.unlockThread(t_state);
          sharedLazyTessellationCache.allocNextSegment();
          sharedLazyTessellationCache.lockThread(t_state);
          continue;
        }
        break;
      }
      return sharedLazyTessellationCache.getBlockPtr(block_index);
    }

    __forceinline void *getBlockPtr(const size_t block_index)
    {
      assert(block_index < maxBlocks);
      assert(data);
      assert(block_index*16 <= size);
      return (void*)&data[block_index*16];
    }

    __forceinline void*  getDataPtr()      { return data; }
    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
    __forceinline size_t getMaxBlocks()    { return maxBlocks; }
    __forceinline size_t getSize()         { return size; }

    void allocNextSegment();
    void realloc(const size_t newSize);

    void reset();

    static SharedLazyTessellationCache sharedLazyTessellationCache;
  };
}
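/* Usage sketch (illustrative only; evalPatch and the lambda body are hypothetical
   and not part of this interface): the constructor-based lookup() returns with the
   calling thread's cache lock held, so the caller releases it with
   SharedLazyTessellationCache::unlock() once the returned pointer is no longer needed.

     using Cache = embree::SharedLazyTessellationCache;

     void evalPatch(Cache::CacheEntry& entry, size_t globalTime)
     {
       float* grid = Cache::lookup(entry, globalTime, [&] () -> float* {
         // executed only on a cache miss, with the thread lock already held
         float* mem = (float*) Cache::malloc(16*sizeof(float));
         // ... fill 'mem' with tessellated grid data ...
         return mem;
       });
       // ... read from 'grid' while the thread lock is held ...
       Cache::unlock(); // release; 'grid' may be recycled afterwards
     }
*/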