diff options
Diffstat (limited to 'thirdparty/embree-aarch64/common/sys')
26 files changed, 4551 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp new file mode 100644 index 0000000000..12f143f131 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/alloc.cpp @@ -0,0 +1,327 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "intrinsics.h" +#include "sysinfo.h" +#include "mutex.h" + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + void* alignedMalloc(size_t size, size_t align) + { + if (size == 0) + return nullptr; + + assert((align & (align-1)) == 0); + void* ptr = _mm_malloc(size,align); + + if (size != 0 && ptr == nullptr) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return ptr; + } + + void alignedFree(void* ptr) + { + if (ptr) + _mm_free(ptr); + } + + static bool huge_pages_enabled = false; + static MutexSys os_init_mutex; + + __forceinline bool isHugePageCandidate(const size_t bytes) + { + if (!huge_pages_enabled) + return false; + + /* use huge pages only when memory overhead is low */ + const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1); + return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <malloc.h> + +namespace embree +{ + bool win_enable_selockmemoryprivilege (bool verbose) + { + HANDLE hToken; + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) { + if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + TOKEN_PRIVILEGES tp; + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) { + if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + SetLastError(ERROR_SUCCESS); + if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl; + return false; + } + + if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl; + return false; + } + + return true; + } + + bool os_init(bool hugepages, bool verbose) + { + Lock<MutexSys> lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + + if (GetLargePageMinimum() != PAGE_SIZE_2M) { + huge_pages_enabled = false; + return false; + } + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { + int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + if (ptr != nullptr) { + hugepages = true; + return ptr; + } + } + + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + // -- GODOT start -- + // if (ptr == nullptr) throw std::bad_alloc(); + if (ptr == nullptr) abort(); + // -- GODOT end -- + hugepages = false; + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + if (hugepages) // decommitting huge pages seems not to work under Windows + return bytesOld; + + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + void os_advise(void *ptr, size_t bytes) + { + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <sys/mman.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <sstream> + +#if defined(__MACOSX__) +#include <mach/vm_statistics.h> +#endif + +namespace embree +{ + bool os_init(bool hugepages, bool verbose) + { + Lock<MutexSys> lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + +#if defined(__LINUX__) + + int hugepagesize = 0; + + std::ifstream file; + file.open("/proc/meminfo",std::ios::in); + if (!file.is_open()) { + if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" << std::endl; + huge_pages_enabled = false; + return false; + } + + std::string line; + while (getline(file,line)) + { + std::stringstream sline(line); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string tag; getline(sline,tag,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string val; getline(sline,val,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string unit; getline(sline,unit,' '); + if (tag == "Hugepagesize:" && unit == "kB") { + hugepagesize = std::stoi(val)*1024; + break; + } + } + + if (hugepagesize != PAGE_SIZE_2M) + { + if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl; + huge_pages_enabled = false; + return false; + } +#endif + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { +#if defined(__MACOSX__) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#elif defined(MAP_HUGETLB) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#endif + } + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + // -- GODOT start -- + // if (ptr == MAP_FAILED) throw std::bad_alloc(); + if (ptr == MAP_FAILED) abort(); + // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ + os_advise(ptr,bytes); + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + /* for hugepages we need to also align the size */ + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ + void os_advise(void* pptr, size_t bytes) + { +#if defined(MADV_HUGEPAGE) + madvise(pptr,bytes,MADV_HUGEPAGE); +#endif + } +} + +#endif diff --git a/thirdparty/embree-aarch64/common/sys/alloc.h b/thirdparty/embree-aarch64/common/sys/alloc.h new file mode 100644 index 0000000000..5898ecda70 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/alloc.h @@ -0,0 +1,164 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include <vector> +#include <set> + +namespace embree +{ +#define ALIGNED_STRUCT_(align) \ + void* operator new(size_t size) { return alignedMalloc(size,align); } \ + void operator delete(void* ptr) { alignedFree(ptr); } \ + void* operator new[](size_t size) { return alignedMalloc(size,align); } \ + void operator delete[](void* ptr) { alignedFree(ptr); } + +#define ALIGNED_CLASS_(align) \ + public: \ + ALIGNED_STRUCT_(align) \ + private: + + /*! aligned allocation */ + void* alignedMalloc(size_t size, size_t align); + void alignedFree(void* ptr); + + /*! allocator that performs aligned allocations */ + template<typename T, size_t alignment> + struct aligned_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline pointer allocate( size_type n ) { + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return alignedFree(p); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + }; + + /*! allocates pages directly from OS */ + bool win_enable_selockmemoryprivilege(bool verbose); + bool os_init(bool hugepages, bool verbose); + void* os_malloc (size_t bytes, bool& hugepages); + size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages); + void os_free (void* ptr, size_t bytes, bool hugepages); + void os_advise (void* ptr, size_t bytes); + + /*! allocator that performs OS allocations */ + template<typename T> + struct os_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline os_allocator () + : hugepages(false) {} + + __forceinline pointer allocate( size_type n ) { + return (pointer) os_malloc(n*sizeof(value_type),hugepages); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return os_free(p,n*sizeof(value_type),hugepages); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + bool hugepages; + }; + + /*! allocator for IDs */ + template<typename T, size_t max_id> + struct IDPool + { + typedef T value_type; + + IDPool () + : nextID(0) {} + + T allocate() + { + /* return ID from list */ + if (!IDs.empty()) + { + T id = *IDs.begin(); + IDs.erase(IDs.begin()); + return id; + } + + /* allocate new ID */ + else + { + if (size_t(nextID)+1 > max_id) + return -1; + + return nextID++; + } + } + + /* adds an ID provided by the user */ + bool add(T id) + { + if (id > max_id) + return false; + + /* check if ID should be in IDs set */ + if (id < nextID) { + auto p = IDs.find(id); + if (p == IDs.end()) return false; + IDs.erase(p); + return true; + } + + /* otherwise increase ID set */ + else + { + for (T i=nextID; i<id; i++) { + IDs.insert(i); + } + nextID = id+1; + return true; + } + } + + void deallocate( T id ) + { + assert(id < nextID); + MAYBE_UNUSED auto done = IDs.insert(id).second; + assert(done); + } + + private: + std::set<T> IDs; //!< stores deallocated IDs to be reused + T nextID; //!< next ID to use when IDs vector is empty + }; +} + diff --git a/thirdparty/embree-aarch64/common/sys/array.h b/thirdparty/embree-aarch64/common/sys/array.h new file mode 100644 index 0000000000..77722a39f6 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/array.h @@ -0,0 +1,222 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "alloc.h" + +namespace embree +{ + /*! static array with static size */ + template<typename T, size_t N> + class array_t + { + public: + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+N; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return N == 0; } + __forceinline size_t size () const { return N; } + __forceinline size_t max_size () const { return N; } + + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < N); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < N); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& front() const { assert(N > 0); return items[0]; }; + __forceinline T& back () const { assert(N > 0); return items[N-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + T items[N]; + }; + + /*! static array with dynamic size */ + template<typename T, size_t N> + class darray_t + { + public: + + __forceinline darray_t () : M(0) {} + + __forceinline darray_t (const T& v) : M(0) { + for (size_t i=0; i<N; i++) items[i] = v; + } + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+M; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return M == 0; } + __forceinline size_t size () const { return M; } + __forceinline size_t capacity () const { return N; } + __forceinline size_t max_size () const { return N; } + + void resize(size_t new_size) { + assert(new_size < max_size()); + M = new_size; + } + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& v) + { + assert(M+1 < max_size()); + items[M++] = v; + } + + __forceinline void pop_back() + { + assert(!empty()); + M--; + } + + __forceinline void clear() { + M = 0; + } + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < M); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < M); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; } + + __forceinline T& front() const { assert(M > 0); return items[0]; }; + __forceinline T& back () const { assert(M > 0); return items[M-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + size_t M; + T items[N]; + }; + + /*! dynamic sized array that is allocated on the stack */ +#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N) + template<typename Ty, size_t max_stack_bytes> + struct __aligned(64) StackArray + { + __forceinline StackArray (const size_t N) + : N(N) + { + if (N*sizeof(Ty) <= max_stack_bytes) + data = &arr[0]; + else + data = (Ty*) alignedMalloc(N*sizeof(Ty),64); + } + + __forceinline ~StackArray () { + if (data != &arr[0]) alignedFree(data); + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i<N); return data[i]; } + __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; } + + __forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; } + __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; } + __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; } +#endif + + private: + Ty arr[max_stack_bytes/sizeof(Ty)]; + Ty* data; + size_t N; + + private: + StackArray (const StackArray& other) DELETED; // do not implement + StackArray& operator= (const StackArray& other) DELETED; // do not implement + + }; + + /*! dynamic sized array that is allocated on the stack */ + template<typename Ty, size_t max_stack_elements, size_t max_total_elements> + struct __aligned(64) DynamicStackArray + { + __forceinline DynamicStackArray () + : data(&arr[0]) {} + + __forceinline ~DynamicStackArray () + { + if (!isStackAllocated()) + delete[] data; + } + + __forceinline bool isStackAllocated() const { + return data == &arr[0]; + } + + __forceinline size_t size() const + { + if (isStackAllocated()) return max_stack_elements; + else return max_total_elements; + } + + __forceinline void resize(size_t M) + { + assert(M <= max_total_elements); + if (likely(M <= max_stack_elements)) return; + if (likely(!isStackAllocated())) return; + + data = new Ty[max_total_elements]; + + for (size_t i=0; i<max_stack_elements; i++) + data[i] = arr[i]; + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; } + __forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; } +#endif + + __forceinline DynamicStackArray (const DynamicStackArray& other) + : data(&arr[0]) + { + for (size_t i=0; i<other.size(); i++) + this->operator[] (i) = other[i]; + } + + DynamicStackArray& operator= (const DynamicStackArray& other) + { + for (size_t i=0; i<other.size(); i++) + this->operator[] (i) = other[i]; + + return *this; + } + + private: + Ty arr[max_stack_elements]; + Ty* data; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/atomic.h b/thirdparty/embree-aarch64/common/sys/atomic.h new file mode 100644 index 0000000000..ebfb8552c3 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/atomic.h @@ -0,0 +1,59 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <atomic> +#include "intrinsics.h" + +namespace embree +{ +/* compiler memory barriers */ +#if defined(__INTEL_COMPILER) +//#define __memory_barrier() __memory_barrier() +#elif defined(__GNUC__) || defined(__clang__) +# define __memory_barrier() asm volatile("" ::: "memory") +#elif defined(_MSC_VER) +# define __memory_barrier() _ReadWriteBarrier() +#endif + + template <typename T> + struct atomic : public std::atomic<T> + { + atomic () {} + + atomic (const T& a) + : std::atomic<T>(a) {} + + atomic (const atomic<T>& a) { + this->store(a.load()); + } + + atomic& operator=(const atomic<T>& other) { + this->store(other.load()); + return *this; + } + }; + + template<typename T> + __forceinline void atomic_min(std::atomic<T>& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a <= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } + + template<typename T> + __forceinline void atomic_max(std::atomic<T>& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a >= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } +} diff --git a/thirdparty/embree-aarch64/common/sys/barrier.cpp b/thirdparty/embree-aarch64/common/sys/barrier.cpp new file mode 100644 index 0000000000..0061d18db2 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/barrier.cpp @@ -0,0 +1,289 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "barrier.h" +#include "condition.h" +#include "regression.h" +#include "thread.h" + +#if defined (__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : i(0), enterCount(0), exitCount(0), barrierSize(0) + { + events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + init(N); + } + + __forceinline ~BarrierSysImplementation () + { + CloseHandle(events[0]); + CloseHandle(events[1]); + } + + __forceinline void init(size_t N) + { + barrierSize = N; + enterCount.store(N); + exitCount.store(N); + } + + __forceinline void wait() + { + /* every thread entering the barrier decrements this count */ + size_t i0 = i; + size_t cnt0 = enterCount--; + + /* all threads except the last one are wait in the barrier */ + if (cnt0 > 1) + { + if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0) + THROW_RUNTIME_ERROR("WaitForSingleObjects failed"); + } + + /* the last thread starts all threads waiting at the barrier */ + else + { + i = 1-i; + enterCount.store(barrierSize); + if (SetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("SetEvent failed"); + } + + /* every thread leaving the barrier decrements this count */ + size_t cnt1 = exitCount--; + + /* the last thread that left the barrier resets the event again */ + if (cnt1 == 1) + { + exitCount.store(barrierSize); + if (ResetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("ResetEvent failed"); + } + } + + public: + HANDLE events[2]; + atomic<size_t> i; + atomic<size_t> enterCount; + atomic<size_t> exitCount; + size_t barrierSize; + }; +} + +#else + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : count(0), barrierSize(0) + { + init(N); + } + + __forceinline void init(size_t N) + { + assert(count == 0); + count = 0; + barrierSize = N; + } + + __forceinline void wait() + { + mutex.lock(); + count++; + + if (count == barrierSize) { + count = 0; + cond.notify_all(); + mutex.unlock(); + return; + } + + cond.wait(mutex); + mutex.unlock(); + return; + } + + public: + MutexSys mutex; + ConditionSys cond; + volatile size_t count; + volatile size_t barrierSize; + }; +} + +#endif + +namespace embree +{ + BarrierSys::BarrierSys (size_t N) { + opaque = new BarrierSysImplementation(N); + } + + BarrierSys::~BarrierSys () { + delete (BarrierSysImplementation*) opaque; + } + + void BarrierSys::init(size_t count) { + ((BarrierSysImplementation*) opaque)->init(count); + } + + void BarrierSys::wait() { + ((BarrierSysImplementation*) opaque)->wait(); + } + + LinearBarrierActive::LinearBarrierActive (size_t N) + : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0) + { + if (N == 0) N = getNumberOfLogicalThreads(); + init(N); + } + + LinearBarrierActive::~LinearBarrierActive() + { + delete[] count0; + delete[] count1; + } + + void LinearBarrierActive::init(size_t N) + { + if (threadCount != N) { + threadCount = N; + if (count0) delete[] count0; count0 = new unsigned char[N]; + if (count1) delete[] count1; count1 = new unsigned char[N]; + } + mode = 0; + flag0 = 0; + flag1 = 0; + for (size_t i=0; i<N; i++) count0[i] = 0; + for (size_t i=0; i<N; i++) count1[i] = 0; + } + + void LinearBarrierActive::wait (const size_t threadIndex) + { + if (mode == 0) + { + if (threadIndex == 0) + { + for (size_t i=0; i<threadCount; i++) + count1[i] = 0; + + for (size_t i=1; i<threadCount; i++) + { + while (likely(count0[i] == 0)) + pause_cpu(); + } + mode = 1; + flag1 = 0; + __memory_barrier(); + flag0 = 1; + } + else + { + count0[threadIndex] = 1; + { + while (likely(flag0 == 0)) + pause_cpu(); + } + + } + } + else + { + if (threadIndex == 0) + { + for (size_t i=0; i<threadCount; i++) + count0[i] = 0; + + for (size_t i=1; i<threadCount; i++) + { + while (likely(count1[i] == 0)) + pause_cpu(); + } + + mode = 0; + flag0 = 0; + __memory_barrier(); + flag1 = 1; + } + else + { + count1[threadIndex] = 1; + { + while (likely(flag1 == 0)) + pause_cpu(); + } + } + } + } + + struct barrier_sys_regression_test : public RegressionTest + { + BarrierSys barrier; + std::atomic<size_t> threadID; + std::atomic<size_t> numFailed; + std::vector<size_t> threadResults; + + barrier_sys_regression_test() + : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0) + { + registerRegressionTest(this); + } + + static void thread_alloc(barrier_sys_regression_test* This) + { + size_t tid = This->threadID++; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + This->threadResults[tid] = tid; + This->barrier.wait(); + } + } + + bool run () + { + threadID.store(0); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + threadResults.resize(numThreads); + barrier.init(numThreads+1); + + /* create threads */ + std::vector<thread_t> threads; + for (size_t i=0; i<numThreads; i++) + threads.push_back(createThread((thread_func)thread_alloc,this)); + + /* run test */ + for (size_t i=0; i<1000; i++) + { + for (size_t i=0; i<numThreads; i++) threadResults[i] = 0; + barrier.wait(); + barrier.wait(); + for (size_t i=0; i<numThreads; i++) numFailed += threadResults[i] != i; + } + + /* destroy threads */ + for (size_t i=0; i<numThreads; i++) + join(threads[i]); + + return numFailed == 0; + } + }; + + barrier_sys_regression_test barrier_sys_regression_test; +} + + diff --git a/thirdparty/embree-aarch64/common/sys/barrier.h b/thirdparty/embree-aarch64/common/sys/barrier.h new file mode 100644 index 0000000000..89607b8685 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/barrier.h @@ -0,0 +1,112 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "intrinsics.h" +#include "sysinfo.h" +#include "atomic.h" + +namespace embree +{ + /*! system barrier using operating system */ + class BarrierSys + { + public: + + /*! construction / destruction */ + BarrierSys (size_t N = 0); + ~BarrierSys (); + + private: + /*! class in non-copyable */ + BarrierSys (const BarrierSys& other) DELETED; // do not implement + BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t count); + + /*! lets calling thread wait in barrier */ + void wait(); + + private: + void* opaque; + }; + + /*! fast active barrier using atomitc counter */ + struct BarrierActive + { + public: + BarrierActive () + : cntr(0) {} + + void reset() { + cntr.store(0); + } + + void wait (size_t numThreads) + { + cntr++; + while (cntr.load() != numThreads) + pause_cpu(); + } + + private: + std::atomic<size_t> cntr; + }; + + /*! fast active barrier that does not require initialization to some number of threads */ + struct BarrierActiveAutoReset + { + public: + BarrierActiveAutoReset () + : cntr0(0), cntr1(0) {} + + void wait (size_t threadCount) + { + cntr0.fetch_add(1); + while (cntr0 != threadCount) pause_cpu(); + cntr1.fetch_add(1); + while (cntr1 != threadCount) pause_cpu(); + cntr0.fetch_add(-1); + while (cntr0 != 0) pause_cpu(); + cntr1.fetch_add(-1); + while (cntr1 != 0) pause_cpu(); + } + + private: + std::atomic<size_t> cntr0; + std::atomic<size_t> cntr1; + }; + + class LinearBarrierActive + { + public: + + /*! construction and destruction */ + LinearBarrierActive (size_t threadCount = 0); + ~LinearBarrierActive(); + + private: + /*! class in non-copyable */ + LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement + LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t threadCount); + + /*! thread with threadIndex waits in the barrier */ + void wait (const size_t threadIndex); + + private: + volatile unsigned char* count0; + volatile unsigned char* count1; + volatile unsigned int mode; + volatile unsigned int flag0; + volatile unsigned int flag1; + volatile size_t threadCount; + }; +} + diff --git a/thirdparty/embree-aarch64/common/sys/condition.cpp b/thirdparty/embree-aarch64/common/sys/condition.cpp new file mode 100644 index 0000000000..0e7ca7af39 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/condition.cpp @@ -0,0 +1,81 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "condition.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + InitializeConditionVariable(&cond); + } + + __forceinline ~ConditionImplementation () { + } + + __forceinline void wait(MutexSys& mutex_in) { + SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE); + } + + __forceinline void notify_all() { + WakeAllConditionVariable(&cond); + } + + public: + CONDITION_VARIABLE cond; + }; +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include <pthread.h> +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + pthread_cond_init(&cond,nullptr); + } + + __forceinline ~ConditionImplementation() { + pthread_cond_destroy(&cond); + } + + __forceinline void wait(MutexSys& mutex) { + pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex); + } + + __forceinline void notify_all() { + pthread_cond_broadcast(&cond); + } + + public: + pthread_cond_t cond; + }; +} +#endif + +namespace embree +{ + ConditionSys::ConditionSys () { + cond = new ConditionImplementation; + } + + ConditionSys::~ConditionSys() { + delete (ConditionImplementation*) cond; + } + + void ConditionSys::wait(MutexSys& mutex) { + ((ConditionImplementation*) cond)->wait(mutex); + } + + void ConditionSys::notify_all() { + ((ConditionImplementation*) cond)->notify_all(); + } +} diff --git a/thirdparty/embree-aarch64/common/sys/condition.h b/thirdparty/embree-aarch64/common/sys/condition.h new file mode 100644 index 0000000000..7a3a05aa81 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/condition.h @@ -0,0 +1,31 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "mutex.h" + +namespace embree +{ + class ConditionSys + { + public: + ConditionSys(); + ~ConditionSys(); + void wait( class MutexSys& mutex ); + void notify_all(); + + template<typename Predicate> + __forceinline void wait( class MutexSys& mutex, const Predicate& pred ) + { + while (!pred()) wait(mutex); + } + + private: + ConditionSys (const ConditionSys& other) DELETED; // do not implement + ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement + + protected: + void* cond; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/filename.cpp b/thirdparty/embree-aarch64/common/sys/filename.cpp new file mode 100644 index 0000000000..86182c1afb --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/filename.cpp @@ -0,0 +1,138 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "filename.h" +#include "sysinfo.h" + +namespace embree +{ +#ifdef __WIN32__ + const char path_sep = '\\'; +#else + const char path_sep = '/'; +#endif + + /*! create an empty filename */ + FileName::FileName () {} + + /*! create a valid filename from a string */ + FileName::FileName (const char* in) { + filename = in; + for (size_t i=0; i<filename.size(); i++) + if (filename[i] == '\\' || filename[i] == '/') + filename[i] = path_sep; + while (!filename.empty() && filename[filename.size()-1] == path_sep) + filename.resize(filename.size()-1); + } + + /*! create a valid filename from a string */ + FileName::FileName (const std::string& in) { + filename = in; + for (size_t i=0; i<filename.size(); i++) + if (filename[i] == '\\' || filename[i] == '/') + filename[i] = path_sep; + while (!filename.empty() && filename[filename.size()-1] == path_sep) + filename.resize(filename.size()-1); + } + + /*! returns path to home folder */ + FileName FileName::homeFolder() + { +#ifdef __WIN32__ + const char* home = getenv("UserProfile"); +#else + const char* home = getenv("HOME"); +#endif + if (home) return home; + return ""; + } + + /*! returns path to executable */ + FileName FileName::executableFolder() { + return FileName(getExecutableFileName()).path(); + } + + /*! returns the path */ + FileName FileName::path() const { + size_t pos = filename.find_last_of(path_sep); + if (pos == std::string::npos) return FileName(); + return filename.substr(0,pos); + } + + /*! returns the basename */ + std::string FileName::base() const { + size_t pos = filename.find_last_of(path_sep); + if (pos == std::string::npos) return filename; + return filename.substr(pos+1); + } + + /*! returns the extension */ + std::string FileName::ext() const { + size_t pos = filename.find_last_of('.'); + if (pos == std::string::npos) return ""; + return filename.substr(pos+1); + } + + /*! returns the extension */ + FileName FileName::dropExt() const { + size_t pos = filename.find_last_of('.'); + if (pos == std::string::npos) return filename; + return filename.substr(0,pos); + } + + /*! returns the basename without extension */ + std::string FileName::name() const { + size_t start = filename.find_last_of(path_sep); + if (start == std::string::npos) start = 0; else start++; + size_t end = filename.find_last_of('.'); + if (end == std::string::npos || end < start) end = filename.size(); + return filename.substr(start, end - start); + } + + /*! replaces the extension */ + FileName FileName::setExt(const std::string& ext) const { + size_t start = filename.find_last_of(path_sep); + if (start == std::string::npos) start = 0; else start++; + size_t end = filename.find_last_of('.'); + if (end == std::string::npos || end < start) return FileName(filename+ext); + return FileName(filename.substr(0,end)+ext); + } + + /*! adds the extension */ + FileName FileName::addExt(const std::string& ext) const { + return FileName(filename+ext); + } + + /*! concatenates two filenames to this/other */ + FileName FileName::operator +( const FileName& other ) const { + if (filename == "") return FileName(other); + else return FileName(filename + path_sep + other.filename); + } + + /*! concatenates two filenames to this/other */ + FileName FileName::operator +( const std::string& other ) const { + return operator+(FileName(other)); + } + + /*! removes the base from a filename (if possible) */ + FileName FileName::operator -( const FileName& base ) const { + size_t pos = filename.find_first_of(base); + if (pos == std::string::npos) return *this; + return FileName(filename.substr(pos+1)); + } + + /*! == operator */ + bool operator== (const FileName& a, const FileName& b) { + return a.filename == b.filename; + } + + /*! != operator */ + bool operator!= (const FileName& a, const FileName& b) { + return a.filename != b.filename; + } + + /*! output operator */ + std::ostream& operator<<(std::ostream& cout, const FileName& filename) { + return cout << filename.filename; + } +} diff --git a/thirdparty/embree-aarch64/common/sys/filename.h b/thirdparty/embree-aarch64/common/sys/filename.h new file mode 100644 index 0000000000..58f881b14d --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/filename.h @@ -0,0 +1,81 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +namespace embree +{ + /*! Convenience class for handling file names and paths. */ + class FileName + { + public: + + /*! create an empty filename */ + FileName (); + + /*! create a valid filename from a string */ + FileName (const char* filename); + + /*! create a valid filename from a string */ + FileName (const std::string& filename); + + /*! returns path to home folder */ + static FileName homeFolder(); + + /*! returns path to executable */ + static FileName executableFolder(); + + /*! auto convert into a string */ + operator std::string() const { return filename; } + + /*! returns a string of the filename */ + const std::string str() const { return filename; } + + /*! returns a c-string of the filename */ + const char* c_str() const { return filename.c_str(); } + + /*! returns the path of a filename */ + FileName path() const; + + /*! returns the file of a filename */ + std::string base() const; + + /*! returns the base of a filename without extension */ + std::string name() const; + + /*! returns the file extension */ + std::string ext() const; + + /*! drops the file extension */ + FileName dropExt() const; + + /*! replaces the file extension */ + FileName setExt(const std::string& ext = "") const; + + /*! adds file extension */ + FileName addExt(const std::string& ext = "") const; + + /*! concatenates two filenames to this/other */ + FileName operator +( const FileName& other ) const; + + /*! concatenates two filenames to this/other */ + FileName operator +( const std::string& other ) const; + + /*! removes the base from a filename (if possible) */ + FileName operator -( const FileName& base ) const; + + /*! == operator */ + friend bool operator==(const FileName& a, const FileName& b); + + /*! != operator */ + friend bool operator!=(const FileName& a, const FileName& b); + + /*! output operator */ + friend embree_ostream operator<<(embree_ostream cout, const FileName& filename); + + private: + std::string filename; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/intrinsics.h b/thirdparty/embree-aarch64/common/sys/intrinsics.h new file mode 100644 index 0000000000..44cdbd8f0f --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/intrinsics.h @@ -0,0 +1,559 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +#if defined(__WIN32__) +#include <intrin.h> +#endif + +#if defined(__ARM_NEON) +#include "../math/SSE2NEON.h" +#if defined(NEON_AVX2_EMULATION) +#include "../math/AVX2NEON.h" +#endif +#else +#include <immintrin.h> +#endif + +#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) + #if !defined(_tzcnt_u32) + #define _tzcnt_u32 __tzcnt_u32 + #endif + #if !defined(_tzcnt_u64) + #define _tzcnt_u64 __tzcnt_u64 + #endif +#endif + +#if defined(__aarch64__) +#if !defined(_lzcnt_u32) + #define _lzcnt_u32 __builtin_clz +#endif +#if !defined(_lzcnt_u32) + #define _lzcnt_u32 __builtin_clzll +#endif +#else +#if defined(__LZCNT__) + #if !defined(_lzcnt_u32) + #define _lzcnt_u32 __lzcnt32 + #endif + #if !defined(_lzcnt_u64) + #define _lzcnt_u64 __lzcnt64 + #endif +#endif +#endif + +#if defined(__WIN32__) +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <windows.h> +#endif + +/* normally defined in pmmintrin.h, but we always need this */ +#if !defined(_MM_SET_DENORMALS_ZERO_MODE) +#define _MM_DENORMALS_ZERO_ON (0x0040) +#define _MM_DENORMALS_ZERO_OFF (0x0000) +#define _MM_DENORMALS_ZERO_MASK (0x0040) +#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) +#endif + +namespace embree +{ + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + + __forceinline size_t read_tsc() + { + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + return (size_t)li.QuadPart; + } + + __forceinline int bsf(int v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif + } + + __forceinline unsigned bsf(unsigned v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) + return _tzcnt_u64(v); +#else + unsigned long r = 0; _BitScanForward64(&r,v); return r; +#endif + } +#endif + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + + __forceinline unsigned bscf(unsigned& v) + { + unsigned i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__X86_64__) + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline int bsr(int v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) + return 63 -_lzcnt_u64(v); +#else + unsigned long r = 0; _BitScanReverse64(&r, v); return r; +#endif + } +#endif + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) && !defined(__aarch64__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline int btc(int v, int i) { + long r = v; _bittestandcomplement(&r,i); return r; + } + + __forceinline int bts(int v, int i) { + long r = v; _bittestandset(&r,i); return r; + } + + __forceinline int btr(int v, int i) { + long r = v; _bittestandreset(&r,i); return r; + } + +#if defined(__X86_64__) + + __forceinline size_t btc(size_t v, size_t i) { + size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; + } + + __forceinline size_t bts(size_t v, size_t i) { + __int64 r = v; _bittestandset64(&r,i); return r; + } + + __forceinline size_t btr(size_t v, size_t i) { + __int64 r = v; _bittestandreset64(&r,i); return r; + } + +#endif + + __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) { + return _InterlockedCompareExchange((volatile long*)p,v,c); + } + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#else + +#if defined(__i386__) && defined(__PIC__) + + __forceinline void __cpuid(int out[4], int op) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "0"(op)); + } + + __forceinline void __cpuid_count(int out[4], int op1, int op2) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) + : "0" (op1), "2" (op2)); + } + +#else + + __forceinline void __cpuid(int out[4], int op) { +#if defined(__ARM_NEON) + if (op == 0) { // Get CPU name + out[0] = 0x41524d20; + out[1] = 0x41524d20; + out[2] = 0x41524d20; + out[3] = 0x41524d20; + } +#else + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); +#endif + } + +#if !defined(__ARM_NEON) + __forceinline void __cpuid_count(int out[4], int op1, int op2) { + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); + } +#endif + +#endif + + __forceinline uint64_t read_tsc() { +#if defined(__ARM_NEON) + return 0; // FIXME(LTE): mimic rdtsc +#else + uint32_t high,low; + asm volatile ("rdtsc" : "=d"(high), "=a"(low)); + return (((uint64_t)high) << 32) + (uint64_t)low; +#endif + } + + __forceinline int bsf(int v) { +#if defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif +#endif + } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned bsf(unsigned v) + { +#if defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif +#endif + } +#endif + + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__X86_64__) + return _tzcnt_u64(v); +#else + return _tzcnt_u32(v); +#endif +#elif defined(__ARM_NEON) + return __builtin_ctzl(v); +#else + size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned int bscf(unsigned int& v) + { + unsigned int i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } + + __forceinline int bsr(int v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return 31 - _lzcnt_u32(v); +#elif defined(__ARM_NEON) + return __builtin_clz(v)^31; +#else + int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#elif defined(__ARM_NEON) + return __builtin_clz(v)^31; +#else + unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } +#endif + + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__X86_64__) + return 63 - _lzcnt_u64(v); +#else + return 31 - _lzcnt_u32(v); +#endif +#elif defined(__aarch64__) + return (sizeof(v) * 8 - 1) - __builtin_clzl(v); +#else + size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) && !defined(__aarch64__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline size_t blsr(size_t v) { +#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__INTEL_COMPILER) + return _blsr_u64(v); +#else +#if defined(__X86_64__) + return __blsr_u64(v); +#else + return __blsr_u32(v); +#endif +#endif +#else + return v & (v-1); +#endif + } + + __forceinline int btc(int v, int i) { +#if defined(__aarch64__) + // _bittestandcomplement(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a ^ (1 << b); + // return x; + + // We only need `*a` + return (v ^ (1 << i)); +#else + int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#endif + } + + __forceinline int bts(int v, int i) { +#if defined(__aarch64__) + // _bittestandset(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a | (1 << b); + // return x; + return (v | (v << i)); +#else + int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline int btr(int v, int i) { +#if defined(__aarch64__) + // _bittestandreset(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a & ~(1 << b); + // return x; + return (v & ~(v << i)); +#else + int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline size_t btc(size_t v, size_t i) { +#if defined(__aarch64__) + return (v ^ (1 << i)); +#else + size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#endif + } + + __forceinline size_t bts(size_t v, size_t i) { +#if defined(__aarch64__) + return (v | (v << i)); +#else + size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline size_t btr(size_t v, size_t i) { +#if defined(__ARM_NEON) + return (v & ~(v << i)); +#else + size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { + return __sync_val_compare_and_swap(value, comparand, input); + } + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(_mm_undefined_ps) + __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } +#endif +#if !defined(_mm_undefined_si128) + __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); } +#endif +#if !defined(_mm256_undefined_ps) && defined(__AVX__) + __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); } +#endif +#if !defined(_mm256_undefined_si256) && defined(__AVX__) + __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); } +#endif +#if !defined(_mm512_undefined_ps) && defined(__AVX512F__) + __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); } +#endif +#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__) + __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); } +#endif +#endif + +#if defined(__SSE4_2__) || defined(__ARM_NEON) + + __forceinline int popcnt(int in) { + return _mm_popcnt_u32(in); + } + + __forceinline unsigned popcnt(unsigned in) { + return _mm_popcnt_u32(in); + } + +#if defined(__X86_64__) || defined(__ARM_NEON) + __forceinline size_t popcnt(size_t in) { + return _mm_popcnt_u64(in); + } +#endif + +#endif + + __forceinline uint64_t rdtsc() + { + int dummy[4]; + __cpuid(dummy,0); + uint64_t clock = read_tsc(); + __cpuid(dummy,0); + return clock; + } + + __forceinline void pause_cpu(const size_t N = 8) + { + for (size_t i=0; i<N; i++) + _mm_pause(); + } + + /* prefetches */ + __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); } + __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); } + __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); } + __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); } + __forceinline void prefetchEX (const void* ptr) { +#if defined(__INTEL_COMPILER) + _mm_prefetch((const char*)ptr,_MM_HINT_ET0); +#else + _mm_prefetch((const char*)ptr,_MM_HINT_T0); +#endif + } + + __forceinline void prefetchL1EX(const void* ptr) { + prefetchEX(ptr); + } + + __forceinline void prefetchL2EX(const void* ptr) { + prefetchEX(ptr); + } +#if defined(__AVX2__) && !defined(__aarch64__) + __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); } + __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); } +#if defined(__X86_64__) + __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); } + __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); } +#endif +#endif + +#if defined(__AVX512F__) +#if defined(__INTEL_COMPILER) + __forceinline float mm512_cvtss_f32(__m512 v) { + return _mm512_cvtss_f32(v); + } + __forceinline int mm512_mask2int(__mmask16 k1) { + return _mm512_mask2int(k1); + } + __forceinline __mmask16 mm512_int2mask(int mask) { + return _mm512_int2mask(mask); + } +#else + __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3 + return _mm_cvtss_f32(_mm512_castps512_ps128(v)); + } + __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3 + return (int)k1; + } + __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3 + return (__mmask16)mask; + } +#endif +#endif +} diff --git a/thirdparty/embree-aarch64/common/sys/library.cpp b/thirdparty/embree-aarch64/common/sys/library.cpp new file mode 100644 index 0000000000..899267a1e4 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/library.cpp @@ -0,0 +1,83 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "library.h" +#include "sysinfo.h" +#include "filename.h" + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { + std::string fullName = file+".dll"; + FileName executable = getExecutableFileName(); + HANDLE handle = LoadLibrary((executable.path() + fullName).c_str()); + return lib_t(handle); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return reinterpret_cast<void *>(GetProcAddress(HMODULE(lib),sym.c_str())); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + FreeLibrary(HMODULE(lib)); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <dlfcn.h> + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { +#if defined(__MACOSX__) + std::string fullName = "lib"+file+".dylib"; +#else + std::string fullName = "lib"+file+".so"; +#endif + void* lib = dlopen(fullName.c_str(), RTLD_NOW); + if (lib) return lib_t(lib); + FileName executable = getExecutableFileName(); + lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); + if (lib == nullptr) { + const char* error = dlerror(); + if (error) { + THROW_RUNTIME_ERROR(error); + } else { + THROW_RUNTIME_ERROR("could not load library "+executable.str()); + } + } + return lib_t(lib); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return dlsym(lib,sym.c_str()); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + dlclose(lib); + } +} +#endif diff --git a/thirdparty/embree-aarch64/common/sys/library.h b/thirdparty/embree-aarch64/common/sys/library.h new file mode 100644 index 0000000000..c2164e9fbe --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/library.h @@ -0,0 +1,21 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +namespace embree +{ + /*! type for shared library */ + typedef struct opaque_lib_t* lib_t; + + /*! loads a shared library */ + lib_t openLibrary(const std::string& file); + + /*! returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym); + + /*! unloads a shared library */ + void closeLibrary(lib_t lib); +} diff --git a/thirdparty/embree-aarch64/common/sys/mutex.cpp b/thirdparty/embree-aarch64/common/sys/mutex.cpp new file mode 100644 index 0000000000..11779bc9b9 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/mutex.cpp @@ -0,0 +1,58 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "mutex.h" +#include "regression.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); } + MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; } + void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); } + bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; } + void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); } +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include <pthread.h> +namespace embree +{ + /*! system mutex using pthreads */ + MutexSys::MutexSys() + { + mutex = new pthread_mutex_t; + if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_init failed"); + } + + MutexSys::~MutexSys() + { + MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; + assert(ok); + delete (pthread_mutex_t*)mutex; + mutex = nullptr; + } + + void MutexSys::lock() + { + if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_lock failed"); + } + + bool MutexSys::try_lock() { + return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0; + } + + void MutexSys::unlock() + { + if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_unlock failed"); + } +}; +#endif diff --git a/thirdparty/embree-aarch64/common/sys/mutex.h b/thirdparty/embree-aarch64/common/sys/mutex.h new file mode 100644 index 0000000000..1164210f23 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/mutex.h @@ -0,0 +1,98 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "intrinsics.h" +#include "atomic.h" + +namespace embree +{ + /*! system mutex */ + class MutexSys { + friend struct ConditionImplementation; + public: + MutexSys(); + ~MutexSys(); + + private: + MutexSys (const MutexSys& other) DELETED; // do not implement + MutexSys& operator= (const MutexSys& other) DELETED; // do not implement + + public: + void lock(); + bool try_lock(); + void unlock(); + + protected: + void* mutex; + }; + + /*! spinning mutex */ + class SpinLock + { + public: + + SpinLock () + : flag(false) {} + + __forceinline bool isLocked() { + return flag.load(); + } + + __forceinline void lock() + { + while (true) + { + while (flag.load()) + { + _mm_pause(); + _mm_pause(); + } + + bool expected = false; + if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire)) + break; + } + } + + __forceinline bool try_lock() + { + bool expected = false; + if (flag.load() != expected) { + return false; + } + return flag.compare_exchange_strong(expected,true,std::memory_order_acquire); + } + + __forceinline void unlock() { + flag.store(false,std::memory_order_release); + } + + __forceinline void wait_until_unlocked() + { + while(flag.load()) + { + _mm_pause(); + _mm_pause(); + } + } + + public: + atomic<bool> flag; + }; + + /*! safe mutex lock and unlock helper */ + template<typename Mutex> class Lock { + public: + Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); } + Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {} + ~Lock() { if (locked) mutex.unlock(); } + __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); } + __forceinline bool isLocked() const { return locked; } + protected: + Mutex& mutex; + bool locked; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h new file mode 100644 index 0000000000..737f14aa6e --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/platform.h @@ -0,0 +1,387 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define _CRT_SECURE_NO_WARNINGS + +#include <cstddef> +#include <cassert> +#include <cstdlib> +#include <cstdio> +#include <memory> +#include <stdexcept> +#include <iostream> +#include <iomanip> +#include <fstream> +#include <string> +#include <cstring> +#include <stdint.h> +#include <functional> + +//////////////////////////////////////////////////////////////////////////////// +/// detect platform +//////////////////////////////////////////////////////////////////////////////// + +/* detect 32 or 64 platform */ +#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +#define __X86_64__ +#endif + +/* detect Linux platform */ +#if defined(linux) || defined(__linux__) || defined(__LINUX__) +# if !defined(__LINUX__) +# define __LINUX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect FreeBSD platform */ +#if defined(__FreeBSD__) || defined(__FREEBSD__) +# if !defined(__FREEBSD__) +# define __FREEBSD__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */ +#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__) +# if !defined(__WIN32__) +# define __WIN32__ +# endif +#endif + +/* detect Cygwin platform */ +#if defined(__CYGWIN__) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect MAC OS X platform */ +#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__) +# if !defined(__MACOSX__) +# define __MACOSX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* try to detect other Unix systems */ +#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Macros +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __WIN32__ +#define dll_export __declspec(dllexport) +#define dll_import __declspec(dllimport) +#else +#define dll_export __attribute__ ((visibility ("default"))) +#define dll_import +#endif + +#ifdef __WIN32__ +#if !defined(__noinline) +#define __noinline __declspec(noinline) +#endif +//#define __forceinline __forceinline +//#define __restrict __restrict +#if defined(__INTEL_COMPILER) +#define __restrict__ __restrict +#else +#define __restrict__ //__restrict // causes issues with MSVC +#endif +#if !defined(__thread) +// NOTE: Require `-fms-extensions` for clang +#define __thread __declspec(thread) +#endif +#if !defined(__aligned) +#if defined(__MINGW32__) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#else +#define __aligned(...) __declspec(align(__VA_ARGS__)) +#endif +#endif +//#define __FUNCTION__ __FUNCTION__ +#define debugbreak() __debugbreak() + +#else +#if !defined(__noinline) +#define __noinline __attribute__((noinline)) +#endif +#if !defined(__forceinline) +#define __forceinline inline __attribute__((always_inline)) +#endif +//#define __restrict __restrict +//#define __thread __thread +#if !defined(__aligned) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#endif +#if !defined(__FUNCTION__) +#define __FUNCTION__ __PRETTY_FUNCTION__ +#endif +#define debugbreak() asm ("int $3") +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define MAYBE_UNUSED __attribute__((unused)) +#else + #define MAYBE_UNUSED +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly + #define DELETED +#else + #define DELETED = delete +#endif + +// -- GODOT start -- +#ifndef likely +// -- GODOT end -- +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#define likely(expr) (expr) +#define unlikely(expr) (expr) +#else +#define likely(expr) __builtin_expect((bool)(expr),true ) +#define unlikely(expr) __builtin_expect((bool)(expr),false) +#endif +// -- GODOT start -- +#endif +// -- GODOT end -- + +//////////////////////////////////////////////////////////////////////////////// +/// Error handling and debugging +//////////////////////////////////////////////////////////////////////////////// + +/* debug printing macros */ +#define STRING(x) #x +#define TOSTRING(x) STRING(x) +#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl +#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl +#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl +#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl +#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define THROW_RUNTIME_ERROR(str) \ + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- +#else + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ + abort(); + // -- GODOT end -- +#endif + +#define FATAL(x) THROW_RUNTIME_ERROR(x) +#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; } + +#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") + +//////////////////////////////////////////////////////////////////////////////// +/// Basic types +//////////////////////////////////////////////////////////////////////////////// + +/* default floating-point type */ +namespace embree { + typedef float real; +} + +/* windows does not have ssize_t */ +#if defined(__WIN32__) +#if defined(__X86_64__) || defined(__aarch64__) +typedef int64_t ssize_t; +#else +typedef int32_t ssize_t; +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Basic utility functions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline std::string toString(long long value) { + return std::to_string(value); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Disable some compiler warnings +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__INTEL_COMPILER) +//#pragma warning(disable:265 ) // floating-point operation result is out of range +//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used +//#pragma warning(disable:869 ) // parameter was never referenced +//#pragma warning(disable:981 ) // operands are evaluated in unspecified order +//#pragma warning(disable:1418) // external function definition with no prior declaration +//#pragma warning(disable:1419) // external declaration in primary source file +//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable +//#pragma warning(disable:94 ) // the size of an array must be greater than zero +//#pragma warning(disable:1599) // declaration hides parameter +//#pragma warning(disable:424 ) // extra ";" ignored +#pragma warning(disable:2196) // routine is both "inline" and "noinline" +//#pragma warning(disable:177 ) // label was declared but never referenced +//#pragma warning(disable:114 ) // function was referenced but not defined +//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function +#pragma warning(disable:15335) // was not vectorized: vectorization possible but seems inefficient +#endif + +#if defined(_MSC_VER) +//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union +#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning) +//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data +#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data +//#pragma warning(disable:4355) // 'this' : used in base member initializer list +//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch +//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch +//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float' +//#pragma warning(disable:4068) // unknown pragma +//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned +//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion) +//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored +#pragma warning(disable:4503) // decorated name length exceeded, name was truncated +#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored +#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used + +# if _MSC_VER < 1910 // prior to Visual studio 2017 (V141) +# pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings +# pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0 +# endif + +#endif + +#if defined(__clang__) && !defined(__INTEL_COMPILER) +//#pragma clang diagnostic ignored "-Wunknown-pragmas" +//#pragma clang diagnostic ignored "-Wunused-variable" +//#pragma clang diagnostic ignored "-Wreorder" +//#pragma clang diagnostic ignored "-Wmicrosoft" +//#pragma clang diagnostic ignored "-Wunused-private-field" +//#pragma clang diagnostic ignored "-Wunused-local-typedef" +//#pragma clang diagnostic ignored "-Wunused-function" +//#pragma clang diagnostic ignored "-Wnarrowing" +//#pragma clang diagnostic ignored "-Wc++11-narrowing" +//#pragma clang diagnostic ignored "-Wdeprecated-register" +//#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Wpragmas" +//#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +//#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +//#pragma GCC diagnostic ignored "-Warray-bounds" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wparentheses" +#endif + +#if defined(__clang__) && defined(__WIN32__) +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wmicrosoft-cast" +#pragma clang diagnostic ignored "-Wmicrosoft-enum-value" +#pragma clang diagnostic ignored "-Wmicrosoft-include" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunknown-pragmas" +#endif + +/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */ +#if defined(__WIN32__) && defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable: 1478)) // warning: function was declared deprecated +#elif defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("warning (enable : 1478)") // warning: function was declared deprecated +#elif defined(__clang__) +#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(__GNUC__) +#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(_MSC_VER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated +#endif + +/* embree output stream */ +#define embree_ostream std::ostream& +#define embree_cout std::cout +#define embree_cout_uniform std::cout +#define embree_endl std::endl + +//////////////////////////////////////////////////////////////////////////////// +/// Some macros for static profiling +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__GNUC__) +#define IACA_SSC_MARK( MARK_ID ) \ +__asm__ __volatile__ ( \ + "\n\t movl $"#MARK_ID", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : : : "memory" ); + +#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B"); + +#else +#define IACA_UD_BYTES {__asm _emit 0x0F \ + __asm _emit 0x0B} + +#define IACA_SSC_MARK(x) {__asm mov ebx, x\ + __asm _emit 0x64 \ + __asm _emit 0x67 \ + __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); + +#endif + +#define IACA_START {IACA_UD_BYTES \ + IACA_SSC_MARK(111)} +#define IACA_END {IACA_SSC_MARK(222) \ + IACA_UD_BYTES} + +namespace embree +{ + template<typename Closure> + struct OnScopeExitHelper + { + OnScopeExitHelper (const Closure f) : active(true), f(f) {} + ~OnScopeExitHelper() { if (active) f(); } + void deactivate() { active = false; } + bool active; + const Closure f; + }; + + template <typename Closure> + OnScopeExitHelper<Closure> OnScopeExit(const Closure f) { + return OnScopeExitHelper<Closure>(f); + } + +#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) +#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 +#define ON_SCOPE_EXIT(code) \ + auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;}) + + template<typename Ty> + std::unique_ptr<Ty> make_unique(Ty* ptr) { + return std::unique_ptr<Ty>(ptr); + } + +} diff --git a/thirdparty/embree-aarch64/common/sys/ref.h b/thirdparty/embree-aarch64/common/sys/ref.h new file mode 100644 index 0000000000..24648e6234 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/ref.h @@ -0,0 +1,122 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "atomic.h" + +namespace embree +{ + struct NullTy { + }; + + extern MAYBE_UNUSED NullTy null; + + class RefCount + { + public: + RefCount(int val = 0) : refCounter(val) {} + virtual ~RefCount() {}; + + virtual RefCount* refInc() { refCounter.fetch_add(1); return this; } + virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; } + private: + std::atomic<size_t> refCounter; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Reference to single object + //////////////////////////////////////////////////////////////////////////////// + + template<typename Type> + class Ref + { + public: + Type* ptr; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Ref() : ptr(nullptr) {} + __forceinline Ref(NullTy) : ptr(nullptr) {} + __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } + __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; } + + __forceinline Ref(Type* const input) : ptr(input) + { + if (ptr) + ptr->refInc(); + } + + __forceinline ~Ref() + { + if (ptr) + ptr->refDec(); + } + + __forceinline Ref& operator =(const Ref& input) + { + if (input.ptr) + input.ptr->refInc(); + if (ptr) + ptr->refDec(); + ptr = input.ptr; + return *this; + } + + __forceinline Ref& operator =(Ref&& input) + { + if (ptr) + ptr->refDec(); + ptr = input.ptr; + input.ptr = nullptr; + return *this; + } + + __forceinline Ref& operator =(Type* const input) + { + if (input) + input->refInc(); + if (ptr) + ptr->refDec(); + ptr = input; + return *this; + } + + __forceinline Ref& operator =(NullTy) + { + if (ptr) + ptr->refDec(); + ptr = nullptr; + return *this; + } + + __forceinline operator bool() const { return ptr != nullptr; } + + __forceinline const Type& operator *() const { return *ptr; } + __forceinline Type& operator *() { return *ptr; } + __forceinline const Type* operator ->() const { return ptr; } + __forceinline Type* operator ->() { return ptr; } + + template<typename TypeOut> + __forceinline Ref<TypeOut> cast() { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } + template<typename TypeOut> + __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } + + template<typename TypeOut> + __forceinline Ref<TypeOut> dynamicCast() { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } + template<typename TypeOut> + __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } + }; + + template<typename Type> __forceinline bool operator < (const Ref<Type>& a, const Ref<Type>& b) { return a.ptr < b.ptr; } + + template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, NullTy ) { return a.ptr == nullptr; } + template<typename Type> __forceinline bool operator ==(NullTy , const Ref<Type>& b) { return nullptr == b.ptr; } + template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr == b.ptr; } + + template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, NullTy ) { return a.ptr != nullptr; } + template<typename Type> __forceinline bool operator !=(NullTy , const Ref<Type>& b) { return nullptr != b.ptr; } + template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr != b.ptr; } +} diff --git a/thirdparty/embree-aarch64/common/sys/regression.cpp b/thirdparty/embree-aarch64/common/sys/regression.cpp new file mode 100644 index 0000000000..d95ff8dfe0 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/regression.cpp @@ -0,0 +1,30 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "regression.h" + +namespace embree +{ + /* registerRegressionTest is invoked from static initializers, thus + * we cannot have the regression_tests variable as global static + * variable due to issues with static variable initialization + * order. */ + std::vector<RegressionTest*>& get_regression_tests() + { + static std::vector<RegressionTest*> regression_tests; + return regression_tests; + } + + void registerRegressionTest(RegressionTest* test) + { + get_regression_tests().push_back(test); + } + + RegressionTest* getRegressionTest(size_t index) + { + if (index >= get_regression_tests().size()) + return nullptr; + + return get_regression_tests()[index]; + } +} diff --git a/thirdparty/embree-aarch64/common/sys/regression.h b/thirdparty/embree-aarch64/common/sys/regression.h new file mode 100644 index 0000000000..632f8d92cf --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/regression.h @@ -0,0 +1,25 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +#include <vector> + +namespace embree +{ + /*! virtual interface for all regression tests */ + struct RegressionTest + { + RegressionTest (std::string name) : name(name) {} + virtual bool run() = 0; + std::string name; + }; + + /*! registers a regression test */ + void registerRegressionTest(RegressionTest* test); + + /*! run all regression tests */ + RegressionTest* getRegressionTest(size_t index); +} diff --git a/thirdparty/embree-aarch64/common/sys/string.cpp b/thirdparty/embree-aarch64/common/sys/string.cpp new file mode 100644 index 0000000000..931244383e --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/string.cpp @@ -0,0 +1,42 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "string.h" + +#include <algorithm> +#include <ctype.h> + +namespace embree +{ + char to_lower(char c) { return char(tolower(int(c))); } + char to_upper(char c) { return char(toupper(int(c))); } + std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } + std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } + + Vec2f string_to_Vec2f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); + return Vec2f(x,y); + } + + Vec3f string_to_Vec3f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); + return Vec3f(x,y,z); + } + + Vec4f string_to_Vec4f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); str = str.substr(next+1); + const float w = std::stof(str,&next); + return Vec4f(x,y,z,w); + } +} diff --git a/thirdparty/embree-aarch64/common/sys/string.h b/thirdparty/embree-aarch64/common/sys/string.h new file mode 100644 index 0000000000..2e9b0f88c3 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/string.h @@ -0,0 +1,37 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/vec4.h" + +namespace embree +{ + class IOStreamStateRestorer + { + public: + IOStreamStateRestorer(std::ostream& iostream) + : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) { + } + + ~IOStreamStateRestorer() { + iostream.flags(flags); + iostream.precision(precision); + } + + private: + std::ostream& iostream; + std::ios::fmtflags flags; + std::streamsize precision; + }; + + std::string toLowerCase(const std::string& s); + std::string toUpperCase(const std::string& s); + + Vec2f string_to_Vec2f ( std::string str ); + Vec3f string_to_Vec3f ( std::string str ); + Vec4f string_to_Vec4f ( std::string str ); +} diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.cpp b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp new file mode 100644 index 0000000000..1d11436770 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp @@ -0,0 +1,676 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sysinfo.h" +#include "intrinsics.h" +#include "string.h" +#include "ref.h" +#if defined(__FREEBSD__) +#include <sys/cpuset.h> +#include <pthread_np.h> +typedef cpuset_t cpu_set_t; +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + NullTy null; + + std::string getPlatformName() + { +#if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON) + return "Android Linux (aarch64 / arm64)"; +#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__) + return "Android Linux (x64)"; +#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86)) + return "Android Linux (x86)"; +#elif defined(__LINUX__) && !defined(__X86_64__) + return "Linux (32bit)"; +#elif defined(__LINUX__) && defined(__X86_64__) + return "Linux (64bit)"; +#elif defined(__FREEBSD__) && !defined(__X86_64__) + return "FreeBSD (32bit)"; +#elif defined(__FREEBSD__) && defined(__X86_64__) + return "FreeBSD (64bit)"; +#elif defined(__CYGWIN__) && !defined(__X86_64__) + return "Cygwin (32bit)"; +#elif defined(__CYGWIN__) && defined(__X86_64__) + return "Cygwin (64bit)"; +#elif defined(__WIN32__) && !defined(__X86_64__) + return "Windows (32bit)"; +#elif defined(__WIN32__) && defined(__X86_64__) + return "Windows (64bit)"; +#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__) + return "iOS Simulator (x64)"; +#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON) + return "iOS (aarch64 / arm64)"; +#elif defined(__MACOSX__) && !defined(__X86_64__) + return "Mac OS X (32bit)"; +#elif defined(__MACOSX__) && defined(__X86_64__) + return "Mac OS X (64bit)"; +#elif defined(__UNIX__) && defined(__aarch64__) + return "Unix (aarch64)"; +#elif defined(__UNIX__) && !defined(__X86_64__) + return "Unix (32bit)"; +#elif defined(__UNIX__) && defined(__X86_64__) + return "Unix (64bit)"; +#else + return "Unknown"; +#endif + } + + std::string getCompilerName() + { +#if defined(__INTEL_COMPILER) + int icc_mayor = __INTEL_COMPILER / 100 % 100; + int icc_minor = __INTEL_COMPILER % 100; + std::string version = "Intel Compiler "; + version += toString(icc_mayor); + version += "." + toString(icc_minor); +#if defined(__INTEL_COMPILER_UPDATE) + version += "." + toString(__INTEL_COMPILER_UPDATE); +#endif + return version; +#elif defined(__clang__) + return "CLANG " __clang_version__; +#elif defined (__GNUC__) + return "GCC " __VERSION__; +#elif defined(_MSC_VER) + std::string version = toString(_MSC_FULL_VER); + version.insert(4,"."); + version.insert(9,"."); + version.insert(2,"."); + return "Visual C++ Compiler " + version; +#else + return "Unknown Compiler"; +#endif + } + + std::string getCPUVendor() + { + int cpuinfo[4]; + __cpuid (cpuinfo, 0); + int name[4]; + name[0] = cpuinfo[1]; + name[1] = cpuinfo[3]; + name[2] = cpuinfo[2]; + name[3] = 0; + return (char*)name; + } + + CPU getCPUModel() + { + if (getCPUVendor() != "GenuineIntel") + return CPU::UNKNOWN; + + int out[4]; + __cpuid(out, 0); + if (out[0] < 1) return CPU::UNKNOWN; + __cpuid(out, 1); + + /* please see CPUID documentation for these formulas */ + uint32_t family_ID = (out[0] >> 8) & 0x0F; + uint32_t extended_family_ID = (out[0] >> 20) & 0xFF; + + uint32_t model_ID = (out[0] >> 4) & 0x0F; + uint32_t extended_model_ID = (out[0] >> 16) & 0x0F; + + uint32_t DisplayFamily = family_ID; + if (family_ID == 0x0F) + DisplayFamily += extended_family_ID; + + uint32_t DisplayModel = model_ID; + if (family_ID == 0x06 || family_ID == 0x0F) + DisplayModel += extended_model_ID << 4; + + uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0); + + // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel) + if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE; + if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE; + if (DisplayFamily_DisplayModel == 0x068E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL; + if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1; + + if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL; + if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING; + + return CPU::UNKNOWN; + } + + std::string stringOfCPUModel(CPU model) + { + switch (model) { + case CPU::XEON_ICE_LAKE : return "Xeon Ice Lake"; + case CPU::CORE_ICE_LAKE : return "Core Ice Lake"; + case CPU::CORE_TIGER_LAKE : return "Core Tiger Lake"; + case CPU::CORE_COMET_LAKE : return "Core Comet Lake"; + case CPU::CORE_CANNON_LAKE : return "Core Cannon Lake"; + case CPU::CORE_KABY_LAKE : return "Core Kaby Lake"; + case CPU::XEON_SKY_LAKE : return "Xeon Sky Lake"; + case CPU::CORE_SKY_LAKE : return "Core Sky Lake"; + case CPU::XEON_PHI_KNIGHTS_MILL : return "Xeon Phi Knights Mill"; + case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing"; + case CPU::XEON_BROADWELL : return "Xeon Broadwell"; + case CPU::CORE_BROADWELL : return "Core Broadwell"; + case CPU::XEON_HASWELL : return "Xeon Haswell"; + case CPU::CORE_HASWELL : return "Core Haswell"; + case CPU::XEON_IVY_BRIDGE : return "Xeon Ivy Bridge"; + case CPU::CORE_IVY_BRIDGE : return "Core Ivy Bridge"; + case CPU::SANDY_BRIDGE : return "Sandy Bridge"; + case CPU::NEHALEM : return "Nehalem"; + case CPU::CORE2 : return "Core2"; + case CPU::CORE1 : return "Core"; + case CPU::ARM : return "Arm"; + case CPU::UNKNOWN : return "Unknown CPU"; + } + return "Unknown CPU (error)"; + } + +#if !defined(__ARM_NEON) + /* constants to access destination registers of CPUID instruction */ + static const int EAX = 0; + static const int EBX = 1; + static const int ECX = 2; + static const int EDX = 3; + + /* cpuid[eax=1].ecx */ + static const int CPU_FEATURE_BIT_SSE3 = 1 << 0; + static const int CPU_FEATURE_BIT_SSSE3 = 1 << 9; + static const int CPU_FEATURE_BIT_FMA3 = 1 << 12; + static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19; + static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20; + //static const int CPU_FEATURE_BIT_MOVBE = 1 << 22; + static const int CPU_FEATURE_BIT_POPCNT = 1 << 23; + //static const int CPU_FEATURE_BIT_XSAVE = 1 << 26; + static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27; + static const int CPU_FEATURE_BIT_AVX = 1 << 28; + static const int CPU_FEATURE_BIT_F16C = 1 << 29; + static const int CPU_FEATURE_BIT_RDRAND = 1 << 30; + + /* cpuid[eax=1].edx */ + static const int CPU_FEATURE_BIT_SSE = 1 << 25; + static const int CPU_FEATURE_BIT_SSE2 = 1 << 26; + + /* cpuid[eax=0x80000001].ecx */ + static const int CPU_FEATURE_BIT_LZCNT = 1 << 5; + + /* cpuid[eax=7,ecx=0].ebx */ + static const int CPU_FEATURE_BIT_BMI1 = 1 << 3; + static const int CPU_FEATURE_BIT_AVX2 = 1 << 5; + static const int CPU_FEATURE_BIT_BMI2 = 1 << 8; + static const int CPU_FEATURE_BIT_AVX512F = 1 << 16; // AVX512F (foundation) + static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17; // AVX512DQ (doubleword and quadword instructions) + static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26; // AVX512PF (prefetch gather/scatter instructions) + static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27; // AVX512ER (exponential and reciprocal instructions) + static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28; // AVX512CD (conflict detection instructions) + static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) + static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) + static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) + + /* cpuid[eax=7,ecx=0].ecx */ + static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) +#endif + +#if !defined(__ARM_NEON) + __noinline int64_t get_xcr0() + { + // https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466 +#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK) + int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 + xcr0 = _xgetbv(0); + return xcr0; +#else + int xcr0 = 0; + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); + return xcr0; +#endif + } +#endif + + int getCPUFeatures() + { +#if defined(__ARM_NEON) + int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2; +#if defined(NEON_AVX2_EMULATION) + cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42; + cpu_features |= CPU_FEATURE_XMM_ENABLED; + cpu_features |= CPU_FEATURE_YMM_ENABLED; + cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C; + cpu_features |= CPU_FEATURE_POPCNT; + cpu_features |= CPU_FEATURE_AVX; + cpu_features |= CPU_FEATURE_AVX2; + cpu_features |= CPU_FEATURE_FMA3; + cpu_features |= CPU_FEATURE_LZCNT; + cpu_features |= CPU_FEATURE_BMI1; + cpu_features |= CPU_FEATURE_BMI2; + cpu_features |= CPU_FEATURE_NEON_2X; + + + +#endif + return cpu_features; + +#else + /* cache CPU features access */ + static int cpu_features = 0; + if (cpu_features) + return cpu_features; + + /* get number of CPUID leaves */ + int cpuid_leaf0[4]; + __cpuid(cpuid_leaf0, 0x00000000); + unsigned nIds = cpuid_leaf0[EAX]; + + /* get number of extended CPUID leaves */ + int cpuid_leafe[4]; + __cpuid(cpuid_leafe, 0x80000000); + unsigned nExIds = cpuid_leafe[EAX]; + + /* get CPUID leaves for EAX = 1,7, and 0x80000001 */ + int cpuid_leaf_1[4] = { 0,0,0,0 }; + int cpuid_leaf_7[4] = { 0,0,0,0 }; + int cpuid_leaf_e1[4] = { 0,0,0,0 }; + if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001); +#if _WIN32 +#if _MSC_VER && (_MSC_FULL_VER < 160040219) +#else + if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0); +#endif +#else + if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0); +#endif + if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001); + + /* detect if OS saves XMM, YMM, and ZMM states */ + bool xmm_enabled = true; + bool ymm_enabled = false; + bool zmm_enabled = false; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) { + int64_t xcr0 = get_xcr0(); + xmm_enabled = ((xcr0 & 0x02) == 0x02); /* checks if xmm are enabled in XCR0 */ + ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */ + zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */ + } + if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; + if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; + if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; + + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; + + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3 ) cpu_features |= CPU_FEATURE_FMA3; + if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2; + + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL ) cpu_features |= CPU_FEATURE_AVX512VL; + if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; + + return cpu_features; +#endif + } + + std::string stringOfCPUFeatures(int features) + { + std::string str; + if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM "; + if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM "; + if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM "; + if (features & CPU_FEATURE_SSE ) str += "SSE "; + if (features & CPU_FEATURE_SSE2 ) str += "SSE2 "; + if (features & CPU_FEATURE_SSE3 ) str += "SSE3 "; + if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 "; + if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 "; + if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 "; + if (features & CPU_FEATURE_POPCNT) str += "POPCNT "; + if (features & CPU_FEATURE_AVX ) str += "AVX "; + if (features & CPU_FEATURE_F16C ) str += "F16C "; + if (features & CPU_FEATURE_RDRAND) str += "RDRAND "; + if (features & CPU_FEATURE_AVX2 ) str += "AVX2 "; + if (features & CPU_FEATURE_FMA3 ) str += "FMA3 "; + if (features & CPU_FEATURE_LZCNT ) str += "LZCNT "; + if (features & CPU_FEATURE_BMI1 ) str += "BMI1 "; + if (features & CPU_FEATURE_BMI2 ) str += "BMI2 "; + if (features & CPU_FEATURE_AVX512F) str += "AVX512F "; + if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ "; + if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF "; + if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER "; + if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD "; + if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW "; + if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; + if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; + if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; + if (features & CPU_FEATURE_NEON) str += "NEON "; + if (features & CPU_FEATURE_NEON_2X) str += "2xNEON "; + return str; + } + + std::string stringOfISA (int isa) + { + if (isa == SSE) return "SSE"; + if (isa == SSE2) return "SSE2"; + if (isa == SSE3) return "SSE3"; + if (isa == SSSE3) return "SSSE3"; + if (isa == SSE41) return "SSE4.1"; + if (isa == SSE42) return "SSE4.2"; + if (isa == AVX) return "AVX"; + if (isa == AVX2) return "AVX2"; + if (isa == AVX512KNL) return "AVX512KNL"; + if (isa == AVX512SKX) return "AVX512SKX"; + if (isa == NEON) return "NEON"; + if (isa == NEON_2X) return "2xNEON"; + return "UNKNOWN"; + } + + bool hasISA(int features, int isa) { + return (features & isa) == isa; + } + + std::string supportedTargetList (int features) + { + std::string v; + if (hasISA(features,SSE)) v += "SSE "; + if (hasISA(features,SSE2)) v += "SSE2 "; + if (hasISA(features,SSE3)) v += "SSE3 "; + if (hasISA(features,SSSE3)) v += "SSSE3 "; + if (hasISA(features,SSE41)) v += "SSE4.1 "; + if (hasISA(features,SSE42)) v += "SSE4.2 "; + if (hasISA(features,AVX)) v += "AVX "; + if (hasISA(features,AVXI)) v += "AVXI "; + if (hasISA(features,AVX2)) v += "AVX2 "; + if (hasISA(features,AVX512KNL)) v += "AVX512KNL "; + if (hasISA(features,AVX512SKX)) v += "AVX512SKX "; + if (hasISA(features,NEON)) v += "NEON "; + if (hasISA(features,NEON_2X)) v += "2xNEON "; + return v; + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <psapi.h> + +namespace embree +{ + std::string getExecutableFileName() { + char filename[1024]; + if (!GetModuleFileName(nullptr, filename, sizeof(filename))) + return std::string(); + return std::string(filename); + } + + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); + + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0; + for (int i = 0; i < groups; i++) + totalProcessors += pGetActiveProcessorCount(i); + nThreads = totalProcessors; + } + else + { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + nThreads = sysinfo.dwNumberOfProcessors; + } + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); + if (handle == INVALID_HANDLE_VALUE) return 80; + CONSOLE_SCREEN_BUFFER_INFO info; + memset(&info,0,sizeof(info)); + GetConsoleScreenBufferInfo(handle, &info); + return info.dwSize.X; + } + + double getSeconds() + { + LARGE_INTEGER freq, val; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&val); + return (double)val.QuadPart / (double)freq.QuadPart; + } + + void sleepSeconds(double t) { + Sleep(DWORD(1000.0*t)); + } + + size_t getVirtualMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.QuotaPeakPagedPoolUsage; + } + + size_t getResidentMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.WorkingSetSize; + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__LINUX__) + +#include <stdio.h> +#include <unistd.h> + +namespace embree +{ + std::string getExecutableFileName() + { + std::string pid = "/proc/" + toString(getpid()) + "/exe"; + char buf[4096]; + memset(buf,0,sizeof(buf)); + if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return virt*sysconf(_SC_PAGE_SIZE); + } + + size_t getResidentMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return resident*sysconf(_SC_PAGE_SIZE); + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__FreeBSD__) + +#include <sys/sysctl.h> + +namespace embree +{ + std::string getExecutableFileName() + { + const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 }; + char buf[4096]; + memset(buf,0,sizeof(buf)); + size_t len = sizeof(buf)-1; + if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Mac OS X Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include <mach-o/dyld.h> + +namespace embree +{ + std::string getExecutableFileName() + { + char buf[4096]; + uint32_t size = sizeof(buf); + if (_NSGetExecutablePath(buf, &size) != 0) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <pthread.h> + +namespace embree +{ + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + +#if defined(__MACOSX__) || defined(__ANDROID__) + nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container + assert(nThreads); +#else + cpu_set_t set; + if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) + nThreads = CPU_COUNT(&set); +#endif + + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + struct winsize info; + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; + return info.ws_col; + } + + double getSeconds() { + struct timeval tp; gettimeofday(&tp,nullptr); + return double(tp.tv_sec) + double(tp.tv_usec)/1E6; + } + + void sleepSeconds(double t) { + usleep(1000000.0*t); + } +} +#endif + diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.h b/thirdparty/embree-aarch64/common/sys/sysinfo.h new file mode 100644 index 0000000000..8e313a59b3 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/sysinfo.h @@ -0,0 +1,192 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define CACHELINE_SIZE 64 + +#if !defined(PAGE_SIZE) + #define PAGE_SIZE 4096 +#endif + +#define PAGE_SIZE_2M (2*1024*1024) +#define PAGE_SIZE_4K (4*1024) + +#include "platform.h" + +/* define isa namespace and ISA bitvector */ +#if defined (__AVX512VL__) +# define isa avx512skx +# define ISA AVX512SKX +# define ISA_STR "AVX512SKX" +#elif defined (__AVX512F__) +# define isa avx512knl +# define ISA AVX512KNL +# define ISA_STR "AVX512KNL" +#elif defined (__AVX2__) +# define isa avx2 +# define ISA AVX2 +# define ISA_STR "AVX2" +#elif defined(__AVXI__) +# define isa avxi +# define ISA AVXI +# define ISA_STR "AVXI" +#elif defined(__AVX__) +# define isa avx +# define ISA AVX +# define ISA_STR "AVX" +#elif defined (__SSE4_2__) +# define isa sse42 +# define ISA SSE42 +# define ISA_STR "SSE4.2" +//#elif defined (__SSE4_1__) // we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11 +//# define isa sse41 +//# define ISA SSE41 +//# define ISA_STR "SSE4.1" +//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC +//# define isa ssse3 +//# define ISA SSSE3 +//# define ISA_STR "SSSE3" +//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang +//# define isa sse3 +//# define ISA SSE3 +//# define ISA_STR "SSE3" +#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) +# define isa sse2 +# define ISA SSE2 +# define ISA_STR "SSE2" +#elif defined(__SSE__) +# define isa sse +# define ISA SSE +# define ISA_STR "SSE" +#elif defined(__ARM_NEON) +// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment. +#define isa sse2 +#define ISA NEON +#define ISA_STR "NEON" +#else +#error Unknown ISA +#endif + +namespace embree +{ + enum class CPU + { + XEON_ICE_LAKE, + CORE_ICE_LAKE, + CORE_TIGER_LAKE, + CORE_COMET_LAKE, + CORE_CANNON_LAKE, + CORE_KABY_LAKE, + XEON_SKY_LAKE, + CORE_SKY_LAKE, + XEON_PHI_KNIGHTS_MILL, + XEON_PHI_KNIGHTS_LANDING, + XEON_BROADWELL, + CORE_BROADWELL, + XEON_HASWELL, + CORE_HASWELL, + XEON_IVY_BRIDGE, + CORE_IVY_BRIDGE, + SANDY_BRIDGE, + NEHALEM, + CORE2, + CORE1, + ARM, + UNKNOWN, + }; + + /*! get the full path to the running executable */ + std::string getExecutableFileName(); + + /*! return platform name */ + std::string getPlatformName(); + + /*! get the full name of the compiler */ + std::string getCompilerName(); + + /*! return the name of the CPU */ + std::string getCPUVendor(); + + /*! get microprocessor model */ + CPU getCPUModel(); + + /*! converts CPU model into string */ + std::string stringOfCPUModel(CPU model); + + /*! CPU features */ + static const int CPU_FEATURE_SSE = 1 << 0; + static const int CPU_FEATURE_SSE2 = 1 << 1; + static const int CPU_FEATURE_SSE3 = 1 << 2; + static const int CPU_FEATURE_SSSE3 = 1 << 3; + static const int CPU_FEATURE_SSE41 = 1 << 4; + static const int CPU_FEATURE_SSE42 = 1 << 5; + static const int CPU_FEATURE_POPCNT = 1 << 6; + static const int CPU_FEATURE_AVX = 1 << 7; + static const int CPU_FEATURE_F16C = 1 << 8; + static const int CPU_FEATURE_RDRAND = 1 << 9; + static const int CPU_FEATURE_AVX2 = 1 << 10; + static const int CPU_FEATURE_FMA3 = 1 << 11; + static const int CPU_FEATURE_LZCNT = 1 << 12; + static const int CPU_FEATURE_BMI1 = 1 << 13; + static const int CPU_FEATURE_BMI2 = 1 << 14; + static const int CPU_FEATURE_AVX512F = 1 << 16; + static const int CPU_FEATURE_AVX512DQ = 1 << 17; + static const int CPU_FEATURE_AVX512PF = 1 << 18; + static const int CPU_FEATURE_AVX512ER = 1 << 19; + static const int CPU_FEATURE_AVX512CD = 1 << 20; + static const int CPU_FEATURE_AVX512BW = 1 << 21; + static const int CPU_FEATURE_AVX512VL = 1 << 22; + static const int CPU_FEATURE_AVX512IFMA = 1 << 23; + static const int CPU_FEATURE_AVX512VBMI = 1 << 24; + static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; + static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; + static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; + static const int CPU_FEATURE_NEON = 1 << 28; + static const int CPU_FEATURE_NEON_2X = 1 << 29; + + /*! get CPU features */ + int getCPUFeatures(); + + /*! convert CPU features into a string */ + std::string stringOfCPUFeatures(int features); + + /*! creates a string of all supported targets that are supported */ + std::string supportedTargetList (int isa); + + /*! ISAs */ + static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; + static const int SSE2 = SSE | CPU_FEATURE_SSE2; + static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; + static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; + static const int SSE41 = SSSE3 | CPU_FEATURE_SSE41; + static const int SSE42 = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT; + static const int AVX = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED; + static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND; + static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; + static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED; + static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; + static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2; + static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2; + + /*! converts ISA bitvector into a string */ + std::string stringOfISA(int features); + + /*! return the number of logical threads of the system */ + unsigned int getNumberOfLogicalThreads(); + + /*! returns the size of the terminal window in characters */ + int getTerminalWidth(); + + /*! returns performance counter in seconds */ + double getSeconds(); + + /*! sleeps the specified number of seconds */ + void sleepSeconds(double t); + + /*! returns virtual address space occupied by process */ + size_t getVirtualMemoryBytes(); + + /*! returns resident memory required by process */ + size_t getResidentMemoryBytes(); +} diff --git a/thirdparty/embree-aarch64/common/sys/thread.cpp b/thirdparty/embree-aarch64/common/sys/thread.cpp new file mode 100644 index 0000000000..f9ea5b7d96 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/thread.cpp @@ -0,0 +1,429 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "thread.h" +#include "sysinfo.h" +#include "string.h" + +#include <iostream> +#if defined(__ARM_NEON) +#include "../math/SSE2NEON.h" +#else +#include <xmmintrin.h> +#endif + +#if defined(PTHREADS_WIN32) +#pragma comment (lib, "pthreadVC.lib") +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + /*! set the affinity of a given thread */ + void setAffinity(HANDLE thread, ssize_t affinity) + { + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY); + typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); + SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); + SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx"); + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0, group = 0, number = 0; + for (int i = 0; i<groups; i++) { + int processors = pGetActiveProcessorCount(i); + if (totalProcessors + processors > affinity) { + group = i; + number = (int)affinity - totalProcessors; + break; + } + totalProcessors += processors; + } + + GROUP_AFFINITY groupAffinity; + groupAffinity.Group = (WORD)group; + groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); + groupAffinity.Reserved[0] = 0; + groupAffinity.Reserved[1] = 0; + groupAffinity.Reserved[2] = 0; + if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) + WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning + + PROCESSOR_NUMBER processorNumber; + processorNumber.Group = group; + processorNumber.Number = number; + processorNumber.Reserved = 0; + if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) + WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning + } + else + { + if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) + WARNING("SetThreadAffinityMask failed"); // on purpose only a warning + if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1) + WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning + } + } + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) { + setAffinity(GetCurrentThread(), affinity); + } + + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg) + : f(f), arg(arg) {} + public: + thread_func f; + void* arg; + }; + + DWORD WINAPI threadStartup(LPVOID ptr) + { + ThreadStartupData* parg = (ThreadStartupData*) ptr; + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + parg->f(parg->arg); + delete parg; + parg = nullptr; + return 0; + } + +#if !defined(PTHREADS_WIN32) + + /*! creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr); + if (thread == nullptr) FATAL("CreateThread failed"); + if (threadID >= 0) setAffinity(thread, threadID); + return thread_t(thread); + } + + /*! the thread calling this function gets yielded */ + void yield() { + SwitchToThread(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + WaitForSingleObject(HANDLE(tid), INFINITE); + CloseHandle(HANDLE(tid)); + } + + /*! creates thread local storage */ + tls_t createTls() { + return tls_t(size_t(TlsAlloc())); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) { + TlsSetValue(DWORD(size_t(tls)), ptr); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) { + return TlsGetValue(DWORD(size_t(tls))); + } + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls) { + TlsFree(DWORD(size_t(tls))); + } +#endif +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__LINUX__) + +#include <fstream> +#include <sstream> +#include <algorithm> + +#if defined(__ANDROID__) +#include <pthread.h> +#endif + +namespace embree +{ + static MutexSys mutex; + static std::vector<size_t> threadIDs; + +#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target + /* changes thread ID mapping such that we first fill up all thread on one core */ + size_t mapThreadID(size_t threadID) + { + Lock<MutexSys> lock(mutex); + + if (threadIDs.size() == 0) + { + /* parse thread/CPU topology */ + for (size_t cpuID=0;;cpuID++) + { + std::fstream fs; + std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list"); + fs.open (cpu.c_str(), std::fstream::in); + if (fs.fail()) break; + + int i; + while (fs >> i) + { + if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) + threadIDs.push_back(i); + if (fs.peek() == ',') + fs.ignore(); + } + fs.close(); + } + +#if 0 + for (size_t i=0;i<threadIDs.size();i++) + std::cout << i << " -> " << threadIDs[i] << std::endl; +#endif + + /* verify the mapping and do not use it if the mapping has errors */ + for (size_t i=0;i<threadIDs.size();i++) { + for (size_t j=0;j<threadIDs.size();j++) { + if (i != j && threadIDs[i] == threadIDs[j]) { + threadIDs.clear(); + } + } + } + } + + /* re-map threadIDs if mapping is available */ + size_t ID = threadID; + if (threadID < threadIDs.size()) + ID = threadIDs[threadID]; + + /* find correct thread to affinitize to */ + cpu_set_t set; + if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) + { + for (int i=0, j=0; i<CPU_SETSIZE; i++) + { + if (!CPU_ISSET(i,&set)) continue; + + if (j == ID) { + ID = i; + break; + } + j++; + } + } + + return ID; + } +#endif + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { +#if defined(__ANDROID__) + // TODO(LTE): Implement +#else + cpu_set_t cset; + CPU_ZERO(&cset); + size_t threadID = mapThreadID(affinity); + CPU_SET(threadID, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); +#endif + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__FreeBSD__) + +#include <pthread_np.h> + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(affinity, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// MacOSX Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include <mach/thread_act.h> +#include <mach/thread_policy.h> +#include <mach/mach_init.h> + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + thread_affinity_policy ap; + ap.affinity_tag = affinity; + if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS) + WARNING("setting thread affinity failed"); // on purpose only a warning + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) + +#include <pthread.h> +#include <sched.h> + +#if defined(__USE_NUMA__) +#include <numa.h> +#endif + +namespace embree +{ + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg, int affinity) + : f(f), arg(arg), affinity(affinity) {} + public: + thread_func f; + void* arg; + ssize_t affinity; + }; + + static void* threadStartup(ThreadStartupData* parg) + { + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + + /*! Mac OS X does not support setting affinity at thread creation time */ +#if defined(__MACOSX__) + if (parg->affinity >= 0) + setAffinity(parg->affinity); +#endif + + parg->f(parg->arg); + delete parg; + parg = nullptr; + return nullptr; + } + + /*! creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + /* set stack size */ + pthread_attr_t attr; + pthread_attr_init(&attr); + if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size); + + /* create thread */ + pthread_t* tid = new pthread_t; + if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { + pthread_attr_destroy(&attr); + delete tid; + FATAL("pthread_create failed"); + } + pthread_attr_destroy(&attr); + + /* set affinity */ +#if defined(__LINUX__) && !defined(__ANDROID__) + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); + threadID = mapThreadID(threadID); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +#elif defined(__FreeBSD__) + if (threadID >= 0) { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +#endif + + return thread_t(tid); + } + + /*! the thread calling this function gets yielded */ + void yield() { + sched_yield(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + if (pthread_join(*(pthread_t*)tid, nullptr) != 0) + FATAL("pthread_join failed"); + delete (pthread_t*)tid; + } + + /*! creates thread local storage */ + tls_t createTls() + { + pthread_key_t* key = new pthread_key_t; + if (pthread_key_create(key,nullptr) != 0) { + delete key; + FATAL("pthread_key_create failed"); + } + + return tls_t(key); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) + { + assert(tls); + return pthread_getspecific(*(pthread_key_t*)tls); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) + { + assert(tls); + if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) + FATAL("pthread_setspecific failed"); + } + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls) + { + assert(tls); + if (pthread_key_delete(*(pthread_key_t*)tls) != 0) + FATAL("pthread_key_delete failed"); + delete (pthread_key_t*)tls; + } +} + +#endif diff --git a/thirdparty/embree-aarch64/common/sys/thread.h b/thirdparty/embree-aarch64/common/sys/thread.h new file mode 100644 index 0000000000..45da6e6a70 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/thread.h @@ -0,0 +1,46 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "mutex.h" +#include "alloc.h" +#include "vector.h" +#include <vector> + +namespace embree +{ + /*! type for thread */ + typedef struct opaque_thread_t* thread_t; + + /*! signature of thread start function */ + typedef void (*thread_func)(void*); + + /*! creates a hardware thread running on specific logical thread */ + thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1); + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity); + + /*! the thread calling this function gets yielded */ + void yield(); + + /*! waits until the given thread has terminated */ + void join(thread_t tid); + + /*! type for handle to thread local storage */ + typedef struct opaque_tls_t* tls_t; + + /*! creates thread local storage */ + tls_t createTls(); + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr); + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls); + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls); +} diff --git a/thirdparty/embree-aarch64/common/sys/vector.h b/thirdparty/embree-aarch64/common/sys/vector.h new file mode 100644 index 0000000000..e41794de7c --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/vector.h @@ -0,0 +1,242 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "alloc.h" +#include <algorithm> + +namespace embree +{ + template<typename T, typename allocator> + class vector_t + { + public: + typedef T value_type; + typedef T* iterator; + typedef const T* const_iterator; + + __forceinline vector_t () + : size_active(0), size_alloced(0), items(nullptr) {} + + __forceinline explicit vector_t (size_t sz) + : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + template<typename M> + __forceinline explicit vector_t (M alloc, size_t sz) + : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + __forceinline ~vector_t() { + clear(); + } + + __forceinline vector_t (const vector_t& other) + { + size_active = other.size_active; + size_alloced = other.size_alloced; + items = alloc.allocate(size_alloced); + for (size_t i=0; i<size_active; i++) + ::new (&items[i]) value_type(other.items[i]); + } + + __forceinline vector_t (vector_t&& other) + : alloc(std::move(other.alloc)) + { + size_active = other.size_active; other.size_active = 0; + size_alloced = other.size_alloced; other.size_alloced = 0; + items = other.items; other.items = nullptr; + } + + __forceinline vector_t& operator=(const vector_t& other) + { + resize(other.size_active); + for (size_t i=0; i<size_active; i++) + items[i] = value_type(other.items[i]); + return *this; + } + + __forceinline vector_t& operator=(vector_t&& other) + { + clear(); + alloc = std::move(other.alloc); + size_active = other.size_active; other.size_active = 0; + size_alloced = other.size_alloced; other.size_alloced = 0; + items = other.items; other.items = nullptr; + return *this; + } + + /********************** Iterators ****************************/ + + __forceinline iterator begin() { return items; }; + __forceinline const_iterator begin() const { return items; }; + + __forceinline iterator end () { return items+size_active; }; + __forceinline const_iterator end () const { return items+size_active; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return size_active == 0; } + __forceinline size_t size () const { return size_active; } + __forceinline size_t capacity () const { return size_alloced; } + + + __forceinline void resize(size_t new_size) { + internal_resize(new_size,internal_grow_size(new_size)); + } + + __forceinline void reserve(size_t new_alloced) + { + /* do nothing if container already large enough */ + if (new_alloced <= size_alloced) + return; + + /* resize exact otherwise */ + internal_resize(size_active,new_alloced); + } + + __forceinline void shrink_to_fit() { + internal_resize(size_active,size_active); + } + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < size_active); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < size_active); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < size_active); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < size_active); return items[i]; } + + __forceinline T& front() const { assert(size_active > 0); return items[0]; }; + __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& nt) + { + const T v = nt; // need local copy as input reference could point to this vector + internal_resize(size_active,internal_grow_size(size_active+1)); + ::new (&items[size_active++]) T(v); + } + + __forceinline void pop_back() + { + assert(!empty()); + size_active--; + alloc.destroy(&items[size_active]); + } + + __forceinline void clear() + { + /* destroy elements */ + for (size_t i=0; i<size_active; i++) + alloc.destroy(&items[i]); + + /* free memory */ + alloc.deallocate(items,size_alloced); + items = nullptr; + size_active = size_alloced = 0; + } + + /******************** Comparisons **************************/ + + friend bool operator== (const vector_t& a, const vector_t& b) + { + if (a.size() != b.size()) return false; + for (size_t i=0; i<a.size(); i++) + if (a[i] != b[i]) + return false; + return true; + } + + friend bool operator!= (const vector_t& a, const vector_t& b) { + return !(a==b); + } + + private: + + __forceinline void internal_resize_init(size_t new_active) + { + assert(size_active == 0); + assert(size_alloced == 0); + assert(items == nullptr); + if (new_active == 0) return; + items = alloc.allocate(new_active); + for (size_t i=0; i<new_active; i++) ::new (&items[i]) T(); + size_active = new_active; + size_alloced = new_active; + } + + __forceinline void internal_resize(size_t new_active, size_t new_alloced) + { + assert(new_active <= new_alloced); + + /* destroy elements */ + if (new_active < size_active) + { + for (size_t i=new_active; i<size_active; i++) + alloc.destroy(&items[i]); + size_active = new_active; + } + + /* only reallocate if necessary */ + if (new_alloced == size_alloced) { + for (size_t i=size_active; i<new_active; i++) ::new (&items[i]) T; + size_active = new_active; + return; + } + + /* reallocate and copy items */ + T* old_items = items; + items = alloc.allocate(new_alloced); + for (size_t i=0; i<size_active; i++) { + ::new (&items[i]) T(std::move(old_items[i])); + alloc.destroy(&old_items[i]); + } + + for (size_t i=size_active; i<new_active; i++) { + ::new (&items[i]) T; + } + + alloc.deallocate(old_items,size_alloced); + size_active = new_active; + size_alloced = new_alloced; + } + + __forceinline size_t internal_grow_size(size_t new_alloced) + { + /* do nothing if container already large enough */ + if (new_alloced <= size_alloced) + return size_alloced; + + /* resize to next power of 2 otherwise */ + size_t new_size_alloced = size_alloced; + while (new_size_alloced < new_alloced) { + new_size_alloced = std::max(size_t(1),2*new_size_alloced); + } + return new_size_alloced; + } + + private: + allocator alloc; + size_t size_active; // number of valid items + size_t size_alloced; // number of items allocated + T* items; // data array + }; + + /*! vector class that performs standard allocations */ + template<typename T> + using vector = vector_t<T,std::allocator<T>>; + + /*! vector class that performs aligned allocations */ + template<typename T> + using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >; + + /*! vector class that performs OS allocations */ + template<typename T> + using ovector = vector_t<T,os_allocator<T> >; +} |