Diffstat (limited to 'core/templates')
-rw-r--r--  core/templates/bin_sorted_array.h    | 181
-rw-r--r--  core/templates/command_queue_mt.cpp  |  29
-rw-r--r--  core/templates/command_queue_mt.h    | 162
-rw-r--r--  core/templates/cowdata.h             |  53
-rw-r--r--  core/templates/hash_map.h            |   6
-rw-r--r--  core/templates/hashfuncs.h           |   4
-rw-r--r--  core/templates/list.h                |  79
-rw-r--r--  core/templates/local_vector.h        |  23
-rw-r--r--  core/templates/map.h                 | 137
-rw-r--r--  core/templates/oa_hash_map.h         |   5
-rw-r--r--  core/templates/paged_allocator.h     |  13
-rw-r--r--  core/templates/pair.h                |  40
-rw-r--r--  core/templates/pooled_list.h         |  97
-rw-r--r--  core/templates/rid.h                 |  27
-rw-r--r--  core/templates/rid_owner.cpp         |   2
-rw-r--r--  core/templates/rid_owner.h           | 136
-rw-r--r--  core/templates/safe_list.h           | 375
-rw-r--r--  core/templates/safe_refcount.cpp     | 161
-rw-r--r--  core/templates/safe_refcount.h       | 353
-rw-r--r--  core/templates/search_array.h        |  67
-rw-r--r--  core/templates/set.h                 |  84
-rw-r--r--  core/templates/thread_work_pool.cpp  |  16
-rw-r--r--  core/templates/thread_work_pool.h    |  18
-rw-r--r--  core/templates/vector.h              |  88
24 files changed, 1612 insertions, 544 deletions
diff --git a/core/templates/bin_sorted_array.h b/core/templates/bin_sorted_array.h new file mode 100644 index 0000000000..be9d0b5475 --- /dev/null +++ b/core/templates/bin_sorted_array.h @@ -0,0 +1,181 @@ +/*************************************************************************/ +/* bin_sorted_array.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#ifndef BIN_SORTED_ARRAY_H +#define BIN_SORTED_ARRAY_H + +#include "core/templates/local_vector.h" +#include "core/templates/paged_array.h" + +template <class T> +class BinSortedArray { + PagedArray<T> array; + LocalVector<uint64_t> bin_limits; + + // Implement if elements need to keep track of their own index in the array. 
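+	// A subclass whose elements cache their own index could override it like
+	// this (sketch with a hypothetical element type, not part of this file):
+	//
+	//   struct IndexedItem { uint64_t index = 0; };
+	//   class IndexedBinArray : public BinSortedArray<IndexedItem> {
+	//       virtual void _update_idx(IndexedItem &r_element, uint64_t p_idx) override {
+	//           r_element.index = p_idx;
+	//       }
+	//   };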
+ _FORCE_INLINE_ virtual void _update_idx(T &r_element, uint64_t p_idx) {} + + _FORCE_INLINE_ void _swap(uint64_t p_a, uint64_t p_b) { + SWAP(array[p_a], array[p_b]); + _update_idx(array[p_a], p_a); + _update_idx(array[p_b], p_b); + } + +public: + uint64_t insert(T &p_element, uint64_t p_bin) { + array.push_back(p_element); + uint64_t new_idx = array.size() - 1; + _update_idx(p_element, new_idx); + bin_limits[0] = new_idx; + if (p_bin != 0) { + new_idx = move(new_idx, p_bin); + } + return new_idx; + } + + uint64_t move(uint64_t p_idx, uint64_t p_bin) { + ERR_FAIL_COND_V(p_idx >= array.size(), -1); + + uint64_t current_bin = bin_limits.size() - 1; + while (p_idx > bin_limits[current_bin]) { + current_bin--; + } + + if (p_bin == current_bin) { + return p_idx; + } + + uint64_t current_idx = p_idx; + if (p_bin > current_bin) { + while (p_bin > current_bin) { + uint64_t swap_idx = 0; + + if (current_bin == bin_limits.size() - 1) { + bin_limits.push_back(0); + } else { + bin_limits[current_bin + 1]++; + swap_idx = bin_limits[current_bin + 1]; + } + + if (current_idx != swap_idx) { + _swap(current_idx, swap_idx); + current_idx = swap_idx; + } + + current_bin++; + } + } else { + while (p_bin < current_bin) { + uint64_t swap_idx = bin_limits[current_bin]; + + if (current_idx != swap_idx) { + _swap(current_idx, swap_idx); + } + + if (current_bin == bin_limits.size() - 1 && bin_limits[current_bin] == 0) { + bin_limits.resize(bin_limits.size() - 1); + } else { + bin_limits[current_bin]--; + } + current_idx = swap_idx; + current_bin--; + } + } + + return current_idx; + } + + void remove(uint64_t p_idx) { + ERR_FAIL_COND(p_idx >= array.size()); + uint64_t new_idx = move(p_idx, 0); + uint64_t swap_idx = array.size() - 1; + + if (new_idx != swap_idx) { + _swap(new_idx, swap_idx); + } + + if (bin_limits[0] > 0) { + bin_limits[0]--; + } + + array.pop_back(); + } + + void set_page_pool(PagedArrayPool<T> *p_page_pool) { + array.set_page_pool(p_page_pool); + } + + _FORCE_INLINE_ const T &operator[](uint64_t p_index) const { + return array[p_index]; + } + + _FORCE_INLINE_ T &operator[](uint64_t p_index) { + return array[p_index]; + } + + int get_bin_count() { + if (array.size() == 0) { + return 0; + } + return bin_limits.size(); + } + + int get_bin_start(int p_bin) { + ERR_FAIL_COND_V(p_bin >= get_bin_count(), ~0U); + if ((unsigned int)p_bin == bin_limits.size() - 1) { + return 0; + } + return bin_limits[p_bin + 1] + 1; + } + + int get_bin_size(int p_bin) { + ERR_FAIL_COND_V(p_bin >= get_bin_count(), 0); + if ((unsigned int)p_bin == bin_limits.size() - 1) { + return bin_limits[p_bin] + 1; + } + return bin_limits[p_bin] - bin_limits[p_bin + 1]; + } + + void reset() { + array.reset(); + bin_limits.clear(); + bin_limits.push_back(0); + } + + BinSortedArray() { + bin_limits.push_back(0); + } + + virtual ~BinSortedArray() { + reset(); + } +}; + +#endif //BIN_SORTED_ARRAY_H diff --git a/core/templates/command_queue_mt.cpp b/core/templates/command_queue_mt.cpp index 238bf3975c..04a8095f0b 100644 --- a/core/templates/command_queue_mt.cpp +++ b/core/templates/command_queue_mt.cpp @@ -70,35 +70,7 @@ CommandQueueMT::SyncSemaphore *CommandQueueMT::_alloc_sync_sem() { return &sync_sems[idx]; } -bool CommandQueueMT::dealloc_one() { -tryagain: - if (dealloc_ptr == (write_ptr_and_epoch >> 1)) { - // The queue is empty - return false; - } - - uint32_t size = *(uint32_t *)&command_mem[dealloc_ptr]; - - if (size == 0) { - // End of command buffer wrap down - dealloc_ptr = 0; - goto tryagain; - } - - if (size & 1) { - // Still 
used, nothing can be deallocated - return false; - } - - dealloc_ptr += (size >> 1) + 8; - return true; -} - CommandQueueMT::CommandQueueMT(bool p_sync) { - command_mem_size = GLOBAL_DEF_RST("memory/limits/command_queue/multithreading_queue_size_kb", DEFAULT_COMMAND_MEM_SIZE_KB); - ProjectSettings::get_singleton()->set_custom_property_info("memory/limits/command_queue/multithreading_queue_size_kb", PropertyInfo(Variant::INT, "memory/limits/command_queue/multithreading_queue_size_kb", PROPERTY_HINT_RANGE, "1,4096,1,or_greater")); - command_mem_size *= 1024; - command_mem = (uint8_t *)memalloc(command_mem_size); if (p_sync) { sync = memnew(Semaphore); } @@ -108,5 +80,4 @@ CommandQueueMT::~CommandQueueMT() { if (sync) { memdelete(sync); } - memfree(command_mem); } diff --git a/core/templates/command_queue_mt.h b/core/templates/command_queue_mt.h index 7861c76153..519a896ffc 100644 --- a/core/templates/command_queue_mt.h +++ b/core/templates/command_queue_mt.h @@ -34,6 +34,8 @@ #include "core/os/memory.h" #include "core/os/mutex.h" #include "core/os/semaphore.h" +#include "core/string/print_string.h" +#include "core/templates/local_vector.h" #include "core/templates/simple_type.h" #include "core/typedefs.h" @@ -319,7 +321,7 @@ class CommandQueueMT { DECL_CMD(0) SPACE_SEP_LIST(DECL_CMD, 15) - /* comands that return */ + // Commands that return. DECL_CMD_RET(0) SPACE_SEP_LIST(DECL_CMD_RET, 15) @@ -334,11 +336,7 @@ class CommandQueueMT { SYNC_SEMAPHORES = 8 }; - uint8_t *command_mem = nullptr; - uint32_t read_ptr_and_epoch = 0; - uint32_t write_ptr_and_epoch = 0; - uint32_t dealloc_ptr = 0; - uint32_t command_mem_size = 0; + LocalVector<uint8_t> command_mem; SyncSemaphore sync_sems[SYNC_SEMAPHORES]; Mutex mutex; Semaphore *sync = nullptr; @@ -346,138 +344,47 @@ class CommandQueueMT { template <class T> T *allocate() { // alloc size is size+T+safeguard - uint32_t alloc_size = ((sizeof(T) + 8 - 1) & ~(8 - 1)) + 8; - - // Assert that the buffer is big enough to hold at least two messages. - ERR_FAIL_COND_V(alloc_size * 2 + sizeof(uint32_t) > command_mem_size, nullptr); - - tryagain: - uint32_t write_ptr = write_ptr_and_epoch >> 1; - - if (write_ptr < dealloc_ptr) { - // behind dealloc_ptr, check that there is room - if ((dealloc_ptr - write_ptr) <= alloc_size) { - // There is no more room, try to deallocate something - if (dealloc_one()) { - goto tryagain; - } - return nullptr; - } - } else { - // ahead of dealloc_ptr, check that there is room - - if ((command_mem_size - write_ptr) < alloc_size + sizeof(uint32_t)) { - // no room at the end, wrap down; - - if (dealloc_ptr == 0) { // don't want write_ptr to become dealloc_ptr - - // There is no more room, try to deallocate something - if (dealloc_one()) { - goto tryagain; - } - return nullptr; - } - - // if this happens, it's a bug - ERR_FAIL_COND_V((command_mem_size - write_ptr) < 8, nullptr); - // zero means, wrap to beginning - - uint32_t *p = (uint32_t *)&command_mem[write_ptr]; - *p = 1; - write_ptr_and_epoch = 0 | (1 & ~write_ptr_and_epoch); // Invert epoch. - // See if we can get the thread to run and clear up some more space while we wait. - // This is required if alloc_size * 2 + 4 > COMMAND_MEM_SIZE - if (sync) { - sync->post(); - } - goto tryagain; - } - } - // Allocate the size and the 'in use' bit. - // First bit used to mark if command is still in use (1) - // or if it has been destroyed and can be deallocated (0). 
- uint32_t size = (sizeof(T) + 8 - 1) & ~(8 - 1); - uint32_t *p = (uint32_t *)&command_mem[write_ptr]; - *p = (size << 1) | 1; - write_ptr += 8; - // allocate the command - T *cmd = memnew_placement(&command_mem[write_ptr], T); - write_ptr += size; - write_ptr_and_epoch = (write_ptr << 1) | (write_ptr_and_epoch & 1); + uint32_t alloc_size = ((sizeof(T) + 8 - 1) & ~(8 - 1)); + uint64_t size = command_mem.size(); + command_mem.resize(size + alloc_size + 8); + *(uint64_t *)&command_mem[size] = alloc_size; + T *cmd = memnew_placement(&command_mem[size + 8], T); return cmd; } template <class T> T *allocate_and_lock() { lock(); - T *ret; - - while ((ret = allocate<T>()) == nullptr) { - unlock(); - // sleep a little until fetch happened and some room is made - wait_for_flush(); - lock(); - } - + T *ret = allocate<T>(); return ret; } - bool flush_one(bool p_lock = true) { - if (p_lock) { - lock(); - } - tryagain: - - // tried to read an empty queue - if (read_ptr_and_epoch == write_ptr_and_epoch) { - if (p_lock) { - unlock(); - } - return false; - } - - uint32_t read_ptr = read_ptr_and_epoch >> 1; - uint32_t size_ptr = read_ptr; - uint32_t size = *(uint32_t *)&command_mem[read_ptr] >> 1; - - if (size == 0) { - *(uint32_t *)&command_mem[read_ptr] = 0; // clear in-use bit. - //end of ringbuffer, wrap - read_ptr_and_epoch = 0 | (1 & ~read_ptr_and_epoch); // Invert epoch. - goto tryagain; - } - - read_ptr += 8; + void _flush() { + lock(); - CommandBase *cmd = reinterpret_cast<CommandBase *>(&command_mem[read_ptr]); + uint64_t read_ptr = 0; + uint64_t limit = command_mem.size(); - read_ptr += size; + while (read_ptr < limit) { + uint64_t size = *(uint64_t *)&command_mem[read_ptr]; + read_ptr += 8; + CommandBase *cmd = reinterpret_cast<CommandBase *>(&command_mem[read_ptr]); - read_ptr_and_epoch = (read_ptr << 1) | (read_ptr_and_epoch & 1); + cmd->call(); //execute the function + cmd->post(); //release in case it needs sync/ret + cmd->~CommandBase(); //should be done, so erase the command - if (p_lock) { - unlock(); - } - cmd->call(); - if (p_lock) { - lock(); + read_ptr += size; } - cmd->post(); - cmd->~CommandBase(); - *(uint32_t *)&command_mem[size_ptr] &= ~1; - - if (p_lock) { - unlock(); - } - return true; + command_mem.clear(); + unlock(); } void lock(); void unlock(); void wait_for_flush(); SyncSemaphore *_alloc_sync_sem(); - bool dealloc_one(); public: /* NORMAL PUSH COMMANDS */ @@ -492,18 +399,19 @@ public: DECL_PUSH_AND_SYNC(0) SPACE_SEP_LIST(DECL_PUSH_AND_SYNC, 15) - void wait_and_flush_one() { - ERR_FAIL_COND(!sync); - sync->wait(); - flush_one(); + _FORCE_INLINE_ void flush_if_pending() { + if (unlikely(command_mem.size() > 0)) { + _flush(); + } } - void flush_all() { - //ERR_FAIL_COND(sync); - lock(); - while (flush_one(false)) { - } - unlock(); + _flush(); + } + + void wait_and_flush() { + ERR_FAIL_COND(!sync); + sync->wait(); + _flush(); } CommandQueueMT(bool p_sync); diff --git a/core/templates/cowdata.h b/core/templates/cowdata.h index 4becb1be59..9b8c0eb528 100644 --- a/core/templates/cowdata.h +++ b/core/templates/cowdata.h @@ -45,6 +45,16 @@ class CharString; template <class T, class V> class VMap; +#if !defined(NO_THREADS) +SAFE_NUMERIC_TYPE_PUN_GUARANTEES(uint32_t) +#endif + +// Silence a false positive warning (see GH-52119). 
+#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wplacement-new" +#endif + template <class T> class CowData { template <class TV> @@ -60,12 +70,12 @@ private: // internal helpers - _FORCE_INLINE_ uint32_t *_get_refcount() const { + _FORCE_INLINE_ SafeNumeric<uint32_t> *_get_refcount() const { if (!_ptr) { return nullptr; } - return reinterpret_cast<uint32_t *>(_ptr) - 2; + return reinterpret_cast<SafeNumeric<uint32_t> *>(_ptr) - 2; } _FORCE_INLINE_ uint32_t *_get_size() const { @@ -111,7 +121,7 @@ private: void _unref(void *p_data); void _ref(const CowData *p_from); void _ref(const CowData &p_from); - void _copy_on_write(); + uint32_t _copy_on_write(); public: void operator=(const CowData<T> &p_from) { _ref(p_from); } @@ -138,7 +148,7 @@ public: _FORCE_INLINE_ bool is_empty() const { return _ptr == nullptr; } _FORCE_INLINE_ void set(int p_index, const T &p_elem) { - CRASH_BAD_INDEX(p_index, size()); + ERR_FAIL_INDEX(p_index, size()); _copy_on_write(); _get_data()[p_index] = p_elem; } @@ -192,9 +202,9 @@ void CowData<T>::_unref(void *p_data) { return; } - uint32_t *refc = _get_refcount(); + SafeNumeric<uint32_t> *refc = _get_refcount(); - if (atomic_decrement(refc) > 0) { + if (refc->decrement() > 0) { return; // still in use } // clean up @@ -214,20 +224,21 @@ void CowData<T>::_unref(void *p_data) { } template <class T> -void CowData<T>::_copy_on_write() { +uint32_t CowData<T>::_copy_on_write() { if (!_ptr) { - return; + return 0; } - uint32_t *refc = _get_refcount(); + SafeNumeric<uint32_t> *refc = _get_refcount(); - if (unlikely(*refc > 1)) { + uint32_t rc = refc->get(); + if (unlikely(rc > 1)) { /* in use by more than me */ uint32_t current_size = *_get_size(); uint32_t *mem_new = (uint32_t *)Memory::alloc_static(_get_alloc_size(current_size), true); - *(mem_new - 2) = 1; //refcount + new (mem_new - 2) SafeNumeric<uint32_t>(1); //refcount *(mem_new - 1) = current_size; //size T *_data = (T *)(mem_new); @@ -244,7 +255,10 @@ void CowData<T>::_copy_on_write() { _unref(_ptr); _ptr = _data; + + rc = 1; } + return rc; } template <class T> @@ -265,7 +279,7 @@ Error CowData<T>::resize(int p_size) { } // possibly changing size, copy on write - _copy_on_write(); + uint32_t rc = _copy_on_write(); size_t current_alloc_size = _get_alloc_size(current_size); size_t alloc_size; @@ -278,13 +292,15 @@ Error CowData<T>::resize(int p_size) { uint32_t *ptr = (uint32_t *)Memory::alloc_static(alloc_size, true); ERR_FAIL_COND_V(!ptr, ERR_OUT_OF_MEMORY); *(ptr - 1) = 0; //size, currently none - *(ptr - 2) = 1; //refcount + new (ptr - 2) SafeNumeric<uint32_t>(1); //refcount _ptr = (T *)ptr; } else { - void *_ptrnew = (T *)Memory::realloc_static(_ptr, alloc_size, true); + uint32_t *_ptrnew = (uint32_t *)Memory::realloc_static(_ptr, alloc_size, true); ERR_FAIL_COND_V(!_ptrnew, ERR_OUT_OF_MEMORY); + new (_ptrnew - 2) SafeNumeric<uint32_t>(rc); //refcount + _ptr = (T *)(_ptrnew); } } @@ -311,8 +327,9 @@ Error CowData<T>::resize(int p_size) { } if (alloc_size != current_alloc_size) { - void *_ptrnew = (T *)Memory::realloc_static(_ptr, alloc_size, true); + uint32_t *_ptrnew = (uint32_t *)Memory::realloc_static(_ptr, alloc_size, true); ERR_FAIL_COND_V(!_ptrnew, ERR_OUT_OF_MEMORY); + new (_ptrnew - 2) SafeNumeric<uint32_t>(rc); //refcount _ptr = (T *)(_ptrnew); } @@ -359,7 +376,7 @@ void CowData<T>::_ref(const CowData &p_from) { return; //nothing to do } - if (atomic_conditional_increment(p_from._get_refcount()) > 0) { // could reference + if 
(p_from._get_refcount()->conditional_increment() > 0) { // could reference _ptr = p_from._ptr; } } @@ -369,4 +386,8 @@ CowData<T>::~CowData() { _unref(_ptr); } +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + #endif // COWDATA_H diff --git a/core/templates/hash_map.h b/core/templates/hash_map.h index dc378aed69..1257b54449 100644 --- a/core/templates/hash_map.h +++ b/core/templates/hash_map.h @@ -291,7 +291,7 @@ public: } /** - * Same as get, except it can return nullptr when item was not found. + * Same as get, except it can return nullptr when item was not found. * This is mainly used for speed purposes. */ @@ -324,7 +324,7 @@ public: } /** - * Same as get, except it can return nullptr when item was not found. + * Same as get, except it can return nullptr when item was not found. * This version is custom, will take a hash and a custom key (that should support operator==() */ @@ -443,7 +443,7 @@ public: /** * Get the next key to p_key, and the first key if p_key is null. - * Returns a pointer to the next key if found, nullptr otherwise. + * Returns a pointer to the next key if found, nullptr otherwise. * Adding/Removing elements while iterating will, of course, have unexpected results, don't do it. * * Example: diff --git a/core/templates/hashfuncs.h b/core/templates/hashfuncs.h index 4572b269cf..2e932f9f26 100644 --- a/core/templates/hashfuncs.h +++ b/core/templates/hashfuncs.h @@ -95,7 +95,7 @@ static inline uint32_t hash_djb2_one_float(double p_in, uint32_t p_prev = 5381) if (p_in == 0.0f) { u.d = 0.0; } else if (Math::is_nan(p_in)) { - u.d = Math_NAN; + u.d = NAN; } else { u.d = p_in; } @@ -124,7 +124,7 @@ static inline uint64_t hash_djb2_one_float_64(double p_in, uint64_t p_prev = 538 if (p_in == 0.0f) { u.d = 0.0; } else if (Math::is_nan(p_in)) { - u.d = Math_NAN; + u.d = NAN; } else { u.d = p_in; } diff --git a/core/templates/list.h b/core/templates/list.h index eaf1dbb4a0..c2e17a2f6f 100644 --- a/core/templates/list.h +++ b/core/templates/list.h @@ -135,6 +135,83 @@ public: _FORCE_INLINE_ Element() {} }; + typedef T ValueType; + + struct Iterator { + _FORCE_INLINE_ T &operator*() const { + return E->get(); + } + _FORCE_INLINE_ T *operator->() const { return &E->get(); } + _FORCE_INLINE_ Iterator &operator++() { + E = E->next(); + return *this; + } + _FORCE_INLINE_ Iterator &operator--() { + E = E->prev(); + return *this; + } + + _FORCE_INLINE_ bool operator==(const Iterator &b) const { return E == b.E; } + _FORCE_INLINE_ bool operator!=(const Iterator &b) const { return E != b.E; } + + Iterator(Element *p_E) { E = p_E; } + Iterator() {} + Iterator(const Iterator &p_it) { E = p_it.E; } + + private: + Element *E = nullptr; + }; + + struct ConstIterator { + _FORCE_INLINE_ const T &operator*() const { + return E->get(); + } + _FORCE_INLINE_ const T *operator->() const { return &E->get(); } + _FORCE_INLINE_ ConstIterator &operator++() { + E = E->next(); + return *this; + } + _FORCE_INLINE_ ConstIterator &operator--() { + E = E->prev(); + return *this; + } + + _FORCE_INLINE_ bool operator==(const ConstIterator &b) const { return E == b.E; } + _FORCE_INLINE_ bool operator!=(const ConstIterator &b) const { return E != b.E; } + + _FORCE_INLINE_ ConstIterator(const Element *p_E) { E = p_E; } + _FORCE_INLINE_ ConstIterator() {} + _FORCE_INLINE_ ConstIterator(const ConstIterator &p_it) { E = p_it.E; } + + private: + const Element *E = nullptr; + }; + + _FORCE_INLINE_ Iterator begin() { + return Iterator(front()); + } + _FORCE_INLINE_ Iterator end() { + 
return Iterator(nullptr); + } + +#if 0 + //to use when replacing find() + _FORCE_INLINE_ Iterator find(const K &p_key) { + return Iterator(find(p_key)); + } +#endif + _FORCE_INLINE_ ConstIterator begin() const { + return ConstIterator(front()); + } + _FORCE_INLINE_ ConstIterator end() const { + return ConstIterator(nullptr); + } +#if 0 + //to use when replacing find() + _FORCE_INLINE_ ConstIterator find(const K &p_key) const { + return ConstIterator(find(p_key)); + } +#endif private: struct _Data { Element *first = nullptr; @@ -492,7 +569,7 @@ public: _data->last = p_I; } - void invert() { + void reverse() { int s = size() / 2; Element *F = front(); Element *B = back(); diff --git a/core/templates/local_vector.h b/core/templates/local_vector.h index 4ffb93b2ad..5704b8f230 100644 --- a/core/templates/local_vector.h +++ b/core/templates/local_vector.h @@ -32,7 +32,6 @@ #define LOCAL_VECTOR_H #include "core/error/error_macros.h" -#include "core/os/copymem.h" #include "core/os/memory.h" #include "core/templates/sort_array.h" #include "core/templates/vector.h" @@ -82,6 +81,19 @@ public: } } + /// Removes the item copying the last value into the position of the one to + /// remove. It's generally faster than `remove`. + void remove_unordered(U p_index) { + ERR_FAIL_INDEX(p_index, count); + count--; + if (count > p_index) { + data[p_index] = data[count]; + } + if (!__has_trivial_destructor(T) && !force_trivial) { + data[count].~T(); + } + } + void erase(const T &p_val) { int64_t idx = find(p_val); if (idx >= 0) { @@ -105,6 +117,7 @@ public: } } _FORCE_INLINE_ bool is_empty() const { return count == 0; } + _FORCE_INLINE_ U get_capacity() const { return capacity; } _FORCE_INLINE_ void reserve(U p_size) { p_size = nearest_power_of_2_templated(p_size); if (p_size > capacity) { @@ -157,7 +170,7 @@ public: push_back(p_val); } else { resize(count + 1); - for (U i = count; i > p_pos; i--) { + for (U i = count - 1; i > p_pos; i--) { data[i] = data[i - 1]; } data[p_pos] = p_val; @@ -165,7 +178,7 @@ public: } int64_t find(const T &p_val, U p_from = 0) const { - for (U i = 0; i < count; i++) { + for (U i = p_from; i < count; i++) { if (data[i] == p_val) { return int64_t(i); } @@ -202,7 +215,7 @@ public: Vector<T> ret; ret.resize(size()); T *w = ret.ptrw(); - copymem(w, data, sizeof(T) * count); + memcpy(w, data, sizeof(T) * count); return ret; } @@ -210,7 +223,7 @@ public: Vector<uint8_t> ret; ret.resize(count * sizeof(T)); uint8_t *w = ret.ptrw(); - copymem(w, data, sizeof(T) * count); + memcpy(w, data, sizeof(T) * count); return ret; } diff --git a/core/templates/map.h b/core/templates/map.h index 51a237472d..badb407e5d 100644 --- a/core/templates/map.h +++ b/core/templates/map.h @@ -32,10 +32,11 @@ #define MAP_H #include "core/error/error_macros.h" -#include "core/templates/set.h" +#include "core/os/memory.h" +#include "core/templates/pair.h" // based on the very nice implementation of rb-trees by: -// https://web.archive.org/web/20120507164830/http://web.mit.edu/~emin/www/source_code/red_black_tree/index.html +// https://web.archive.org/web/20120507164830/https://web.mit.edu/~emin/www/source_code/red_black_tree/index.html template <class K, class V, class C = Comparator<K>, class A = DefaultAllocator> class Map { @@ -55,11 +56,12 @@ public: Element *parent = nullptr; Element *_next = nullptr; Element *_prev = nullptr; - K _key; - V _value; - //_Data *data; + KeyValue<K, V> _data; public: + KeyValue<K, V> &key_value() { return _data; } + const KeyValue<K, V> &key_value() const { return _data; } + const 
Element *next() const { return _next; } @@ -73,23 +75,106 @@ public: return _prev; } const K &key() const { - return _key; + return _data.key; } V &value() { - return _value; + return _data.value; } const V &value() const { - return _value; + return _data.value; } V &get() { - return _value; + return _data.value; } const V &get() const { - return _value; + return _data.value; } - Element() {} + Element(const KeyValue<K, V> &p_data) : + _data(p_data) {} }; + typedef KeyValue<K, V> ValueType; + + struct Iterator { + _FORCE_INLINE_ KeyValue<K, V> &operator*() const { + return E->key_value(); + } + _FORCE_INLINE_ KeyValue<K, V> *operator->() const { return &E->key_value(); } + _FORCE_INLINE_ Iterator &operator++() { + E = E->next(); + return *this; + } + _FORCE_INLINE_ Iterator &operator--() { + E = E->prev(); + return *this; + } + + _FORCE_INLINE_ bool operator==(const Iterator &b) const { return E == b.E; } + _FORCE_INLINE_ bool operator!=(const Iterator &b) const { return E != b.E; } + + Iterator(Element *p_E) { E = p_E; } + Iterator() {} + Iterator(const Iterator &p_it) { E = p_it.E; } + + private: + Element *E = nullptr; + }; + + struct ConstIterator { + _FORCE_INLINE_ const KeyValue<K, V> &operator*() const { + return E->key_value(); + } + _FORCE_INLINE_ const KeyValue<K, V> *operator->() const { return &E->key_value(); } + _FORCE_INLINE_ ConstIterator &operator++() { + E = E->next(); + return *this; + } + _FORCE_INLINE_ ConstIterator &operator--() { + E = E->prev(); + return *this; + } + + _FORCE_INLINE_ bool operator==(const ConstIterator &b) const { return E == b.E; } + _FORCE_INLINE_ bool operator!=(const ConstIterator &b) const { return E != b.E; } + + ConstIterator(const Element *p_E) { E = p_E; } + ConstIterator() {} + ConstIterator(const ConstIterator &p_it) { E = p_it.E; } + + private: + const Element *E = nullptr; + }; + + _FORCE_INLINE_ Iterator begin() { + return Iterator(front()); + } + _FORCE_INLINE_ Iterator end() { + return Iterator(nullptr); + } + +#if 0 + //to use when replacing find() + _FORCE_INLINE_ Iterator find(const K &p_key) { + return Iterator(find(p_key)); + } +#endif + _FORCE_INLINE_ void remove(const Iterator &p_iter) { + return erase(p_iter.E); + } + + _FORCE_INLINE_ ConstIterator begin() const { + return ConstIterator(front()); + } + _FORCE_INLINE_ ConstIterator end() const { + return ConstIterator(nullptr); + } + +#if 0 + //to use when replacing find() + _FORCE_INLINE_ ConstIterator find(const K &p_key) const { + return ConstIterator(find(p_key)); + } +#endif private: struct _Data { Element *_root = nullptr; @@ -107,7 +192,7 @@ private: } void _create_root() { - _root = memnew_allocator(Element, A); + _root = memnew_allocator(Element(KeyValue<K, V>(K(), V())), A); _root->parent = _root->left = _root->right = _nil; _root->color = BLACK; } @@ -216,9 +301,9 @@ private: C less; while (node != _data._nil) { - if (less(p_key, node->_key)) { + if (less(p_key, node->_data.key)) { node = node->left; - } else if (less(node->_key, p_key)) { + } else if (less(node->_data.key, p_key)) { node = node->right; } else { return node; // found @@ -236,9 +321,9 @@ private: while (node != _data._nil) { prev = node; - if (less(p_key, node->_key)) { + if (less(p_key, node->_data.key)) { node = node->left; - } else if (less(node->_key, p_key)) { + } else if (less(node->_data.key, p_key)) { node = node->right; } else { return node; // found @@ -249,7 +334,7 @@ private: return nullptr; // tree empty } - if (less(p_key, prev->_key)) { + if (less(p_key, prev->_data.key)) { prev = 
prev->_prev; } @@ -312,25 +397,25 @@ private: while (node != _data._nil) { new_parent = node; - if (less(p_key, node->_key)) { + if (less(p_key, node->_data.key)) { node = node->left; - } else if (less(node->_key, p_key)) { + } else if (less(node->_data.key, p_key)) { node = node->right; } else { - node->_value = p_value; + node->_data.value = p_value; return node; // Return existing node with new value } } - Element *new_node = memnew_allocator(Element, A); + typedef KeyValue<K, V> KV; + Element *new_node = memnew_allocator(Element(KV(p_key, p_value)), A); new_node->parent = new_parent; new_node->right = _data._nil; new_node->left = _data._nil; - new_node->_key = p_key; - new_node->_value = p_value; + //new_node->data=_data; - if (new_parent == _data._root || less(p_key, new_parent->_key)) { + if (new_parent == _data._root || less(p_key, new_parent->_data.key)) { new_parent->left = new_node; } else { new_parent->right = new_node; @@ -575,7 +660,7 @@ public: CRASH_COND(!_data._root); const Element *e = find(p_key); CRASH_COND(!e); - return e->_value; + return e->_data.value; } V &operator[](const K &p_key) { @@ -588,7 +673,7 @@ public: e = insert(p_key, V()); } - return e->_value; + return e->_data.value; } Element *front() const { diff --git a/core/templates/oa_hash_map.h b/core/templates/oa_hash_map.h index 1d4176eb10..025cc30db4 100644 --- a/core/templates/oa_hash_map.h +++ b/core/templates/oa_hash_map.h @@ -32,7 +32,6 @@ #define OA_HASH_MAP_H #include "core/math/math_funcs.h" -#include "core/os/copymem.h" #include "core/os/memory.h" #include "core/templates/hashfuncs.h" @@ -232,7 +231,7 @@ public: /** * returns true if the value was found, false otherwise. * - * if r_data is not nullptr then the value will be written to the object + * if r_data is not nullptr then the value will be written to the object * it points to. */ bool lookup(const TKey &p_key, TValue &r_data) const { @@ -250,7 +249,7 @@ public: /** * returns true if the value was found, false otherwise. * - * if r_data is not nullptr then the value will be written to the object + * if r_data is not nullptr then the value will be written to the object * it points to. */ TValue *lookup_ptr(const TKey &p_key) const { diff --git a/core/templates/paged_allocator.h b/core/templates/paged_allocator.h index 7002034710..dfc885c6eb 100644 --- a/core/templates/paged_allocator.h +++ b/core/templates/paged_allocator.h @@ -35,6 +35,8 @@ #include "core/os/spin_lock.h" #include "core/typedefs.h" +#include <type_traits> + template <class T, bool thread_safe = false> class PagedAllocator { T **page_pool = nullptr; @@ -48,7 +50,8 @@ class PagedAllocator { SpinLock spin_lock; public: - T *alloc() { + template <class... 
Args> + T *alloc(const Args &&...p_args) { if (thread_safe) { spin_lock.lock(); } @@ -73,7 +76,7 @@ public: if (thread_safe) { spin_lock.unlock(); } - memnew_placement(alloc, T); + memnew_placement(alloc, T(p_args...)); return alloc; } @@ -89,8 +92,10 @@ public: allocs_available++; } - void reset() { - ERR_FAIL_COND(allocs_available < pages_allocated * page_size); + void reset(bool p_allow_unfreed = false) { + if (!p_allow_unfreed || !std::is_trivially_destructible<T>::value) { + ERR_FAIL_COND(allocs_available < pages_allocated * page_size); + } if (pages_allocated) { for (uint32_t i = 0; i < pages_allocated; i++) { memfree(page_pool[i]); diff --git a/core/templates/pair.h b/core/templates/pair.h index bc1a764694..e30ee8bc56 100644 --- a/core/templates/pair.h +++ b/core/templates/pair.h @@ -31,6 +31,8 @@ #ifndef PAIR_H #define PAIR_H +#include "core/typedefs.h" + template <class F, class S> struct Pair { F first; @@ -60,7 +62,43 @@ bool operator!=(const Pair<F, S> &pair, const Pair<F, S> &other) { template <class F, class S> struct PairSort { bool operator()(const Pair<F, S> &A, const Pair<F, S> &B) const { - return A.first < B.first; + if (A.first != B.first) { + return A.first < B.first; + } + return A.second < B.second; + } +}; + +template <class K, class V> +struct KeyValue { + const K key; + V value; + + void operator=(const KeyValue &p_kv) = delete; + _FORCE_INLINE_ KeyValue(const KeyValue &p_kv) : + key(p_kv.key), + value(p_kv.value) { + } + _FORCE_INLINE_ KeyValue(const K &p_key, const V &p_value) : + key(p_key), + value(p_value) { + } +}; + +template <class K, class V> +bool operator==(const KeyValue<K, V> &pair, const KeyValue<K, V> &other) { + return (pair.key == other.key) && (pair.value == other.value); +} + +template <class K, class V> +bool operator!=(const KeyValue<K, V> &pair, const KeyValue<K, V> &other) { + return (pair.key != other.key) || (pair.value != other.value); +} + +template <class K, class V> +struct KeyValueSort { + bool operator()(const KeyValue<K, V> &A, const KeyValue<K, V> &B) const { + return A.key < B.key; } }; diff --git a/core/templates/pooled_list.h b/core/templates/pooled_list.h new file mode 100644 index 0000000000..b139dadb75 --- /dev/null +++ b/core/templates/pooled_list.h @@ -0,0 +1,97 @@ +/*************************************************************************/ +/* pooled_list.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. 
*/ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#ifndef POOLED_LIST_H +#define POOLED_LIST_H + +#include "core/templates/local_vector.h" + +// Simple template to provide a pool with O(1) allocate and free. +// The freelist could alternatively be a linked list placed within the unused elements +// to use less memory, however a separate freelist is probably more cache friendly. +// +// NOTE: Take great care when using this with non POD types. The construction and destruction +// is done in the LocalVector, NOT as part of the pool. So requesting a new item does not guarantee +// a constructor is run, and free does not guarantee a destructor. +// You should generally handle clearing +// an item explicitly after a request, as it may contain 'leftovers'. +// This is by design for fastest use in the BVH. If you want a more general pool +// that does call constructors / destructors on request / free, this should probably be +// a separate template. +template <class T, bool force_trivial = false> +class PooledList { + LocalVector<T, uint32_t, force_trivial> list; + LocalVector<uint32_t, uint32_t, true> freelist; + + // not all list members are necessarily used + int _used_size; + +public: + PooledList() { + _used_size = 0; + } + + int estimate_memory_use() const { + return (list.size() * sizeof(T)) + (freelist.size() * sizeof(uint32_t)); + } + + const T &operator[](uint32_t p_index) const { + return list[p_index]; + } + T &operator[](uint32_t p_index) { + return list[p_index]; + } + + int size() const { return _used_size; } + + T *request(uint32_t &r_id) { + _used_size++; + + if (freelist.size()) { + // pop from freelist + int new_size = freelist.size() - 1; + r_id = freelist[new_size]; + freelist.resize(new_size); + return &list[r_id]; + } + + r_id = list.size(); + list.resize(r_id + 1); + return &list[r_id]; + } + void free(const uint32_t &p_id) { + // should not be on free list already + CRASH_COND(p_id >= list.size()); + freelist.push_back(p_id); + _used_size--; + } +}; + +#endif // POOLED_LIST_H diff --git a/core/templates/rid.h b/core/templates/rid.h index 7fe6dbe473..4c7119b4ea 100644 --- a/core/templates/rid.h +++ b/core/templates/rid.h @@ -40,30 +40,37 @@ class RID { uint64_t _id = 0; public: - _FORCE_INLINE_ bool operator==(const RID &p_rid) const { + _ALWAYS_INLINE_ bool operator==(const RID &p_rid) const { return _id == p_rid._id; } - _FORCE_INLINE_ bool operator<(const RID &p_rid) const { + _ALWAYS_INLINE_ bool operator<(const RID &p_rid) const { return _id < p_rid._id; } - _FORCE_INLINE_ bool operator<=(const RID &p_rid) const { + _ALWAYS_INLINE_ bool operator<=(const RID &p_rid) const { return _id <= p_rid._id; } - _FORCE_INLINE_ bool operator>(const RID &p_rid) const { + _ALWAYS_INLINE_ bool operator>(const RID &p_rid) const { return _id > p_rid._id; } - _FORCE_INLINE_ bool operator>=(const RID &p_rid) const { + _ALWAYS_INLINE_ bool operator>=(const RID &p_rid) const { return _id >= p_rid._id; } - _FORCE_INLINE_ bool operator!=(const RID 
&p_rid) const { + _ALWAYS_INLINE_ bool operator!=(const RID &p_rid) const { return _id != p_rid._id; } - _FORCE_INLINE_ bool is_valid() const { return _id != 0; } - _FORCE_INLINE_ bool is_null() const { return _id == 0; } + _ALWAYS_INLINE_ bool is_valid() const { return _id != 0; } + _ALWAYS_INLINE_ bool is_null() const { return _id == 0; } - _FORCE_INLINE_ uint64_t get_id() const { return _id; } + _ALWAYS_INLINE_ uint32_t get_local_index() const { return _id & 0xFFFFFFFF; } - _FORCE_INLINE_ RID() {} + static _ALWAYS_INLINE_ RID from_uint64(uint64_t p_id) { + RID _rid; + _rid._id = p_id; + return _rid; + } + _ALWAYS_INLINE_ uint64_t get_id() const { return _id; } + + _ALWAYS_INLINE_ RID() {} }; #endif // RID_H diff --git a/core/templates/rid_owner.cpp b/core/templates/rid_owner.cpp index f75a2eb9df..56f39ab779 100644 --- a/core/templates/rid_owner.cpp +++ b/core/templates/rid_owner.cpp @@ -30,4 +30,4 @@ #include "rid_owner.h" -volatile uint64_t RID_AllocBase::base_id = 1; +SafeNumeric<uint64_t> RID_AllocBase::base_id{ 1 }; diff --git a/core/templates/rid_owner.h b/core/templates/rid_owner.h index d26c445eb4..71d41eacc4 100644 --- a/core/templates/rid_owner.h +++ b/core/templates/rid_owner.h @@ -44,7 +44,7 @@ #include <typeinfo> class RID_AllocBase { - static volatile uint64_t base_id; + static SafeNumeric<uint64_t> base_id; protected: static RID _make_from_id(uint64_t p_id) { @@ -53,14 +53,16 @@ protected: return rid; } - static uint64_t _gen_id() { - return atomic_increment(&base_id); - } - static RID _gen_rid() { return _make_from_id(_gen_id()); } + friend struct VariantUtilityFunctions; + + static uint64_t _gen_id() { + return base_id.increment(); + } + public: virtual ~RID_AllocBase() {} }; @@ -79,8 +81,7 @@ class RID_Alloc : public RID_AllocBase { SpinLock spin_lock; -public: - RID make_rid(const T &p_value) { + _FORCE_INLINE_ RID _allocate_rid() { if (THREAD_SAFE) { spin_lock.lock(); } @@ -102,7 +103,7 @@ public: //initialize for (uint32_t i = 0; i < elements_in_chunk; i++) { - //dont initialize chunk + // Don't initialize chunk. 
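+				// Note: 0xFFFFFFFF also keeps the 0x80000000 "uninitialized" bit set,
+				// so the destructor's uninitialized check skips freed and
+				// never-allocated slots alike.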
validator_chunks[chunk_count][i] = 0xFFFFFFFF; free_list_chunks[chunk_count][i] = alloc_count + i; } @@ -115,15 +116,15 @@ public: uint32_t free_chunk = free_index / elements_in_chunk; uint32_t free_element = free_index % elements_in_chunk; - T *ptr = &chunks[free_chunk][free_element]; - memnew_placement(ptr, T(p_value)); - - uint32_t validator = (uint32_t)(_gen_id() & 0xFFFFFFFF); + uint32_t validator = (uint32_t)(_gen_id() & 0x7FFFFFFF); uint64_t id = validator; id <<= 32; id |= free_index; validator_chunks[free_chunk][free_element] = validator; + + validator_chunks[free_chunk][free_element] |= 0x80000000; //mark uninitialized bit + alloc_count++; if (THREAD_SAFE) { @@ -133,7 +134,27 @@ public: return _make_from_id(id); } - _FORCE_INLINE_ T *getornull(const RID &p_rid) { +public: + RID make_rid() { + RID rid = _allocate_rid(); + initialize_rid(rid); + return rid; + } + RID make_rid(const T &p_value) { + RID rid = _allocate_rid(); + initialize_rid(rid, p_value); + return rid; + } + + //allocate but don't initialize, use initialize_rid afterwards + RID allocate_rid() { + return _allocate_rid(); + } + + _FORCE_INLINE_ T *get_or_null(const RID &p_rid, bool p_initialize = false) { + if (p_rid == RID()) { + return nullptr; + } if (THREAD_SAFE) { spin_lock.lock(); } @@ -151,10 +172,32 @@ public: uint32_t idx_element = idx % elements_in_chunk; uint32_t validator = uint32_t(id >> 32); - if (unlikely(validator_chunks[idx_chunk][idx_element] != validator)) { + + if (unlikely(p_initialize)) { + if (unlikely(!(validator_chunks[idx_chunk][idx_element] & 0x80000000))) { + if (THREAD_SAFE) { + spin_lock.unlock(); + } + ERR_FAIL_V_MSG(nullptr, "Initializing already initialized RID"); + } + + if (unlikely((validator_chunks[idx_chunk][idx_element] & 0x7FFFFFFF) != validator)) { + if (THREAD_SAFE) { + spin_lock.unlock(); + } + ERR_FAIL_V_MSG(nullptr, "Attempting to initialize the wrong RID"); + return nullptr; + } + + validator_chunks[idx_chunk][idx_element] &= 0x7FFFFFFF; //initialized + + } else if (unlikely(validator_chunks[idx_chunk][idx_element] != validator)) { if (THREAD_SAFE) { spin_lock.unlock(); } + if ((validator_chunks[idx_chunk][idx_element] & 0x80000000) && validator_chunks[idx_chunk][idx_element] != 0xFFFFFFFF) { + ERR_FAIL_V_MSG(nullptr, "Attempting to use an uninitialized RID"); + } return nullptr; } @@ -166,6 +209,16 @@ public: return ptr; } + void initialize_rid(RID p_rid) { + T *mem = get_or_null(p_rid, true); + ERR_FAIL_COND(!mem); + memnew_placement(mem, T); + } + void initialize_rid(RID p_rid, const T &p_value) { + T *mem = get_or_null(p_rid, true); + ERR_FAIL_COND(!mem); + memnew_placement(mem, T(p_value)); + } _FORCE_INLINE_ bool owns(const RID &p_rid) { if (THREAD_SAFE) { @@ -186,7 +239,7 @@ public: uint32_t validator = uint32_t(id >> 32); - bool owned = validator_chunks[idx_chunk][idx_element] == validator; + bool owned = (validator_chunks[idx_chunk][idx_element] & 0x7FFFFFFF) == validator; if (THREAD_SAFE) { spin_lock.unlock(); @@ -213,7 +266,12 @@ public: uint32_t idx_element = idx % elements_in_chunk; uint32_t validator = uint32_t(id >> 32); - if (unlikely(validator_chunks[idx_chunk][idx_element] != validator)) { + if (unlikely(validator_chunks[idx_chunk][idx_element] & 0x80000000)) { + if (THREAD_SAFE) { + spin_lock.unlock(); + } + ERR_FAIL_MSG("Attempted to free an uninitialized or invalid RID"); + } else if (unlikely(validator_chunks[idx_chunk][idx_element] != validator)) { if (THREAD_SAFE) { spin_lock.unlock(); } @@ -282,7 +340,7 @@ public: description = p_descrption; } - 
RID_Alloc(uint32_t p_target_chunk_byte_size = 4096) { + RID_Alloc(uint32_t p_target_chunk_byte_size = 65536) { elements_in_chunk = sizeof(T) > p_target_chunk_byte_size ? 1 : (p_target_chunk_byte_size / sizeof(T)); } @@ -300,6 +358,9 @@ public: for (size_t i = 0; i < max_alloc; i++) { uint64_t validator = validator_chunks[i / elements_in_chunk][i % elements_in_chunk]; + if (validator & 0x80000000) { + continue; //uninitialized + } if (validator != 0xFFFFFFFF) { chunks[i / elements_in_chunk][i % elements_in_chunk].~T(); } @@ -330,14 +391,28 @@ public: return alloc.make_rid(p_ptr); } - _FORCE_INLINE_ T *getornull(const RID &p_rid) { - T **ptr = alloc.getornull(p_rid); + _FORCE_INLINE_ RID allocate_rid() { + return alloc.allocate_rid(); + } + + _FORCE_INLINE_ void initialize_rid(RID p_rid, T *p_ptr) { + alloc.initialize_rid(p_rid, p_ptr); + } + + _FORCE_INLINE_ T *get_or_null(const RID &p_rid) { + T **ptr = alloc.get_or_null(p_rid); if (unlikely(!ptr)) { return nullptr; } return *ptr; } + _FORCE_INLINE_ void replace(const RID &p_rid, T *p_new_ptr) { + T **ptr = alloc.get_or_null(p_rid); + ERR_FAIL_COND(!ptr); + *ptr = p_new_ptr; + } + _FORCE_INLINE_ bool owns(const RID &p_rid) { return alloc.owns(p_rid); } @@ -366,7 +441,7 @@ public: alloc.set_description(p_descrption); } - RID_PtrOwner(uint32_t p_target_chunk_byte_size = 4096) : + RID_PtrOwner(uint32_t p_target_chunk_byte_size = 65536) : alloc(p_target_chunk_byte_size) {} }; @@ -375,12 +450,27 @@ class RID_Owner { RID_Alloc<T, THREAD_SAFE> alloc; public: + _FORCE_INLINE_ RID make_rid() { + return alloc.make_rid(); + } _FORCE_INLINE_ RID make_rid(const T &p_ptr) { return alloc.make_rid(p_ptr); } - _FORCE_INLINE_ T *getornull(const RID &p_rid) { - return alloc.getornull(p_rid); + _FORCE_INLINE_ RID allocate_rid() { + return alloc.allocate_rid(); + } + + _FORCE_INLINE_ void initialize_rid(RID p_rid) { + alloc.initialize_rid(p_rid); + } + + _FORCE_INLINE_ void initialize_rid(RID p_rid, const T &p_ptr) { + alloc.initialize_rid(p_rid, p_ptr); + } + + _FORCE_INLINE_ T *get_or_null(const RID &p_rid) { + return alloc.get_or_null(p_rid); } _FORCE_INLINE_ bool owns(const RID &p_rid) { @@ -410,7 +500,7 @@ public: void set_description(const char *p_descrption) { alloc.set_description(p_descrption); } - RID_Owner(uint32_t p_target_chunk_byte_size = 4096) : + RID_Owner(uint32_t p_target_chunk_byte_size = 65536) : alloc(p_target_chunk_byte_size) {} }; diff --git a/core/templates/safe_list.h b/core/templates/safe_list.h new file mode 100644 index 0000000000..d8f010663b --- /dev/null +++ b/core/templates/safe_list.h @@ -0,0 +1,375 @@ +/*************************************************************************/ +/* safe_list.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). 
*/ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#ifndef SAFE_LIST_H +#define SAFE_LIST_H + +#include "core/os/memory.h" +#include "core/typedefs.h" +#include <functional> + +#if !defined(NO_THREADS) + +#include <atomic> +#include <type_traits> + +// Design goals for these classes: +// - Accessing this list with an iterator will never result in a use-after free, +// even if the element being accessed has been logically removed from the list on +// another thread. +// - Logical deletion from the list will not result in deallocation at that time, +// instead the node will be deallocated at a later time when it is safe to do so. +// - No blocking synchronization primitives will be used. + +// This is used in very specific areas of the engine where it's critical that these guarantees are held. + +template <class T, class A = DefaultAllocator> +class SafeList { + struct SafeListNode { + std::atomic<SafeListNode *> next = nullptr; + + // If the node is logically deleted, this pointer will typically point + // to the previous list item in time that was also logically deleted. + std::atomic<SafeListNode *> graveyard_next = nullptr; + + std::function<void(T)> deletion_fn = [](T t) { return; }; + + T val; + }; + + static_assert(std::atomic<T>::is_always_lock_free); + + std::atomic<SafeListNode *> head = nullptr; + std::atomic<SafeListNode *> graveyard_head = nullptr; + + std::atomic_uint active_iterator_count = 0; + +public: + class Iterator { + friend class SafeList; + + SafeListNode *cursor; + SafeList *list; + + Iterator(SafeListNode *p_cursor, SafeList *p_list) : + cursor(p_cursor), list(p_list) { + list->active_iterator_count++; + } + + public: + Iterator(const Iterator &p_other) : + cursor(p_other.cursor), list(p_other.list) { + list->active_iterator_count++; + } + + ~Iterator() { + list->active_iterator_count--; + } + + public: + T &operator*() { + return cursor->val; + } + + Iterator &operator++() { + cursor = cursor->next; + return *this; + } + + // These two operators are mostly useful for comparisons to nullptr. + bool operator==(const void *p_other) const { + return cursor == p_other; + } + + bool operator!=(const void *p_other) const { + return cursor != p_other; + } + + // These two allow easy range-based for loops. 
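+		// e.g. a plain range-based for works directly (sketch):
+		//
+		//   SafeList<int> list;
+		//   list.insert(42);
+		//   for (int &value : list) {
+		//       // use value
+		//   }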
+ bool operator==(const Iterator &p_other) const { + return cursor == p_other.cursor; + } + + bool operator!=(const Iterator &p_other) const { + return cursor != p_other.cursor; + } + }; + +public: + // Calling this will cause an allocation. + void insert(T p_value) { + SafeListNode *new_node = memnew_allocator(SafeListNode, A); + new_node->val = p_value; + SafeListNode *expected_head = nullptr; + do { + expected_head = head.load(); + new_node->next.store(expected_head); + } while (!head.compare_exchange_strong(/* expected= */ expected_head, /* new= */ new_node)); + } + + Iterator find(T p_value) { + for (Iterator it = begin(); it != end(); ++it) { + if (*it == p_value) { + return it; + } + } + return end(); + } + + void erase(T p_value, std::function<void(T)> p_deletion_fn) { + Iterator tmp = find(p_value); + erase(tmp, p_deletion_fn); + } + + void erase(T p_value) { + Iterator tmp = find(p_value); + erase(tmp, [](T t) { return; }); + } + + void erase(Iterator &p_iterator, std::function<void(T)> p_deletion_fn) { + p_iterator.cursor->deletion_fn = p_deletion_fn; + erase(p_iterator); + } + + void erase(Iterator &p_iterator) { + if (find(p_iterator.cursor->val) == nullptr) { + // Not in the list, nothing to do. + return; + } + // First, remove the node from the list. + while (true) { + Iterator prev = begin(); + SafeListNode *expected_head = prev.cursor; + for (; prev != end(); ++prev) { + if (prev.cursor && prev.cursor->next == p_iterator.cursor) { + break; + } + } + if (prev != end()) { + // There exists a node before this. + prev.cursor->next.store(p_iterator.cursor->next.load()); + // Done. + break; + } else { + if (head.compare_exchange_strong(/* expected= */ expected_head, /* new= */ p_iterator.cursor->next.load())) { + // Successfully reassigned the head pointer before another thread changed it to something else. + break; + } + // Fall through upon failure, try again. + } + } + // Then queue it for deletion by putting it in the node graveyard. + // Don't touch `next` because an iterator might still be pointing at this node. + SafeListNode *expected_head = nullptr; + do { + expected_head = graveyard_head.load(); + p_iterator.cursor->graveyard_next.store(expected_head); + } while (!graveyard_head.compare_exchange_strong(/* expected= */ expected_head, /* new= */ p_iterator.cursor)); + } + + Iterator begin() { + return Iterator(head.load(), this); + } + + Iterator end() { + return Iterator(nullptr, this); + } + + // Calling this will cause zero to many deallocations. + void maybe_cleanup() { + SafeListNode *cursor = nullptr; + SafeListNode *new_graveyard_head = nullptr; + do { + // The access order here is theoretically important. + cursor = graveyard_head.load(); + if (active_iterator_count.load() != 0) { + // It's not safe to clean up with an active iterator, because that iterator + // could be pointing to an element that we want to delete. + return; + } + // Any iterator created after this point will never point to a deleted node. + // Swap it out with the current graveyard head. + } while (!graveyard_head.compare_exchange_strong(/* expected= */ cursor, /* new= */ new_graveyard_head)); + // Our graveyard list is now unreachable by any active iterators, + // detached from the main graveyard head and ready for deletion. + while (cursor) { + SafeListNode *tmp = cursor; + cursor = cursor->graveyard_next; + tmp->deletion_fn(tmp->val); + memdelete_allocator<SafeListNode, A>(tmp); + } + } +}; + +#else // NO_THREADS + +// Effectively the same structure without the atomics. 
It's probably possible to simplify it but the semantics shouldn't differ greatly. +template <class T, class A = DefaultAllocator> +class SafeList { + struct SafeListNode { + SafeListNode *next = nullptr; + + // If the node is logically deleted, this pointer will typically point to the previous list item in time that was also logically deleted. + SafeListNode *graveyard_next = nullptr; + + std::function<void(T)> deletion_fn = [](T t) { return; }; + + T val; + }; + + SafeListNode *head = nullptr; + SafeListNode *graveyard_head = nullptr; + + unsigned int active_iterator_count = 0; + +public: + class Iterator { + friend class SafeList; + + SafeListNode *cursor; + SafeList *list; + + public: + Iterator(SafeListNode *p_cursor, SafeList *p_list) : + cursor(p_cursor), list(p_list) { + list->active_iterator_count++; + } + + ~Iterator() { + list->active_iterator_count--; + } + + T &operator*() { + return cursor->val; + } + + Iterator &operator++() { + cursor = cursor->next; + return *this; + } + + // These two operators are mostly useful for comparisons to nullptr. + bool operator==(const void *p_other) const { + return cursor == p_other; + } + + bool operator!=(const void *p_other) const { + return cursor != p_other; + } + + // These two allow easy range-based for loops. + bool operator==(const Iterator &p_other) const { + return cursor == p_other.cursor; + } + + bool operator!=(const Iterator &p_other) const { + return cursor != p_other.cursor; + } + }; + +public: + // Calling this will cause an allocation. + void insert(T p_value) { + SafeListNode *new_node = memnew_allocator(SafeListNode, A); + new_node->val = p_value; + new_node->next = head; + head = new_node; + } + + Iterator find(T p_value) { + for (Iterator it = begin(); it != end(); ++it) { + if (*it == p_value) { + return it; + } + } + return end(); + } + + void erase(T p_value, std::function<void(T)> p_deletion_fn) { + erase(find(p_value), p_deletion_fn); + } + + void erase(T p_value) { + erase(find(p_value), [](T t) { return; }); + } + + void erase(Iterator p_iterator, std::function<void(T)> p_deletion_fn) { + p_iterator.cursor->deletion_fn = p_deletion_fn; + erase(p_iterator); + } + + void erase(Iterator p_iterator) { + Iterator prev = begin(); + for (; prev != end(); ++prev) { + if (prev.cursor && prev.cursor->next == p_iterator.cursor) { + break; + } + } + if (prev == end()) { + // Not in the list, nothing to do. + return; + } + // First, remove the node from the list. + prev.cursor->next = p_iterator.cursor->next; + + // Then queue it for deletion by putting it in the node graveyard. Don't touch `next` because an iterator might still be pointing at this node. + p_iterator.cursor->graveyard_next = graveyard_head; + graveyard_head = p_iterator.cursor; + } + + Iterator begin() { + return Iterator(head, this); + } + + Iterator end() { + return Iterator(nullptr, this); + } + + // Calling this will cause zero to many deallocations. + void maybe_cleanup() { + SafeListNode *cursor = graveyard_head; + if (active_iterator_count != 0) { + // It's not safe to clean up with an active iterator, because that iterator could be pointing to an element that we want to delete. + return; + } + graveyard_head = nullptr; + // Our graveyard list is now unreachable by any active iterators, detached from the main graveyard head and ready for deletion. 
+ while (cursor) { + SafeListNode *tmp = cursor; + cursor = cursor->next; + tmp->deletion_fn(tmp->val); + memdelete_allocator<SafeListNode, A>(tmp); + } + } +}; + +#endif + +#endif // SAFE_LIST_H diff --git a/core/templates/safe_refcount.cpp b/core/templates/safe_refcount.cpp deleted file mode 100644 index a915ee662f..0000000000 --- a/core/templates/safe_refcount.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/*************************************************************************/ -/* safe_refcount.cpp */ -/*************************************************************************/ -/* This file is part of: */ -/* GODOT ENGINE */ -/* https://godotengine.org */ -/*************************************************************************/ -/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ -/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ -/* */ -/* Permission is hereby granted, free of charge, to any person obtaining */ -/* a copy of this software and associated documentation files (the */ -/* "Software"), to deal in the Software without restriction, including */ -/* without limitation the rights to use, copy, modify, merge, publish, */ -/* distribute, sublicense, and/or sell copies of the Software, and to */ -/* permit persons to whom the Software is furnished to do so, subject to */ -/* the following conditions: */ -/* */ -/* The above copyright notice and this permission notice shall be */ -/* included in all copies or substantial portions of the Software. */ -/* */ -/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ -/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ -/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ -/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ -/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ -/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ -/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -/*************************************************************************/ - -#include "safe_refcount.h" - -#if defined(_MSC_VER) - -/* Implementation for MSVC-Windows */ - -// don't pollute my namespace! 
-#include <windows.h> - -#define ATOMIC_CONDITIONAL_INCREMENT_BODY(m_pw, m_win_type, m_win_cmpxchg, m_cpp_type) \ - /* try to increment until it actually works */ \ - /* taken from boost */ \ - while (true) { \ - m_cpp_type tmp = static_cast<m_cpp_type const volatile &>(*(m_pw)); \ - if (tmp == 0) { \ - return 0; /* if zero, can't add to it anymore */ \ - } \ - if (m_win_cmpxchg((m_win_type volatile *)(m_pw), tmp + 1, tmp) == tmp) { \ - return tmp + 1; \ - } \ - } - -#define ATOMIC_EXCHANGE_IF_GREATER_BODY(m_pw, m_val, m_win_type, m_win_cmpxchg, m_cpp_type) \ - while (true) { \ - m_cpp_type tmp = static_cast<m_cpp_type const volatile &>(*(m_pw)); \ - if (tmp >= m_val) { \ - return tmp; /* already greater, or equal */ \ - } \ - if (m_win_cmpxchg((m_win_type volatile *)(m_pw), m_val, tmp) == tmp) { \ - return m_val; \ - } \ - } - -_ALWAYS_INLINE_ uint32_t _atomic_conditional_increment_impl(volatile uint32_t *pw) { - ATOMIC_CONDITIONAL_INCREMENT_BODY(pw, LONG, InterlockedCompareExchange, uint32_t); -} - -_ALWAYS_INLINE_ uint32_t _atomic_decrement_impl(volatile uint32_t *pw) { - return InterlockedDecrement((LONG volatile *)pw); -} - -_ALWAYS_INLINE_ uint32_t _atomic_increment_impl(volatile uint32_t *pw) { - return InterlockedIncrement((LONG volatile *)pw); -} - -_ALWAYS_INLINE_ uint32_t _atomic_sub_impl(volatile uint32_t *pw, volatile uint32_t val) { - return InterlockedExchangeAdd((LONG volatile *)pw, -(int32_t)val) - val; -} - -_ALWAYS_INLINE_ uint32_t _atomic_add_impl(volatile uint32_t *pw, volatile uint32_t val) { - return InterlockedAdd((LONG volatile *)pw, val); -} - -_ALWAYS_INLINE_ uint32_t _atomic_exchange_if_greater_impl(volatile uint32_t *pw, volatile uint32_t val) { - ATOMIC_EXCHANGE_IF_GREATER_BODY(pw, val, LONG, InterlockedCompareExchange, uint32_t); -} - -_ALWAYS_INLINE_ uint64_t _atomic_conditional_increment_impl(volatile uint64_t *pw) { - ATOMIC_CONDITIONAL_INCREMENT_BODY(pw, LONGLONG, InterlockedCompareExchange64, uint64_t); -} - -_ALWAYS_INLINE_ uint64_t _atomic_decrement_impl(volatile uint64_t *pw) { - return InterlockedDecrement64((LONGLONG volatile *)pw); -} - -_ALWAYS_INLINE_ uint64_t _atomic_increment_impl(volatile uint64_t *pw) { - return InterlockedIncrement64((LONGLONG volatile *)pw); -} - -_ALWAYS_INLINE_ uint64_t _atomic_sub_impl(volatile uint64_t *pw, volatile uint64_t val) { - return InterlockedExchangeAdd64((LONGLONG volatile *)pw, -(int64_t)val) - val; -} - -_ALWAYS_INLINE_ uint64_t _atomic_add_impl(volatile uint64_t *pw, volatile uint64_t val) { - return InterlockedAdd64((LONGLONG volatile *)pw, val); -} - -_ALWAYS_INLINE_ uint64_t _atomic_exchange_if_greater_impl(volatile uint64_t *pw, volatile uint64_t val) { - ATOMIC_EXCHANGE_IF_GREATER_BODY(pw, val, LONGLONG, InterlockedCompareExchange64, uint64_t); -} - -// The actual advertised functions; they'll call the right implementation - -uint32_t atomic_conditional_increment(volatile uint32_t *pw) { - return _atomic_conditional_increment_impl(pw); -} - -uint32_t atomic_decrement(volatile uint32_t *pw) { - return _atomic_decrement_impl(pw); -} - -uint32_t atomic_increment(volatile uint32_t *pw) { - return _atomic_increment_impl(pw); -} - -uint32_t atomic_sub(volatile uint32_t *pw, volatile uint32_t val) { - return _atomic_sub_impl(pw, val); -} - -uint32_t atomic_add(volatile uint32_t *pw, volatile uint32_t val) { - return _atomic_add_impl(pw, val); -} - -uint32_t atomic_exchange_if_greater(volatile uint32_t *pw, volatile uint32_t val) { - return _atomic_exchange_if_greater_impl(pw, val); -} - -uint64_t 
atomic_conditional_increment(volatile uint64_t *pw) { - return _atomic_conditional_increment_impl(pw); -} - -uint64_t atomic_decrement(volatile uint64_t *pw) { - return _atomic_decrement_impl(pw); -} - -uint64_t atomic_increment(volatile uint64_t *pw) { - return _atomic_increment_impl(pw); -} - -uint64_t atomic_sub(volatile uint64_t *pw, volatile uint64_t val) { - return _atomic_sub_impl(pw, val); -} - -uint64_t atomic_add(volatile uint64_t *pw, volatile uint64_t val) { - return _atomic_add_impl(pw, val); -} - -uint64_t atomic_exchange_if_greater(volatile uint64_t *pw, volatile uint64_t val) { - return _atomic_exchange_if_greater_impl(pw, val); -} -#endif diff --git a/core/templates/safe_refcount.h b/core/templates/safe_refcount.h index d94444fad6..e9e5695f80 100644 --- a/core/templates/safe_refcount.h +++ b/core/templates/safe_refcount.h @@ -31,167 +31,292 @@ #ifndef SAFE_REFCOUNT_H #define SAFE_REFCOUNT_H -#include "core/os/mutex.h" #include "core/typedefs.h" -#include "platform_config.h" -// Atomic functions, these are used for multithread safe reference counters! +#if !defined(NO_THREADS) -#ifdef NO_THREADS +#include <atomic> +#include <type_traits> -/* Bogus implementation unaware of multiprocessing */ +// Design goals for these classes: +// - No automatic conversions or arithmetic operators, +// to keep explicit the use of atomics everywhere. +// - Using acquire-release semantics, even to set the first value. +// The first value may be set relaxedly in many cases, but adding the distinction +// between relaxed and unrelaxed operation to the interface would make it needlessly +// flexible. There's negligible waste in having release semantics for the initial +// value and, as an important benefit, you can be sure the value is properly synchronized +// even with threads that are already running. 
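The design comment above has one practical payoff worth spelling out: because even the initial set() is a release store, a SafeNumeric can be handed to threads that are already running without any extra fencing. A hedged sketch against the interface declared below:

	SafeNumeric<uint32_t> counter(100); // even the first value is stored with release semantics

	// On any other thread that can reach `counter`:
	uint32_t seen = counter.get(); // acquire load; pairs with the release store
	uint32_t now = counter.increment(); // acq_rel read-modify-write; returns the new value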
+ +// This is used in very specific areas of the engine where it's critical that these guarantees are held +#define SAFE_NUMERIC_TYPE_PUN_GUARANTEES(m_type) \ + static_assert(sizeof(SafeNumeric<m_type>) == sizeof(m_type)); \ + static_assert(alignof(SafeNumeric<m_type>) == alignof(m_type)); \ + static_assert(std::is_trivially_destructible<std::atomic<m_type>>::value); template <class T> -static _ALWAYS_INLINE_ T atomic_conditional_increment(volatile T *pw) { - if (*pw == 0) { - return 0; +class SafeNumeric { + std::atomic<T> value; + + static_assert(std::atomic<T>::is_always_lock_free); + +public: + _ALWAYS_INLINE_ void set(T p_value) { + value.store(p_value, std::memory_order_release); } - (*pw)++; + _ALWAYS_INLINE_ T get() const { + return value.load(std::memory_order_acquire); + } - return *pw; -} + _ALWAYS_INLINE_ T increment() { + return value.fetch_add(1, std::memory_order_acq_rel) + 1; + } -template <class T> -static _ALWAYS_INLINE_ T atomic_decrement(volatile T *pw) { - (*pw)--; + // Returns the original value instead of the new one + _ALWAYS_INLINE_ T postincrement() { + return value.fetch_add(1, std::memory_order_acq_rel); + } - return *pw; -} + _ALWAYS_INLINE_ T decrement() { + return value.fetch_sub(1, std::memory_order_acq_rel) - 1; + } -template <class T> -static _ALWAYS_INLINE_ T atomic_increment(volatile T *pw) { - (*pw)++; + // Returns the original value instead of the new one + _ALWAYS_INLINE_ T postdecrement() { + return value.fetch_sub(1, std::memory_order_acq_rel); + } - return *pw; -} + _ALWAYS_INLINE_ T add(T p_value) { + return value.fetch_add(p_value, std::memory_order_acq_rel) + p_value; + } -template <class T, class V> -static _ALWAYS_INLINE_ T atomic_sub(volatile T *pw, volatile V val) { - (*pw) -= val; + // Returns the original value instead of the new one + _ALWAYS_INLINE_ T postadd(T p_value) { + return value.fetch_add(p_value, std::memory_order_acq_rel); + } - return *pw; -} + _ALWAYS_INLINE_ T sub(T p_value) { + return value.fetch_sub(p_value, std::memory_order_acq_rel) - p_value; + } -template <class T, class V> -static _ALWAYS_INLINE_ T atomic_add(volatile T *pw, volatile V val) { - (*pw) += val; + // Returns the original value instead of the new one + _ALWAYS_INLINE_ T postsub(T p_value) { + return value.fetch_sub(p_value, std::memory_order_acq_rel); + } - return *pw; -} + _ALWAYS_INLINE_ T exchange_if_greater(T p_value) { + while (true) { + T tmp = value.load(std::memory_order_acquire); + if (tmp >= p_value) { + return tmp; // already greater, or equal + } + if (value.compare_exchange_weak(tmp, p_value, std::memory_order_release)) { + return p_value; + } + } + } -template <class T, class V> -static _ALWAYS_INLINE_ T atomic_exchange_if_greater(volatile T *pw, volatile V val) { - if (val > *pw) { - *pw = val; + _ALWAYS_INLINE_ T conditional_increment() { + while (true) { + T c = value.load(std::memory_order_acquire); + if (c == 0) { + return 0; + } + if (value.compare_exchange_weak(c, c + 1, std::memory_order_release)) { + return c + 1; + } + } } - return *pw; -} + _ALWAYS_INLINE_ explicit SafeNumeric<T>(T p_value = static_cast<T>(0)) { + set(p_value); + } +}; -#elif defined(__GNUC__) +class SafeFlag { + std::atomic_bool flag; -/* Implementation for GCC & Clang */ + static_assert(std::atomic_bool::is_always_lock_free); -// GCC guarantees atomic intrinsics for sizes of 1, 2, 4 and 8 bytes. -// Clang states it supports GCC atomic builtins. 
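All the compiler-specific bodies being deleted here — the MSVC Interlocked* wrappers earlier, the GCC __sync builtins whose removal continues below — implement the same compare-and-swap loop that <atomic> now expresses portably. A rough sketch of the equivalence, for illustration only:

	#include <atomic>

	// What the __sync_val_compare_and_swap / InterlockedCompareExchange loops become:
	template <class T>
	T conditional_increment(std::atomic<T> &value) {
		while (true) {
			T tmp = value.load(std::memory_order_acquire);
			if (tmp == 0) {
				return 0; // once the count hits zero, it must never be revived
			}
			// compare_exchange_weak may fail spuriously; the loop simply retries.
			if (value.compare_exchange_weak(tmp, tmp + 1, std::memory_order_release)) {
				return tmp + 1;
			}
		}
	}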
+public: + _ALWAYS_INLINE_ bool is_set() const { + return flag.load(std::memory_order_acquire); + } -template <class T> -static _ALWAYS_INLINE_ T atomic_conditional_increment(volatile T *pw) { - while (true) { - T tmp = static_cast<T const volatile &>(*pw); - if (tmp == 0) { - return 0; // if zero, can't add to it anymore - } - if (__sync_val_compare_and_swap(pw, tmp, tmp + 1) == tmp) { - return tmp + 1; - } + _ALWAYS_INLINE_ void set() { + flag.store(true, std::memory_order_release); } -} -template <class T> -static _ALWAYS_INLINE_ T atomic_decrement(volatile T *pw) { - return __sync_sub_and_fetch(pw, 1); -} + _ALWAYS_INLINE_ void clear() { + flag.store(false, std::memory_order_release); + } + + _ALWAYS_INLINE_ void set_to(bool p_value) { + flag.store(p_value, std::memory_order_release); + } + + _ALWAYS_INLINE_ explicit SafeFlag(bool p_value = false) { + set_to(p_value); + } +}; + +class SafeRefCount { + SafeNumeric<uint32_t> count; + +public: + _ALWAYS_INLINE_ bool ref() { // true on success + return count.conditional_increment() != 0; + } + + _ALWAYS_INLINE_ uint32_t refval() { // none-zero on success + return count.conditional_increment(); + } + + _ALWAYS_INLINE_ bool unref() { // true if must be disposed of + return count.decrement() == 0; + } + + _ALWAYS_INLINE_ uint32_t unrefval() { // 0 if must be disposed of + return count.decrement(); + } + + _ALWAYS_INLINE_ uint32_t get() const { + return count.get(); + } + + _ALWAYS_INLINE_ void init(uint32_t p_value = 1) { + count.set(p_value); + } +}; + +#else template <class T> -static _ALWAYS_INLINE_ T atomic_increment(volatile T *pw) { - return __sync_add_and_fetch(pw, 1); -} - -template <class T, class V> -static _ALWAYS_INLINE_ T atomic_sub(volatile T *pw, volatile V val) { - return __sync_sub_and_fetch(pw, val); -} - -template <class T, class V> -static _ALWAYS_INLINE_ T atomic_add(volatile T *pw, volatile V val) { - return __sync_add_and_fetch(pw, val); -} - -template <class T, class V> -static _ALWAYS_INLINE_ T atomic_exchange_if_greater(volatile T *pw, volatile V val) { - while (true) { - T tmp = static_cast<T const volatile &>(*pw); - if (tmp >= val) { - return tmp; // already greater, or equal +class SafeNumeric { +protected: + T value; + +public: + _ALWAYS_INLINE_ void set(T p_value) { + value = p_value; + } + + _ALWAYS_INLINE_ T get() const { + return value; + } + + _ALWAYS_INLINE_ T increment() { + return ++value; + } + + _ALWAYS_INLINE_ T postincrement() { + return value++; + } + + _ALWAYS_INLINE_ T decrement() { + return --value; + } + + _ALWAYS_INLINE_ T postdecrement() { + return value--; + } + + _ALWAYS_INLINE_ T add(T p_value) { + return value += p_value; + } + + _ALWAYS_INLINE_ T postadd(T p_value) { + T old = value; + value += p_value; + return old; + } + + _ALWAYS_INLINE_ T sub(T p_value) { + return value -= p_value; + } + + _ALWAYS_INLINE_ T postsub(T p_value) { + T old = value; + value -= p_value; + return old; + } + + _ALWAYS_INLINE_ T exchange_if_greater(T p_value) { + if (value < p_value) { + value = p_value; } - if (__sync_val_compare_and_swap(pw, tmp, val) == tmp) { - return val; + return value; + } + + _ALWAYS_INLINE_ T conditional_increment() { + if (value == 0) { + return 0; + } else { + return ++value; } } -} - -#elif defined(_MSC_VER) -// For MSVC use a separate compilation unit to prevent windows.h from polluting -// the global namespace. 
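The new SafeRefCount shown above keeps the same protocol the deleted free functions implemented: ref() refuses to revive an object whose count has already reached zero, and whichever caller sees unref() return true owns disposal. A sketch of the canonical call pattern; the surrounding type is hypothetical:

	struct Thing {
		SafeRefCount refcount;
		Thing() { refcount.init(); } // count starts at 1
	};

	void share(Thing *p_thing) {
		if (!p_thing->refcount.ref()) {
			return; // count already hit zero; the object is being torn down
		}
		// ... safe to use p_thing here ...
		if (p_thing->refcount.unref()) {
			memdelete(p_thing); // we dropped the last reference, so we dispose of it
		}
	}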
-uint32_t atomic_conditional_increment(volatile uint32_t *pw); -uint32_t atomic_decrement(volatile uint32_t *pw); -uint32_t atomic_increment(volatile uint32_t *pw); -uint32_t atomic_sub(volatile uint32_t *pw, volatile uint32_t val); -uint32_t atomic_add(volatile uint32_t *pw, volatile uint32_t val); -uint32_t atomic_exchange_if_greater(volatile uint32_t *pw, volatile uint32_t val); - -uint64_t atomic_conditional_increment(volatile uint64_t *pw); -uint64_t atomic_decrement(volatile uint64_t *pw); -uint64_t atomic_increment(volatile uint64_t *pw); -uint64_t atomic_sub(volatile uint64_t *pw, volatile uint64_t val); -uint64_t atomic_add(volatile uint64_t *pw, volatile uint64_t val); -uint64_t atomic_exchange_if_greater(volatile uint64_t *pw, volatile uint64_t val); -#else -//no threads supported? -#error Must provide atomic functions for this platform or compiler! -#endif + _ALWAYS_INLINE_ explicit SafeNumeric<T>(T p_value = static_cast<T>(0)) : + value(p_value) { + } +}; -struct SafeRefCount { - uint32_t count = 0; +class SafeFlag { +protected: + bool flag; public: - // destroy() is called when weak_count_ drops to zero. - - _ALWAYS_INLINE_ bool ref() { // true on success + _ALWAYS_INLINE_ bool is_set() const { + return flag; + } - return atomic_conditional_increment(&count) != 0; + _ALWAYS_INLINE_ void set() { + flag = true; } - _ALWAYS_INLINE_ uint32_t refval() { // none-zero on success + _ALWAYS_INLINE_ void clear() { + flag = false; + } - return atomic_conditional_increment(&count); + _ALWAYS_INLINE_ void set_to(bool p_value) { + flag = p_value; } - _ALWAYS_INLINE_ bool unref() { // true if must be disposed of + _ALWAYS_INLINE_ explicit SafeFlag(bool p_value = false) : + flag(p_value) {} +}; + +class SafeRefCount { + uint32_t count = 0; - return atomic_decrement(&count) == 0; +public: + _ALWAYS_INLINE_ bool ref() { // true on success + if (count != 0) { + ++count; + return true; + } else { + return false; + } } - _ALWAYS_INLINE_ uint32_t unrefval() { // 0 if must be disposed of + _ALWAYS_INLINE_ uint32_t refval() { // none-zero on success + if (count != 0) { + return ++count; + } else { + return 0; + } + } - return atomic_decrement(&count); + _ALWAYS_INLINE_ bool unref() { // true if must be disposed of + return --count == 0; } - _ALWAYS_INLINE_ uint32_t get() const { // nothrow + _ALWAYS_INLINE_ uint32_t unrefval() { // 0 if must be disposed of + return --count; + } + _ALWAYS_INLINE_ uint32_t get() const { return count; } @@ -200,4 +325,6 @@ public: } }; +#endif + #endif // SAFE_REFCOUNT_H diff --git a/core/templates/search_array.h b/core/templates/search_array.h new file mode 100644 index 0000000000..8efc32df82 --- /dev/null +++ b/core/templates/search_array.h @@ -0,0 +1,67 @@ +/*************************************************************************/ +/* search_array.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). 
*/ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#ifndef SEARCH_ARRAY_H +#define SEARCH_ARRAY_H + +#include <core/templates/sort_array.h> + +template <class T, class Comparator = _DefaultComparator<T>> +class SearchArray { +public: + Comparator compare; + + inline int bisect(const T *p_array, int p_len, const T &p_value, bool p_before) const { + int lo = 0; + int hi = p_len; + if (p_before) { + while (lo < hi) { + const int mid = (lo + hi) / 2; + if (compare(p_array[mid], p_value)) { + lo = mid + 1; + } else { + hi = mid; + } + } + } else { + while (lo < hi) { + const int mid = (lo + hi) / 2; + if (compare(p_value, p_array[mid])) { + hi = mid; + } else { + lo = mid + 1; + } + } + } + return lo; + } +}; + +#endif // SEARCH_ARRAY_H diff --git a/core/templates/set.h b/core/templates/set.h index 3036ecf27d..0a80ceefb5 100644 --- a/core/templates/set.h +++ b/core/templates/set.h @@ -35,7 +35,7 @@ #include "core/typedefs.h" // based on the very nice implementation of rb-trees by: -// https://web.archive.org/web/20120507164830/http://web.mit.edu/~emin/www/source_code/red_black_tree/index.html +// https://web.archive.org/web/20120507164830/https://web.mit.edu/~emin/www/source_code/red_black_tree/index.html template <class T, class C = Comparator<T>, class A = DefaultAllocator> class Set { @@ -71,12 +71,94 @@ public: Element *prev() { return _prev; } + T &get() { + return value; + } const T &get() const { return value; }; Element() {} }; + typedef T ValueType; + + struct Iterator { + _FORCE_INLINE_ T &operator*() const { + return E->get(); + } + _FORCE_INLINE_ T *operator->() const { return &E->get(); } + _FORCE_INLINE_ Iterator &operator++() { + E = E->next(); + return *this; + } + _FORCE_INLINE_ Iterator &operator--() { + E = E->prev(); + return *this; + } + + _FORCE_INLINE_ bool operator==(const Iterator &b) const { return E == b.E; } + _FORCE_INLINE_ bool operator!=(const Iterator &b) const { return E != b.E; } + + Iterator(Element *p_E) { E = p_E; } + Iterator() {} + Iterator(const Iterator &p_it) { E = p_it.E; } + + private: + Element *E = nullptr; + }; + + struct ConstIterator { + _FORCE_INLINE_ const T &operator*() const { + return E->get(); + } + _FORCE_INLINE_ const T *operator->() const { return &E->get(); } + _FORCE_INLINE_ ConstIterator &operator++() { + E = E->next(); + return *this; + } + _FORCE_INLINE_ ConstIterator 
&operator--() { + E = E->prev(); + return *this; + } + + _FORCE_INLINE_ bool operator==(const ConstIterator &b) const { return E == b.E; } + _FORCE_INLINE_ bool operator!=(const ConstIterator &b) const { return E != b.E; } + + _FORCE_INLINE_ ConstIterator(const Element *p_E) { E = p_E; } + _FORCE_INLINE_ ConstIterator() {} + _FORCE_INLINE_ ConstIterator(const ConstIterator &p_it) { E = p_it.E; } + + private: + const Element *E = nullptr; + }; + + _FORCE_INLINE_ Iterator begin() { + return Iterator(front()); + } + _FORCE_INLINE_ Iterator end() { + return Iterator(nullptr); + } + +#if 0 + //to use when replacing find() + _FORCE_INLINE_ Iterator find(const K &p_key) { + return Iterator(find(p_key)); + } +#endif + + _FORCE_INLINE_ ConstIterator begin() const { + return ConstIterator(front()); + } + _FORCE_INLINE_ ConstIterator end() const { + return ConstIterator(nullptr); + } + +#if 0 + //to use when replacing find() + _FORCE_INLINE_ ConstIterator find(const K &p_key) const { + return ConstIterator(find(p_key)); + } +#endif private: struct _Data { Element *_root = nullptr; diff --git a/core/templates/thread_work_pool.cpp b/core/templates/thread_work_pool.cpp index ea784e510c..17969a2c90 100644 --- a/core/templates/thread_work_pool.cpp +++ b/core/templates/thread_work_pool.cpp @@ -32,14 +32,15 @@ #include "core/os/os.h" -void ThreadWorkPool::_thread_function(ThreadData *p_thread) { +void ThreadWorkPool::_thread_function(void *p_user) { + ThreadData *thread = static_cast<ThreadData *>(p_user); while (true) { - p_thread->start.wait(); - if (p_thread->exit.load()) { + thread->start.wait(); + if (thread->exit.load()) { break; } - p_thread->work->work(); - p_thread->completed.post(); + thread->work->work(); + thread->completed.post(); } } @@ -54,7 +55,7 @@ void ThreadWorkPool::init(int p_thread_count) { for (uint32_t i = 0; i < thread_count; i++) { threads[i].exit.store(false); - threads[i].thread = memnew(std::thread(ThreadWorkPool::_thread_function, &threads[i])); + threads[i].thread.start(&ThreadWorkPool::_thread_function, &threads[i]); } } @@ -68,8 +69,7 @@ void ThreadWorkPool::finish() { threads[i].start.post(); } for (uint32_t i = 0; i < thread_count; i++) { - threads[i].thread->join(); - memdelete(threads[i].thread); + threads[i].thread.wait_to_finish(); } memdelete_arr(threads); diff --git a/core/templates/thread_work_pool.h b/core/templates/thread_work_pool.h index 02d941d0f4..b242648bc8 100644 --- a/core/templates/thread_work_pool.h +++ b/core/templates/thread_work_pool.h @@ -33,9 +33,9 @@ #include "core/os/memory.h" #include "core/os/semaphore.h" +#include "core/os/thread.h" #include <atomic> -#include <thread> class ThreadWorkPool { std::atomic<uint32_t> index; @@ -64,7 +64,7 @@ class ThreadWorkPool { }; struct ThreadData { - std::thread *thread; + Thread thread; Semaphore start; Semaphore completed; std::atomic<bool> exit; @@ -75,7 +75,7 @@ class ThreadWorkPool { uint32_t thread_count = 0; BaseWork *current_work = nullptr; - static void _thread_function(ThreadData *p_thread); + static void _thread_function(void *p_user); public: template <class C, class M, class U> @@ -83,7 +83,7 @@ public: ERR_FAIL_COND(!threads); //never initialized ERR_FAIL_COND(current_work != nullptr); - index.store(0); + index.store(0, std::memory_order_release); Work<C, M, U> *w = memnew((Work<C, M, U>)); w->instance = p_instance; @@ -104,8 +104,15 @@ public: return current_work != nullptr; } + bool is_done_dispatching() const { + ERR_FAIL_COND_V(current_work == nullptr, true); + return 
index.load(std::memory_order_acquire) >= current_work->max_elements; + } + uint32_t get_work_index() const { - return index; + ERR_FAIL_COND_V(current_work == nullptr, 0); + uint32_t idx = index.load(std::memory_order_acquire); + return MIN(idx, current_work->max_elements); } void end_work() { @@ -125,6 +132,7 @@ public: end_work(); } + _FORCE_INLINE_ int get_thread_count() const { return thread_count; } void init(int p_thread_count = -1); void finish(); ~ThreadWorkPool(); diff --git a/core/templates/vector.h b/core/templates/vector.h index 6a8902707c..4b008a45a4 100644 --- a/core/templates/vector.h +++ b/core/templates/vector.h @@ -38,9 +38,9 @@ */ #include "core/error/error_macros.h" -#include "core/os/copymem.h" #include "core/os/memory.h" #include "core/templates/cowdata.h" +#include "core/templates/search_array.h" #include "core/templates/sort_array.h" template <class T> @@ -66,6 +66,7 @@ private: public: bool push_back(T p_elem); _FORCE_INLINE_ bool append(const T &p_elem) { return push_back(p_elem); } //alias + void fill(T p_elem); void remove(int p_index) { _cowdata.remove(p_index); } void erase(const T &p_val) { @@ -74,7 +75,7 @@ public: remove(idx); } } - void invert(); + void reverse(); _FORCE_INLINE_ T *ptrw() { return _cowdata.ptrw(); } _FORCE_INLINE_ const T *ptr() const { return _cowdata.ptr(); } @@ -92,7 +93,7 @@ public: void append_array(Vector<T> p_other); - bool has(const T &p_val) { + bool has(const T &p_val) const { return find(p_val, 0) != -1; } @@ -112,6 +113,11 @@ public: sort_custom<_DefaultComparator<T>>(); } + int bsearch(const T &p_value, bool p_before) { + SearchArray<T> search; + return search.bisect(ptrw(), size(), p_value, p_before); + } + Vector<T> duplicate() { return *this; } @@ -134,7 +140,7 @@ public: Vector<uint8_t> to_byte_array() const { Vector<uint8_t> ret; ret.resize(size() * sizeof(T)); - copymem(ret.ptrw(), ptr(), sizeof(T) * size()); + memcpy(ret.ptrw(), ptr(), sizeof(T) * size()); return ret; } @@ -187,6 +193,70 @@ public: return false; } + struct Iterator { + _FORCE_INLINE_ T &operator*() const { + return *elem_ptr; + } + _FORCE_INLINE_ T *operator->() const { return elem_ptr; } + _FORCE_INLINE_ Iterator &operator++() { + elem_ptr++; + return *this; + } + _FORCE_INLINE_ Iterator &operator--() { + elem_ptr--; + return *this; + } + + _FORCE_INLINE_ bool operator==(const Iterator &b) const { return elem_ptr == b.elem_ptr; } + _FORCE_INLINE_ bool operator!=(const Iterator &b) const { return elem_ptr != b.elem_ptr; } + + Iterator(T *p_ptr) { elem_ptr = p_ptr; } + Iterator() {} + Iterator(const Iterator &p_it) { elem_ptr = p_it.elem_ptr; } + + private: + T *elem_ptr = nullptr; + }; + + struct ConstIterator { + _FORCE_INLINE_ const T &operator*() const { + return *elem_ptr; + } + _FORCE_INLINE_ const T *operator->() const { return elem_ptr; } + _FORCE_INLINE_ ConstIterator &operator++() { + elem_ptr++; + return *this; + } + _FORCE_INLINE_ ConstIterator &operator--() { + elem_ptr--; + return *this; + } + + _FORCE_INLINE_ bool operator==(const ConstIterator &b) const { return elem_ptr == b.elem_ptr; } + _FORCE_INLINE_ bool operator!=(const ConstIterator &b) const { return elem_ptr != b.elem_ptr; } + + ConstIterator(const T *p_ptr) { elem_ptr = p_ptr; } + ConstIterator() {} + ConstIterator(const ConstIterator &p_it) { elem_ptr = p_it.elem_ptr; } + + private: + const T *elem_ptr = nullptr; + }; + + _FORCE_INLINE_ Iterator begin() { + return Iterator(ptrw()); + } + _FORCE_INLINE_ Iterator end() { + return Iterator(ptrw() + size()); + } + + _FORCE_INLINE_ 
ConstIterator begin() const { + return ConstIterator(ptr()); + } + _FORCE_INLINE_ ConstIterator end() const { + return ConstIterator(ptr() + size()); + } + _FORCE_INLINE_ Vector() {} _FORCE_INLINE_ Vector(const Vector &p_from) { _cowdata._ref(p_from._cowdata); } @@ -194,7 +264,7 @@ public: }; template <class T> -void Vector<T>::invert() { +void Vector<T>::reverse() { for (int i = 0; i < size() / 2; i++) { T *p = ptrw(); SWAP(p[i], p[size() - i - 1]); @@ -223,4 +293,12 @@ bool Vector<T>::push_back(T p_elem) { return false; } +template <class T> +void Vector<T>::fill(T p_elem) { + T *p = ptrw(); + for (int i = 0; i < size(); i++) { + p[i] = p_elem; + } +} + #endif // VECTOR_H |
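Summing up the Vector changes — fill(), the invert() → reverse() rename, bsearch() on a sorted vector, and the new iterators — a short usage sketch (values are illustrative):

	Vector<int> v;
	v.resize(4);
	v.fill(0); // { 0, 0, 0, 0 }
	v.set(0, 3);
	v.set(1, 1); // { 3, 1, 0, 0 }
	v.reverse(); // { 0, 0, 1, 3 }, formerly invert()

	for (const int &E : v) { // range-based for via the new begin()/end()
		// ... visits each element in order ...
	}

	// With the vector sorted ascending, index of the first element >= 2 (p_before = true):
	int idx = v.bsearch(2, true); // 3 here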