4 files changed, 138 insertions, 313 deletions
diff --git a/core/object/worker_thread_pool.cpp b/core/object/worker_thread_pool.cpp
index c276802f99..54738a673e 100644
--- a/core/object/worker_thread_pool.cpp
+++ b/core/object/worker_thread_pool.cpp
@@ -32,6 +32,13 @@
 
 #include "core/os/os.h"
 
+void WorkerThreadPool::Task::free_template_userdata() {
+	// Frees the template payload held by this task; native_func_userdata is a separate caller-supplied pointer and is left untouched.
+	ERR_FAIL_COND(!template_userdata);
+	memdelete(template_userdata); // BaseTemplateUserdata has a virtual destructor, so the derived payload is destroyed correctly.
+	template_userdata = nullptr; // Clear the field so this task cannot free the same object twice.
+}
+
 WorkerThreadPool *WorkerThreadPool::singleton = nullptr;
 
 void WorkerThreadPool::_process_task_queue() {
@@ -48,30 +55,36 @@ void WorkerThreadPool::_process_task(Task *p_task) {
 	if (p_task->group) {
 		// Handling a group
 		bool do_post = false;
-		if (p_task->native_group_func) {
-			while (true) {
-				uint32_t work_index = p_task->group->index.postincrement();
-				if (work_index >= p_task->group->max) {
-					do_post = work_index == p_task->group->max; // First one reaching max handles semaphore and clean-up.
-					break;
-				}
-				p_task->native_group_func(p_task->native_func_userdata, work_index);
-			}
+		Callable::CallError ce;
+		Variant ret;
+		Variant arg;
+		Variant *argptr = &arg;
 
-		} else {
-			Callable::CallError ce;
-			Variant ret;
-			Variant arg;
-			Variant *argptr = &arg;
-			while (true) {
-				uint32_t work_index = p_task->group->index.postincrement();
-				if (work_index >= p_task->group->max) {
-					do_post = work_index == p_task->group->max; // First one reaching max handles semaphore and clean-up.
-					break;
-				}
+		while (true) {
+			uint32_t work_index = p_task->group->index.postincrement();
+
+			if (work_index >= p_task->group->max) {
+				break;
+			}
+			if (p_task->native_group_func) {
+				p_task->native_group_func(p_task->native_func_userdata, work_index);
+			} else if (p_task->template_userdata) {
+				p_task->template_userdata->callback_indexed(work_index);
+			} else {
 				arg = work_index;
 				p_task->callable.call((const Variant **)&argptr, 1, ret, ce);
 			}
+
+			// This is the only way to ensure posting is done when all tasks are really complete.
+			uint32_t completed_amount = p_task->group->completed_index.increment();
+
+			if (completed_amount == p_task->group->max) {
+				do_post = true;
+			}
+		}
+
+		if (do_post && p_task->template_userdata) {
+			memdelete(p_task->template_userdata); // This is no longer needed at this point, so get rid of it.
 		}
 
 		if (low_priority && use_native_low_priority_threads) {
@@ -104,6 +117,9 @@ void WorkerThreadPool::_process_task(Task *p_task) {
 	} else {
 		if (p_task->native_func) {
 			p_task->native_func(p_task->native_func_userdata);
+		} else if (p_task->template_userdata) {
+			p_task->template_userdata->callback();
+			memdelete(p_task->template_userdata);
 		} else {
 			Callable::CallError ce;
 			Variant ret;
@@ -171,13 +187,19 @@ void WorkerThreadPool::_post_task(Task *p_task, bool p_high_priority) {
 }
 
 WorkerThreadPool::TaskID WorkerThreadPool::add_native_task(void (*p_func)(void *), void *p_userdata, bool p_high_priority, const String &p_description) {
+	return _add_task(Callable(), p_func, p_userdata, nullptr, p_high_priority, p_description); // Native path: empty Callable and no template payload.
+}
+
+WorkerThreadPool::TaskID WorkerThreadPool::_add_task(const Callable &p_callable, void (*p_func)(void *), void *p_userdata, BaseTemplateUserdata *p_template_userdata, bool p_high_priority, const String &p_description) {
 	task_mutex.lock();
 	// Get a free task
 	Task *task = task_allocator.alloc();
 	TaskID id = last_task++;
+	task->callable = p_callable;
 	task->native_func = p_func;
 	task->native_func_userdata = p_userdata;
 	task->description = p_description;
+	task->template_userdata = p_template_userdata;
 	tasks.insert(id, task);
 	task_mutex.unlock();
 
@@ -187,18 +209,7 @@ WorkerThreadPool::TaskID WorkerThreadPool::add_native_task(void (*p_func)(void *
 }
 
 WorkerThreadPool::TaskID WorkerThreadPool::add_task(const Callable &p_action, bool p_high_priority, const String &p_description) {
-	task_mutex.lock();
-	// Get a free task
-	Task *task = task_allocator.alloc();
-	TaskID id = last_task++;
-	task->callable = p_action;
-	task->description = p_description;
-	tasks.insert(id, task);
-	task_mutex.unlock();
-
-	_post_task(task, p_high_priority);
-
-	return id;
+	return _add_task(p_action, nullptr, nullptr, nullptr, p_high_priority, p_description); // Callable path: no native function, native userdata or template payload.
 }
 
 bool WorkerThreadPool::is_task_completed(TaskID p_task_id) const {
@@ -269,8 +280,8 @@ void WorkerThreadPool::wait_for_task_completion(TaskID p_task_id) {
 	task_mutex.unlock();
 }
 
-WorkerThreadPool::GroupID WorkerThreadPool::add_native_group_task(void (*p_func)(void *, uint32_t), void *p_userdata, int p_elements, int p_tasks, bool p_high_priority, const String &p_description) {
-	ERR_FAIL_COND_V(p_elements <= 0, INVALID_TASK_ID);
+WorkerThreadPool::GroupID WorkerThreadPool::_add_group_task(const Callable &p_callable, void (*p_func)(void *, uint32_t), void *p_userdata, BaseTemplateUserdata *p_template_userdata, int p_elements, int p_tasks, bool p_high_priority, const String &p_description) {
+	ERR_FAIL_COND_V(p_elements < 0, INVALID_TASK_ID);
 	if (p_tasks < 0) {
 		p_tasks = threads.size();
 	}
@@ -280,17 +291,34 @@ WorkerThreadPool::GroupID WorkerThreadPool::add_native_group_task(void (*p_func)
 	GroupID id = last_task++;
 	group->max = p_elements;
 	group->self = id;
-	group->tasks_used = p_tasks;
-	Task **tasks_posted = (Task **)alloca(sizeof(Task *) * p_tasks);
-	for (int i = 0; i < p_tasks; i++) {
-		Task *task = task_allocator.alloc();
-		task->native_group_func = p_func;
-		task->native_func_userdata = p_userdata;
-		task->description = p_description;
-		task->group = group;
-		tasks_posted[i] = task;
-		// No task ID is used.
+
+	Task **tasks_posted = nullptr;
+	if (p_elements == 0) {
+		// Calling this with zero elements is discouraged, but it must still work.
+		group->completed.set_to(true);
+		group->done_semaphore.post();
+		group->tasks_used = 0;
+		p_tasks = 0;
+		if (p_template_userdata) {
+			memdelete(p_template_userdata);
+		}
+
+	} else {
+		group->tasks_used = p_tasks;
+		tasks_posted = (Task **)alloca(sizeof(Task *) * p_tasks);
+		for (int i = 0; i < p_tasks; i++) {
+			Task *task = task_allocator.alloc();
+			task->native_group_func = p_func;
+			task->native_func_userdata = p_userdata;
+			task->description = p_description;
+			task->group = group;
+			task->callable = p_callable;
+			task->template_userdata = p_template_userdata;
+			tasks_posted[i] = task;
+			// No task ID is used.
+		}
 	}
+
 	groups[id] = group;
 	task_mutex.unlock();
 
@@ -308,43 +336,25 @@ WorkerThreadPool::GroupID WorkerThreadPool::add_native_group_task(void (*p_func)
 	return id;
 }
 
+WorkerThreadPool::GroupID WorkerThreadPool::add_native_group_task(void (*p_func)(void *, uint32_t), void *p_userdata, int p_elements, int p_tasks, bool p_high_priority, const String &p_description) {
+	return _add_group_task(Callable(), p_func, p_userdata, nullptr, p_elements, p_tasks, p_high_priority, p_description); // Native path: empty Callable and no template payload.
+}
+
 WorkerThreadPool::GroupID WorkerThreadPool::add_group_task(const Callable &p_action, int p_elements, int p_tasks, bool p_high_priority, const String &p_description) {
-	ERR_FAIL_COND_V(p_elements <= 0, INVALID_TASK_ID);
-	if (p_tasks < 0) {
-		p_tasks = threads.size();
-	}
+	return _add_group_task(p_action, nullptr, nullptr, nullptr, p_elements, p_tasks, p_high_priority, p_description); // Callable path; the shared implementation only rejects p_elements < 0, so zero elements is now accepted.
+}
 
+uint32_t WorkerThreadPool::get_group_processed_element_count(GroupID p_group) const { // Returns how many elements of the group have finished processing so far; 0 plus an error for an unknown ID.
 	task_mutex.lock();
-	Group *group = group_allocator.alloc();
-	GroupID id = last_task++;
-	group->max = p_elements;
-	group->self = id;
-	group->tasks_used = p_tasks;
-	Task **tasks_posted = (Task **)alloca(sizeof(Task *) * p_tasks);
-	for (int i = 0; i < p_tasks; i++) {
-		Task *task = task_allocator.alloc();
-		task->callable = p_action;
-		task->description = p_description;
-		task->group = group;
-		tasks_posted[i] = task;
-		// No task ID is used.
+	const Group *const *groupp = groups.getptr(p_group);
+	if (!groupp) { // No group is registered under this ID.
+		task_mutex.unlock(); // Release the lock before the error macro returns.
+		ERR_FAIL_V_MSG(0, "Invalid Group ID");
 	}
-	groups[id] = group;
+	uint32_t elements = (*groupp)->completed_index.get(); // Snapshot of the counter that _process_task() increments after each element's callback returns.
 	task_mutex.unlock();
-
-	if (!p_high_priority && use_native_low_priority_threads) {
-		group->low_priority_native_tasks.resize(p_tasks);
-	}
-
-	for (int i = 0; i < p_tasks; i++) {
-		_post_task(tasks_posted[i], p_high_priority);
-		if (!p_high_priority && use_native_low_priority_threads) {
-			group->low_priority_native_tasks[i] = tasks_posted[i];
-		}
-	}
-	return id;
+	return elements;
 }
-
 bool WorkerThreadPool::is_group_task_completed(GroupID p_group) const {
 	task_mutex.lock();
 	const Group *const *groupp = groups.getptr(p_group);
@@ -451,6 +461,7 @@ void WorkerThreadPool::_bind_methods() {
 
 	ClassDB::bind_method(D_METHOD("add_group_task", "action", "elements", "tasks_needed", "high_priority", "description"), &WorkerThreadPool::add_group_task, DEFVAL(-1), DEFVAL(false), DEFVAL(String()));
 	ClassDB::bind_method(D_METHOD("is_group_task_completed", "group_id"), &WorkerThreadPool::is_group_task_completed);
+	ClassDB::bind_method(D_METHOD("get_group_processed_element_count", "group_id"), &WorkerThreadPool::get_group_processed_element_count);
 	ClassDB::bind_method(D_METHOD("wait_for_group_task_completion", "group_id"), &WorkerThreadPool::wait_for_group_task_completion);
 }
 
diff --git a/core/object/worker_thread_pool.h b/core/object/worker_thread_pool.h
index dfb0050605..1debd9ca37 100644
--- a/core/object/worker_thread_pool.h
+++ b/core/object/worker_thread_pool.h
@@ -53,9 +53,16 @@ public:
 private:
 	struct Task;
 
+	struct BaseTemplateUserdata { // Type-erased base for template task payloads; stored in Task::template_userdata and freed with memdelete() through this type.
+		virtual void callback() {} // Called for a single (non-group) template task; does nothing unless overridden.
+		virtual void callback_indexed(uint32_t p_index) {} // Called once per element index of a template group task; does nothing unless overridden.
+		virtual ~BaseTemplateUserdata() {} // Virtual so that deleting through the base pointer destroys the derived payload.
+	};
+
 	struct Group {
 		GroupID self;
 		SafeNumeric<uint32_t> index;
+		SafeNumeric<uint32_t> completed_index;
 		uint32_t max = 0;
 		Semaphore done_semaphore;
 		SafeFlag completed;
@@ -76,7 +83,10 @@ private:
 		SelfList<Task> task_elem;
 		bool waiting = false; // Waiting for completion
 		bool low_priority = false;
+		BaseTemplateUserdata *template_userdata = nullptr;
 		Thread *low_priority_thread = nullptr;
+
+		void free_template_userdata();
 		Task() :
 				task_elem(this) {}
 	};
@@ -119,18 +129,60 @@ private:
 
 	static WorkerThreadPool *singleton;
 
+	TaskID _add_task(const Callable &p_callable, void (*p_func)(void *), void *p_userdata, BaseTemplateUserdata *p_template_userdata, bool p_high_priority, const String &p_description);
+	GroupID _add_group_task(const Callable &p_callable, void (*p_func)(void *, uint32_t), void *p_userdata, BaseTemplateUserdata *p_template_userdata, int p_elements, int p_tasks, bool p_high_priority, const String &p_description);
+
+	template <class C, class M, class U>
+	struct TaskUserData : public BaseTemplateUserdata { // Payload built by add_template_task(): a bound instance, a method and one argument.
+		C *instance; // Not initialized here; add_template_task() assigns it right after memnew().
+		M method; // Invoked as (instance->*method)(userdata), so presumably a pointer to a member function of C.
+		U userdata; // Stored by value and handed to the method when the task runs.
+		virtual void callback() override {
+			(instance->*method)(userdata);
+		}
+	};
+
+	template <class C, class M, class U>
+	struct GroupUserData : public BaseTemplateUserdata { // Payload built by add_template_group_task(): a bound instance, a method and one shared argument.
+		C *instance; // Not initialized here; add_template_group_task() assigns it right after allocation.
+		M method; // Invoked as (instance->*method)(p_index, userdata), so presumably a pointer to a member function of C.
+		U userdata; // Stored by value; the same value is passed for every element index.
+		virtual void callback_indexed(uint32_t p_index) override {
+			(instance->*method)(p_index, userdata);
+		}
+	};
+
 protected:
 	static void _bind_methods();
 
 public:
+	template <class C, class M, class U>
+	TaskID add_template_task(C *p_instance, M p_method, U p_userdata, bool p_high_priority = false, const String &p_description = String()) {
+		typedef TaskUserData<C, M, U> TUD;
+		TUD *ud = memnew(TUD); // Heap payload; _process_task() memdelete()s it after callback() returns.
+		ud->instance = p_instance; // Raw pointer stored as-is. NOTE(review): presumably the caller must keep the instance alive until the task has run.
+		ud->method = p_method;
+		ud->userdata = p_userdata; // Copied by value into the payload.
+		return _add_task(Callable(), nullptr, nullptr, ud, p_high_priority, p_description); // Empty Callable and no native function: only the template payload is set.
+	}
 	TaskID add_native_task(void (*p_func)(void *), void *p_userdata, bool p_high_priority = false, const String &p_description = String());
 	TaskID add_task(const Callable &p_action, bool p_high_priority = false, const String &p_description = String());
 
 	bool is_task_completed(TaskID p_task_id) const;
 	void wait_for_task_completion(TaskID p_task_id);
 
+	template <class C, class M, class U>
+	GroupID add_template_group_task(C *p_instance, M p_method, U p_userdata, int p_elements, int p_tasks = -1, bool p_high_priority = false, const String &p_description = String()) {
+		ERR_FAIL_COND_V(p_elements < 0, INVALID_TASK_ID); // Reject before allocating: _add_group_task() returns early on this without freeing the payload.
+		GroupUserData<C, M, U> *ud = memnew((GroupUserData<C, M, U>)); // Extra parentheses keep the template commas inside a single macro argument.
+		ud->instance = p_instance; // Raw pointer stored as-is. NOTE(review): presumably the caller must keep the instance alive until the group completes.
+		ud->method = p_method;
+		ud->userdata = p_userdata; // Copied by value; the same value is passed for every element index.
+		return _add_group_task(Callable(), nullptr, nullptr, ud, p_elements, p_tasks, p_high_priority, p_description); // Payload ownership passes to the pool from here on.
+	}
 	GroupID add_native_group_task(void (*p_func)(void *, uint32_t), void *p_userdata, int p_elements, int p_tasks = -1, bool p_high_priority = false, const String &p_description = String());
 	GroupID add_group_task(const Callable &p_action, int p_elements, int p_tasks = -1, bool p_high_priority = false, const String &p_description = String());
+	uint32_t get_group_processed_element_count(GroupID p_group) const;
 	bool is_group_task_completed(GroupID p_group) const;
 	void wait_for_group_task_completion(GroupID p_group);
 
diff --git a/core/templates/thread_work_pool.cpp b/core/templates/thread_work_pool.cpp
deleted file mode 100644
index a75fd06b9b..0000000000
--- a/core/templates/thread_work_pool.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*************************************************************************/
-/*  thread_work_pool.cpp                                                 */
-/*************************************************************************/
-/*                       This file is part of:                           */
-/*                           GODOT ENGINE                                */
-/*                      https://godotengine.org                          */
-/*************************************************************************/
-/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
-/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
-/*                                                                       */
-/* Permission is hereby granted, free of charge, to any person obtaining */
-/* a copy of this software and associated documentation files (the       */
-/* "Software"), to deal in the Software without restriction, including   */
-/* without limitation the rights to use, copy, modify, merge, publish,   */
-/* distribute, sublicense, and/or sell copies of the Software, and to    */
-/* permit persons to whom the Software is furnished to do so, subject to */
-/* the following conditions:                                             */
-/*                                                                       */
-/* The above copyright notice and this permission notice shall be        */
-/* included in all copies or substantial portions of the Software.       */
-/*                                                                       */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
-/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
-/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
-/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
-/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
-/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
-/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
-/*************************************************************************/
-
-#include "thread_work_pool.h"
-
-#include "core/os/os.h"
-
-void ThreadWorkPool::_thread_function(void *p_user) {
-	ThreadData *thread = static_cast<ThreadData *>(p_user);
-	while (true) {
-		thread->start.wait();
-		if (thread->exit.load()) {
-			break;
-		}
-		thread->work->work();
-		thread->completed.post();
-	}
-}
-
-void ThreadWorkPool::init(int p_thread_count) {
-	ERR_FAIL_COND(threads != nullptr);
-	if (p_thread_count < 0) {
-		p_thread_count = OS::get_singleton()->get_default_thread_pool_size();
-	}
-
-	thread_count = p_thread_count;
-	threads = memnew_arr(ThreadData, thread_count);
-
-	for (uint32_t i = 0; i < thread_count; i++) {
-		threads[i].exit.store(false);
-		threads[i].thread.start(&ThreadWorkPool::_thread_function, &threads[i]);
-	}
-}
-
-void ThreadWorkPool::finish() {
-	if (threads == nullptr) {
-		return;
-	}
-
-	for (uint32_t i = 0; i < thread_count; i++) {
-		threads[i].exit.store(true);
-		threads[i].start.post();
-	}
-	for (uint32_t i = 0; i < thread_count; i++) {
-		threads[i].thread.wait_to_finish();
-	}
-
-	memdelete_arr(threads);
-	threads = nullptr;
-}
-
-ThreadWorkPool::~ThreadWorkPool() {
-	finish();
-}
diff --git a/core/templates/thread_work_pool.h b/core/templates/thread_work_pool.h
deleted file mode 100644
index cdb43d6d89..0000000000
--- a/core/templates/thread_work_pool.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*************************************************************************/
-/*  thread_work_pool.h                                                   */
-/*************************************************************************/
-/*                       This file is part of:                           */
-/*                           GODOT ENGINE                                */
-/*                      https://godotengine.org                          */
-/*************************************************************************/
-/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
-/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
-/*                                                                       */
-/* Permission is hereby granted, free of charge, to any person obtaining */
-/* a copy of this software and associated documentation files (the       */
-/* "Software"), to deal in the Software without restriction, including   */
-/* without limitation the rights to use, copy, modify, merge, publish,   */
-/* distribute, sublicense, and/or sell copies of the Software, and to    */
-/* permit persons to whom the Software is furnished to do so, subject to */
-/* the following conditions:                                             */
-/*                                                                       */
-/* The above copyright notice and this permission notice shall be        */
-/* included in all copies or substantial portions of the Software.       */
-/*                                                                       */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
-/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
-/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
-/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
-/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
-/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
-/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
-/*************************************************************************/
-
-#ifndef THREAD_WORK_POOL_H
-#define THREAD_WORK_POOL_H
-
-#include "core/os/memory.h"
-#include "core/os/semaphore.h"
-#include "core/os/thread.h"
-
-#include <atomic>
-
-class ThreadWorkPool {
-	std::atomic<uint32_t> index;
-
-	struct BaseWork {
-		std::atomic<uint32_t> *index = nullptr;
-		uint32_t max_elements = 0;
-		virtual void work() = 0;
-		virtual ~BaseWork() = default;
-	};
-
-	template <class C, class M, class U>
-	struct Work : public BaseWork {
-		C *instance;
-		M method;
-		U userdata;
-		virtual void work() override {
-			while (true) {
-				uint32_t work_index = index->fetch_add(1, std::memory_order_relaxed);
-				if (work_index >= max_elements) {
-					break;
-				}
-				(instance->*method)(work_index, userdata);
-			}
-		}
-	};
-
-	struct ThreadData {
-		Thread thread;
-		Semaphore start;
-		Semaphore completed;
-		std::atomic<bool> exit;
-		BaseWork *work = nullptr;
-	};
-
-	ThreadData *threads = nullptr;
-	uint32_t thread_count = 0;
-	uint32_t threads_working = 0;
-	BaseWork *current_work = nullptr;
-
-	static void _thread_function(void *p_user);
-
-public:
-	template <class C, class M, class U>
-	void begin_work(uint32_t p_elements, C *p_instance, M p_method, U p_userdata) {
-		ERR_FAIL_COND(!threads); //never initialized
-		ERR_FAIL_COND(current_work != nullptr);
-
-		index.store(0, std::memory_order_release);
-
-		Work<C, M, U> *w = memnew((Work<C, M, U>));
-		w->instance = p_instance;
-		w->userdata = p_userdata;
-		w->method = p_method;
-		w->index = &index;
-		w->max_elements = p_elements;
-
-		current_work = w;
-
-		threads_working = MIN(p_elements, thread_count);
-
-		for (uint32_t i = 0; i < threads_working; i++) {
-			threads[i].work = w;
-			threads[i].start.post();
-		}
-	}
-
-	bool is_working() const {
-		return current_work != nullptr;
-	}
-
-	bool is_done_dispatching() const {
-		ERR_FAIL_COND_V(current_work == nullptr, true);
-		return index.load(std::memory_order_acquire) >= current_work->max_elements;
-	}
-
-	uint32_t get_work_index() const {
-		ERR_FAIL_COND_V(current_work == nullptr, 0);
-		uint32_t idx = index.load(std::memory_order_acquire);
-		return MIN(idx, current_work->max_elements);
-	}
-
-	void end_work() {
-		ERR_FAIL_COND(current_work == nullptr);
-		for (uint32_t i = 0; i < threads_working; i++) {
-			threads[i].completed.wait();
-			threads[i].work = nullptr;
-		}
-
-		threads_working = 0;
-		memdelete(current_work);
-		current_work = nullptr;
-	}
-
-	template <class C, class M, class U>
-	void do_work(uint32_t p_elements, C *p_instance, M p_method, U p_userdata) {
-		switch (p_elements) {
-			case 0:
-				// Nothing to do, so do nothing.
-				break;
-			case 1:
-				// No value in pushing the work to another thread if it's a single job
-				// and we're going to wait for it to finish. Just run it right here.
-				(p_instance->*p_method)(0, p_userdata);
-				break;
-			default:
-				// Multiple jobs to do; commence threaded business.
-				begin_work(p_elements, p_instance, p_method, p_userdata);
-				end_work();
-		}
-	}
-
-	_FORCE_INLINE_ int get_thread_count() const { return thread_count; }
-	void init(int p_thread_count = -1);
-	void finish();
-	~ThreadWorkPool();
-};
-
-#endif // THREAD_WORK_POOL_H