1 files changed, 149 insertions, 55 deletions
diff --git a/modules/cvtt/image_compress_cvtt.cpp b/modules/cvtt/image_compress_cvtt.cpp
index c860942d33..3a371c8597 100644
--- a/modules/cvtt/image_compress_cvtt.cpp
+++ b/modules/cvtt/image_compress_cvtt.cpp
@@ -30,10 +30,112 @@
 
 #include "image_compress_cvtt.h"
 
+#include "os/os.h"
+#include "os/thread.h"
 #include "print_string.h"
 
 #include <ConvectionKernels.h>
 
+struct CVTTCompressionJobParams { // Encoder settings shared by every row task of one compression job.
+	bool is_hdr; // True: rows go through the BC6H kernels; false: through the BC7 kernel.
+	bool is_signed; // Only read when is_hdr is set: selects EncodeBC6HS over EncodeBC6HU.
+	int bytes_per_pixel; // Size of one source pixel; the caller sets 6 for HDR and 4 otherwise.
+
+	cvtt::Options options; // Handed unchanged to the cvtt::Kernels encode calls.
+};
+
+struct CVTTCompressionRowTask { // One unit of work: a band of four pixel rows from a single mip level.
+	const uint8_t *in_mm_bytes; // Source pixels of the whole mip level (row 0), not of this band.
+	uint8_t *out_mm_bytes; // Destination of this band's encoded blocks.
+	int y_start; // First pixel row of the band within the mip level.
+	int width; // Mip level width in pixels.
+	int height; // Mip level height in pixels.
+};
+
+struct CVTTCompressionJobQueue { // Work list shared by every thread running _digest_job_queue.
+	CVTTCompressionJobParams job_params; // Settings applied to every task in the list.
+	const CVTTCompressionRowTask *job_tasks; // Array of num_tasks tasks; storage is owned by the caller.
+	uint32_t num_tasks; // Number of entries in job_tasks.
+	uint32_t current_task; // Claims made so far (caller starts it at 0); workers bump it with atomic_increment.
+};
+
+// Encodes one band of four pixel rows of a mip level, one batch of cvtt::NumParallelBlocks
+// blocks per encoder call, and appends the resulting 16-byte blocks at out_mm_bytes.
+// p_job_params picks the encoder (BC7, EncodeBC6HS or EncodeBC6HU) and the source pixel size;
+// p_row_task supplies the level's pixels, its dimensions and the first row of the band.
+// Texels past the right or bottom edge are filled by repeating the last column or row.
+static void _digest_row_task(const CVTTCompressionJobParams &p_job_params, const CVTTCompressionRowTask &p_row_task) {
+	const int img_w = p_row_task.width;
+	const int img_h = p_row_task.height;
+	const int band_top = p_row_task.y_start;
+	const int px_size = p_job_params.bytes_per_pixel;
+	const int row_pitch = img_w * px_size;
+	const int batch_px = 4 * cvtt::NumParallelBlocks; // Pixel columns covered by one encoder call.
+
+	const uint8_t *src = p_row_task.in_mm_bytes;
+	uint8_t *dst = p_row_task.out_mm_bytes;
+
+	cvtt::PixelBlockU8 ldr_batch[cvtt::NumParallelBlocks];
+	cvtt::PixelBlockF16 hdr_batch[cvtt::NumParallelBlocks];
+
+	for (int batch_left = 0; batch_left < img_w; batch_left += batch_px) {
+		// Gather a 4 x batch_px window of source pixels into the per-block pixel arrays.
+		for (int dy = 0; dy < 4; dy++) {
+			// Rows below the bottom edge reuse the last image row.
+			int src_y = band_top + dy;
+			if (src_y >= img_h) {
+				src_y = img_h - 1;
+			}
+			const uint8_t *src_row = src + src_y * row_pitch;
+
+			for (int dx = 0; dx < batch_px; dx++) {
+				// Columns right of the image reuse the last image column.
+				int src_x = batch_left + dx;
+				if (src_x >= img_w) {
+					src_x = img_w - 1;
+				}
+				const uint8_t *src_px = src_row + src_x * px_size;
+
+				const int blk = dx / 4;
+				const int elem = dy * 4 + dx % 4;
+				if (!p_job_params.is_hdr) {
+					memcpy(ldr_batch[blk].m_pixels[elem], src_px, px_size);
+				} else {
+					memcpy(hdr_batch[blk].m_pixels[elem], src_px, px_size);
+					hdr_batch[blk].m_pixels[elem][3] = 0x3c00; // 1.0 (unused)
+				}
+			}
+		}
+
+		// Run the encoder matching the pixel format on the whole batch.
+		uint8_t encoded[16 * cvtt::NumParallelBlocks];
+		if (!p_job_params.is_hdr) {
+			cvtt::Kernels::EncodeBC7(encoded, ldr_batch, p_job_params.options);
+		} else if (p_job_params.is_signed) {
+			cvtt::Kernels::EncodeBC6HS(encoded, hdr_batch, p_job_params.options);
+		} else {
+			cvtt::Kernels::EncodeBC6HU(encoded, hdr_batch, p_job_params.options);
+		}
+
+		// Only blocks that overlap the image are written out; the rest of the batch is padding.
+		int emit_count = ((img_w - batch_left) + 3) / 4;
+		if (emit_count > cvtt::NumParallelBlocks) {
+			emit_count = cvtt::NumParallelBlocks;
+		}
+
+		memcpy(dst, encoded, 16 * emit_count);
+		dst += 16 * emit_count;
+	}
+}
+
+static void _digest_job_queue(void *p_job_queue) { // Worker entry point; p_job_queue must point to a CVTTCompressionJobQueue.
+	CVTTCompressionJobQueue *job_queue = static_cast<CVTTCompressionJobQueue *>(p_job_queue);
+	// Tickets stay uint32_t like the counter they come from; the "- 1" below treats them as 1-based.
+	for (uint32_t next_task = atomic_increment(&job_queue->current_task); next_task <= job_queue->num_tasks; next_task = atomic_increment(&job_queue->current_task)) {
+		_digest_row_task(job_queue->job_params, job_queue->job_tasks[next_task - 1]);
+	}
+}
+
 void image_compress_cvtt(Image *p_image, float p_lossy_quality, Image::CompressSource p_source) {
 
 	if (p_image->get_format() >= Image::FORMAT_BPTC_RGBA)
@@ -101,6 +203,20 @@ void image_compress_cvtt(Image *p_image, float p_lossy_quality, Image::CompressS
 
 	int dst_ofs = 0;
 
+	CVTTCompressionJobQueue job_queue;
+	job_queue.job_params.is_hdr = is_hdr;
+	job_queue.job_params.is_signed = is_signed;
+	job_queue.job_params.options = options;
+	job_queue.job_params.bytes_per_pixel = is_hdr ? 6 : 4;
+
+#ifdef NO_THREADS
+	int num_job_threads = 0;
+#else
+	int num_job_threads = OS::get_singleton()->can_use_threads() ? (OS::get_singleton()->get_processor_count() - 1) : 0;
+#endif
+
+	PoolVector<CVTTCompressionRowTask> tasks;
+
 	for (int i = 0; i <= mm_count; i++) {
 
 		int bw = w % 4 != 0 ? w + (4 - w % 4) : w;
@@ -111,75 +227,53 @@ void image_compress_cvtt(Image *p_image, float p_lossy_quality, Image::CompressS
 		const uint8_t *in_bytes = &rb[src_ofs];
 		uint8_t *out_bytes = &wb[dst_ofs];
 
-		cvtt::PixelBlockU8 input_blocks_ldr[cvtt::NumParallelBlocks];
-		cvtt::PixelBlockF16 input_blocks_hdr[cvtt::NumParallelBlocks];
-
-		int bytes_per_pixel = is_hdr ? 6 : 4;
-
 		for (int y_start = 0; y_start < h; y_start += 4) {
 			int y_end = y_start + 4;
 
-			for (int x_start = 0; x_start < w; x_start += 4 * cvtt::NumParallelBlocks) {
-				int x_end = x_start + 4 * cvtt::NumParallelBlocks;
+			CVTTCompressionRowTask row_task;
+			row_task.width = w;
+			row_task.height = h;
+			row_task.y_start = y_start;
+			row_task.in_mm_bytes = in_bytes;
+			row_task.out_mm_bytes = out_bytes;
+
+			if (num_job_threads > 0) {
+				tasks.push_back(row_task);
+			} else {
+				_digest_row_task(job_queue.job_params, row_task);
+			}
 
-				for (int y = y_start; y < y_end; y++) {
-					int first_input_element = (y - y_start) * 4;
-					const uint8_t *row_start;
-					if (y >= h) {
-						row_start = in_bytes + (h - 1) * (w * bytes_per_pixel);
-					} else {
-						row_start = in_bytes + y * (w * bytes_per_pixel);
-					}
+			out_bytes += 16 * (bw / 4);
+		}
 
-					for (int x = x_start; x < x_end; x++) {
-						const uint8_t *pixel_start;
-						if (x >= w) {
-							pixel_start = row_start + (w - 1) * bytes_per_pixel;
-						} else {
-							pixel_start = row_start + x * bytes_per_pixel;
-						}
+		dst_ofs += (MAX(4, bw) * MAX(4, bh)) >> shift;
+		w >>= 1;
+		h >>= 1;
+	}
 
-						int block_index = (x - x_start) / 4;
-						int block_element = (x - x_start) % 4 + first_input_element;
-						if (is_hdr) {
-							memcpy(input_blocks_hdr[block_index].m_pixels[block_element], pixel_start, bytes_per_pixel);
-							input_blocks_hdr[block_index].m_pixels[block_element][3] = 0x3c00; // 1.0 (unused)
-						} else {
-							memcpy(input_blocks_ldr[block_index].m_pixels[block_element], pixel_start, bytes_per_pixel);
-						}
-					}
-				}
+	if (num_job_threads > 0) {
+		PoolVector<Thread *> threads;
+		threads.resize(num_job_threads);
 
-				uint8_t output_blocks[16 * cvtt::NumParallelBlocks];
+		PoolVector<Thread *>::Write threads_wb = threads.write();
 
-				if (is_hdr) {
-					if (is_signed) {
-						cvtt::Kernels::EncodeBC6HS(output_blocks, input_blocks_hdr, options);
-					} else {
-						cvtt::Kernels::EncodeBC6HU(output_blocks, input_blocks_hdr, options);
-					}
-				} else {
-					cvtt::Kernels::EncodeBC7(output_blocks, input_blocks_ldr, options);
-				}
+		PoolVector<CVTTCompressionRowTask>::Read tasks_rb = tasks.read();
 
-				int num_real_blocks = ((w - x_start) + 3) / 4;
-				if (num_real_blocks > cvtt::NumParallelBlocks) {
-					num_real_blocks = cvtt::NumParallelBlocks;
-				}
+		job_queue.job_tasks = &tasks_rb[0];
+		job_queue.current_task = 0;
+		job_queue.num_tasks = static_cast<uint32_t>(tasks.size());
 
-				memcpy(out_bytes, output_blocks, 16 * num_real_blocks);
-				out_bytes += 16 * num_real_blocks;
-			}
+		for (int i = 0; i < num_job_threads; i++) {
+			threads_wb[i] = Thread::create(_digest_job_queue, &job_queue);
 		}
+		_digest_job_queue(&job_queue);
 
-		dst_ofs += (MAX(4, bw) * MAX(4, bh)) >> shift;
-		w >>= 1;
-		h >>= 1;
+		for (int i = 0; i < num_job_threads; i++) {
+			Thread::wait_to_finish(threads_wb[i]);
+			memdelete(threads_wb[i]);
+		}
 	}
 
-	rb = PoolVector<uint8_t>::Read();
-	wb = PoolVector<uint8_t>::Write();
-
 	p_image->create(p_image->get_width(), p_image->get_height(), p_image->has_mipmaps(), target_format, data);
 }