path: root/thirdparty/oidn/core
Diffstat (limited to 'thirdparty/oidn/core')
-rw-r--r--  thirdparty/oidn/core/api.cpp                 408
-rw-r--r--  thirdparty/oidn/core/autoencoder.cpp         519
-rw-r--r--  thirdparty/oidn/core/autoencoder.h           116
-rw-r--r--  thirdparty/oidn/core/buffer.h                 75
-rw-r--r--  thirdparty/oidn/core/common.h                133
-rw-r--r--  thirdparty/oidn/core/device.cpp              205
-rw-r--r--  thirdparty/oidn/core/device.h                 78
-rw-r--r--  thirdparty/oidn/core/filter.cpp               27
-rw-r--r--  thirdparty/oidn/core/filter.h                 52
-rw-r--r--  thirdparty/oidn/core/image.h                 111
-rw-r--r--  thirdparty/oidn/core/input_reorder.h         232
-rw-r--r--  thirdparty/oidn/core/math.h                   78
-rw-r--r--  thirdparty/oidn/core/network.cpp             434
-rw-r--r--  thirdparty/oidn/core/network.h               112
-rw-r--r--  thirdparty/oidn/core/node.h                  142
-rw-r--r--  thirdparty/oidn/core/output_reorder.h        126
-rw-r--r--  thirdparty/oidn/core/transfer_function.cpp    95
-rw-r--r--  thirdparty/oidn/core/transfer_function.h     201
-rw-r--r--  thirdparty/oidn/core/upsample.h               92
-rw-r--r--  thirdparty/oidn/core/weights_reorder.h        99
20 files changed, 3335 insertions, 0 deletions
diff --git a/thirdparty/oidn/core/api.cpp b/thirdparty/oidn/core/api.cpp
new file mode 100644
index 0000000000..7353fe4e25
--- /dev/null
+++ b/thirdparty/oidn/core/api.cpp
@@ -0,0 +1,408 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#ifdef _WIN32
+# define OIDN_API extern "C" __declspec(dllexport)
+#else
+# define OIDN_API extern "C" __attribute__ ((visibility ("default")))
+#endif
+
+// Locks the device that owns the specified object
+// Use *only* inside OIDN_TRY/CATCH!
+#define OIDN_LOCK(obj) \
+ std::lock_guard<std::mutex> lock(obj->getDevice()->getMutex());
+
+// Try/catch for converting exceptions to errors
+#define OIDN_TRY \
+ try {
+
+#define OIDN_CATCH(obj) \
+ } catch (Exception& e) { \
+ Device::setError(obj ? obj->getDevice() : nullptr, e.code(), e.what()); \
+ } catch (std::bad_alloc&) { \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::OutOfMemory, "out of memory"); \
+ } catch (mkldnn::error& e) { \
+ if (e.status == mkldnn_out_of_memory) \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::OutOfMemory, "out of memory"); \
+ else \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::Unknown, e.message); \
+ } catch (std::exception& e) { \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::Unknown, e.what()); \
+ } catch (...) { \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::Unknown, "unknown exception caught"); \
+ }
+
+#include "device.h"
+#include "filter.h"
+#include <mutex>
+
+namespace oidn {
+
+ namespace
+ {
+ __forceinline void checkHandle(void* handle)
+ {
+ if (handle == nullptr)
+ throw Exception(Error::InvalidArgument, "invalid handle");
+ }
+
+ template<typename T>
+ __forceinline void retainObject(T* obj)
+ {
+ if (obj)
+ {
+ obj->incRef();
+ }
+ else
+ {
+ OIDN_TRY
+ checkHandle(obj);
+ OIDN_CATCH(obj)
+ }
+ }
+
+ template<typename T>
+ __forceinline void releaseObject(T* obj)
+ {
+ if (obj == nullptr || obj->decRefKeep() == 0)
+ {
+ OIDN_TRY
+ checkHandle(obj);
+ OIDN_LOCK(obj);
+ obj->destroy();
+ OIDN_CATCH(obj)
+ }
+ }
+
+ template<>
+ __forceinline void releaseObject(Device* obj)
+ {
+ if (obj == nullptr || obj->decRefKeep() == 0)
+ {
+ OIDN_TRY
+ checkHandle(obj);
+ // Do NOT lock the device because it owns the mutex
+ obj->destroy();
+ OIDN_CATCH(obj)
+ }
+ }
+ }
+
+ OIDN_API OIDNDevice oidnNewDevice(OIDNDeviceType type)
+ {
+ Ref<Device> device = nullptr;
+ OIDN_TRY
+ if (type == OIDN_DEVICE_TYPE_CPU || type == OIDN_DEVICE_TYPE_DEFAULT)
+ device = makeRef<Device>();
+ else
+ throw Exception(Error::InvalidArgument, "invalid device type");
+ OIDN_CATCH(device)
+ return (OIDNDevice)device.detach();
+ }
+
+ OIDN_API void oidnRetainDevice(OIDNDevice hDevice)
+ {
+ Device* device = (Device*)hDevice;
+ retainObject(device);
+ }
+
+ OIDN_API void oidnReleaseDevice(OIDNDevice hDevice)
+ {
+ Device* device = (Device*)hDevice;
+ releaseObject(device);
+ }
+
+ OIDN_API void oidnSetDevice1b(OIDNDevice hDevice, const char* name, bool value)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ device->set1i(name, value);
+ OIDN_CATCH(device)
+ }
+
+ OIDN_API void oidnSetDevice1i(OIDNDevice hDevice, const char* name, int value)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ device->set1i(name, value);
+ OIDN_CATCH(device)
+ }
+
+ OIDN_API bool oidnGetDevice1b(OIDNDevice hDevice, const char* name)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ return device->get1i(name);
+ OIDN_CATCH(device)
+ return false;
+ }
+
+ OIDN_API int oidnGetDevice1i(OIDNDevice hDevice, const char* name)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ return device->get1i(name);
+ OIDN_CATCH(device)
+ return 0;
+ }
+
+ OIDN_API void oidnSetDeviceErrorFunction(OIDNDevice hDevice, OIDNErrorFunction func, void* userPtr)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ device->setErrorFunction((ErrorFunction)func, userPtr);
+ OIDN_CATCH(device)
+ }
+
+ OIDN_API OIDNError oidnGetDeviceError(OIDNDevice hDevice, const char** outMessage)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ return (OIDNError)Device::getError(device, outMessage);
+ OIDN_CATCH(device)
+ if (outMessage) *outMessage = "";
+ return OIDN_ERROR_UNKNOWN;
+ }
+
+ OIDN_API void oidnCommitDevice(OIDNDevice hDevice)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ device->commit();
+ OIDN_CATCH(device)
+ }
+
+ OIDN_API OIDNBuffer oidnNewBuffer(OIDNDevice hDevice, size_t byteSize)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ Ref<Buffer> buffer = device->newBuffer(byteSize);
+ return (OIDNBuffer)buffer.detach();
+ OIDN_CATCH(device)
+ return nullptr;
+ }
+
+ OIDN_API OIDNBuffer oidnNewSharedBuffer(OIDNDevice hDevice, void* ptr, size_t byteSize)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ Ref<Buffer> buffer = device->newBuffer(ptr, byteSize);
+ return (OIDNBuffer)buffer.detach();
+ OIDN_CATCH(device)
+ return nullptr;
+ }
+
+ OIDN_API void oidnRetainBuffer(OIDNBuffer hBuffer)
+ {
+ Buffer* buffer = (Buffer*)hBuffer;
+ retainObject(buffer);
+ }
+
+ OIDN_API void oidnReleaseBuffer(OIDNBuffer hBuffer)
+ {
+ Buffer* buffer = (Buffer*)hBuffer;
+ releaseObject(buffer);
+ }
+
+ OIDN_API void* oidnMapBuffer(OIDNBuffer hBuffer, OIDNAccess access, size_t byteOffset, size_t byteSize)
+ {
+ Buffer* buffer = (Buffer*)hBuffer;
+ OIDN_TRY
+ checkHandle(hBuffer);
+ OIDN_LOCK(buffer);
+ return buffer->map(byteOffset, byteSize);
+ OIDN_CATCH(buffer)
+ return nullptr;
+ }
+
+ OIDN_API void oidnUnmapBuffer(OIDNBuffer hBuffer, void* mappedPtr)
+ {
+ Buffer* buffer = (Buffer*)hBuffer;
+ OIDN_TRY
+ checkHandle(hBuffer);
+ OIDN_LOCK(buffer);
+ return buffer->unmap(mappedPtr);
+ OIDN_CATCH(buffer)
+ }
+
+ OIDN_API OIDNFilter oidnNewFilter(OIDNDevice hDevice, const char* type)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ Ref<Filter> filter = device->newFilter(type);
+ return (OIDNFilter)filter.detach();
+ OIDN_CATCH(device)
+ return nullptr;
+ }
+
+ OIDN_API void oidnRetainFilter(OIDNFilter hFilter)
+ {
+ Filter* filter = (Filter*)hFilter;
+ retainObject(filter);
+ }
+
+ OIDN_API void oidnReleaseFilter(OIDNFilter hFilter)
+ {
+ Filter* filter = (Filter*)hFilter;
+ releaseObject(filter);
+ }
+
+ OIDN_API void oidnSetFilterImage(OIDNFilter hFilter, const char* name,
+ OIDNBuffer hBuffer, OIDNFormat format,
+ size_t width, size_t height,
+ size_t byteOffset,
+ size_t bytePixelStride, size_t byteRowStride)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ checkHandle(hBuffer);
+ OIDN_LOCK(filter);
+ Ref<Buffer> buffer = (Buffer*)hBuffer;
+ if (buffer->getDevice() != filter->getDevice())
+ throw Exception(Error::InvalidArgument, "the specified objects are bound to different devices");
+ Image data(buffer, (Format)format, (int)width, (int)height, byteOffset, bytePixelStride, byteRowStride);
+ filter->setImage(name, data);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API void oidnSetSharedFilterImage(OIDNFilter hFilter, const char* name,
+ void* ptr, OIDNFormat format,
+ size_t width, size_t height,
+ size_t byteOffset,
+ size_t bytePixelStride, size_t byteRowStride)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ Image data(ptr, (Format)format, (int)width, (int)height, byteOffset, bytePixelStride, byteRowStride);
+ filter->setImage(name, data);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API void oidnSetFilter1b(OIDNFilter hFilter, const char* name, bool value)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->set1i(name, int(value));
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API bool oidnGetFilter1b(OIDNFilter hFilter, const char* name)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ return filter->get1i(name);
+ OIDN_CATCH(filter)
+ return false;
+ }
+
+ OIDN_API void oidnSetFilter1i(OIDNFilter hFilter, const char* name, int value)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->set1i(name, value);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API int oidnGetFilter1i(OIDNFilter hFilter, const char* name)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ return filter->get1i(name);
+ OIDN_CATCH(filter)
+ return 0;
+ }
+
+ OIDN_API void oidnSetFilter1f(OIDNFilter hFilter, const char* name, float value)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->set1f(name, value);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API float oidnGetFilter1f(OIDNFilter hFilter, const char* name)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ return filter->get1f(name);
+ OIDN_CATCH(filter)
+ return 0;
+ }
+
+ OIDN_API void oidnSetFilterProgressMonitorFunction(OIDNFilter hFilter, OIDNProgressMonitorFunction func, void* userPtr)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->setProgressMonitorFunction(func, userPtr);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API void oidnCommitFilter(OIDNFilter hFilter)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->commit();
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API void oidnExecuteFilter(OIDNFilter hFilter)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->execute();
+ OIDN_CATCH(filter)
+ }
+
+} // namespace oidn
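
For orientation, a minimal sketch of how the C API above is typically driven from application code. It assumes the public OpenImageDenoise/oidn.h header (not part of this directory) declares these entry points and the OIDN_* constants; only calls that appear in api.cpp are used.

#include <OpenImageDenoise/oidn.h> // assumed public header declaring the API above
#include <cstdio>

// Denoise a tightly packed float3 image (sketch; error handling kept minimal).
void denoise(float* color, float* output, int width, int height)
{
  // Create and commit a CPU device (the only type oidnNewDevice accepts here).
  OIDNDevice device = oidnNewDevice(OIDN_DEVICE_TYPE_DEFAULT);
  oidnCommitDevice(device);

  // Create the lightmap filter; it is the only filter type Godot keeps (see device.cpp below).
  OIDNFilter filter = oidnNewFilter(device, "RTLightmap");

  // Bind shared, user-owned images; zero byte offset/strides mean tightly packed float3 pixels.
  oidnSetSharedFilterImage(filter, "color",  color,  OIDN_FORMAT_FLOAT3, width, height, 0, 0, 0);
  oidnSetSharedFilterImage(filter, "output", output, OIDN_FORMAT_FLOAT3, width, height, 0, 0, 0);

  // Commit the parameter changes, then run the filter.
  oidnCommitFilter(filter);
  oidnExecuteFilter(filter);

  // Exceptions caught by OIDN_TRY/OIDN_CATCH surface here as device errors.
  const char* message;
  if (oidnGetDeviceError(device, &message) != OIDN_ERROR_NONE)
    std::printf("OIDN error: %s\n", message);

  oidnReleaseFilter(filter);
  oidnReleaseDevice(device);
}
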
diff --git a/thirdparty/oidn/core/autoencoder.cpp b/thirdparty/oidn/core/autoencoder.cpp
new file mode 100644
index 0000000000..8ae2421fa6
--- /dev/null
+++ b/thirdparty/oidn/core/autoencoder.cpp
@@ -0,0 +1,519 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "autoencoder.h"
+
+namespace oidn {
+
+ // --------------------------------------------------------------------------
+ // AutoencoderFilter
+ // --------------------------------------------------------------------------
+
+ AutoencoderFilter::AutoencoderFilter(const Ref<Device>& device)
+ : Filter(device)
+ {
+ }
+
+ void AutoencoderFilter::setImage(const std::string& name, const Image& data)
+ {
+ if (name == "color")
+ color = data;
+ else if (name == "albedo")
+ albedo = data;
+ else if (name == "normal")
+ normal = data;
+ else if (name == "output")
+ output = data;
+
+ dirty = true;
+ }
+
+ void AutoencoderFilter::set1i(const std::string& name, int value)
+ {
+ if (name == "hdr")
+ hdr = value;
+ else if (name == "srgb")
+ srgb = value;
+ else if (name == "maxMemoryMB")
+ maxMemoryMB = value;
+
+ dirty = true;
+ }
+
+ int AutoencoderFilter::get1i(const std::string& name)
+ {
+ if (name == "hdr")
+ return hdr;
+ else if (name == "srgb")
+ return srgb;
+ else if (name == "maxMemoryMB")
+ return maxMemoryMB;
+ else if (name == "alignment")
+ return alignment;
+ else if (name == "overlap")
+ return overlap;
+ else
+ throw Exception(Error::InvalidArgument, "invalid parameter");
+ }
+
+ void AutoencoderFilter::set1f(const std::string& name, float value)
+ {
+ if (name == "hdrScale")
+ hdrScale = value;
+
+ dirty = true;
+ }
+
+ float AutoencoderFilter::get1f(const std::string& name)
+ {
+ if (name == "hdrScale")
+ return hdrScale;
+ else
+ throw Exception(Error::InvalidArgument, "invalid parameter");
+ }
+
+ void AutoencoderFilter::commit()
+ {
+ if (!dirty)
+ return;
+
+ {
+ if (mayiuse(avx512_common))
+ net = buildNet<16>();
+ else
+ net = buildNet<8>();
+ }
+
+ dirty = false;
+ }
+
+ void AutoencoderFilter::execute()
+ {
+ if (dirty)
+ throw Exception(Error::InvalidOperation, "changes to the filter are not committed");
+
+ if (!net)
+ return;
+
+ {
+ Progress progress;
+ progress.func = progressFunc;
+ progress.userPtr = progressUserPtr;
+ progress.taskCount = tileCountH * tileCountW;
+
+ // Iterate over the tiles
+ int tileIndex = 0;
+
+ for (int i = 0; i < tileCountH; ++i)
+ {
+ const int h = i * (tileH - 2*overlap); // input tile position (including overlap)
+ const int overlapBeginH = i > 0 ? overlap : 0; // overlap on the top
+ const int overlapEndH = i < tileCountH-1 ? overlap : 0; // overlap on the bottom
+ const int tileH1 = min(H - h, tileH); // input tile size (including overlap)
+ const int tileH2 = tileH1 - overlapBeginH - overlapEndH; // output tile size
+ const int alignOffsetH = tileH - roundUp(tileH1, alignment); // align to the bottom in the tile buffer
+
+ for (int j = 0; j < tileCountW; ++j)
+ {
+ const int w = j * (tileW - 2*overlap); // input tile position (including overlap)
+ const int overlapBeginW = j > 0 ? overlap : 0; // overlap on the left
+ const int overlapEndW = j < tileCountW-1 ? overlap : 0; // overlap on the right
+ const int tileW1 = min(W - w, tileW); // input tile size (including overlap)
+ const int tileW2 = tileW1 - overlapBeginW - overlapEndW; // output tile size
+ const int alignOffsetW = tileW - roundUp(tileW1, alignment); // align to the right in the tile buffer
+
+ // Set the input tile
+ inputReorder->setTile(h, w,
+ alignOffsetH, alignOffsetW,
+ tileH1, tileW1);
+
+ // Set the output tile
+ outputReorder->setTile(alignOffsetH + overlapBeginH, alignOffsetW + overlapBeginW,
+ h + overlapBeginH, w + overlapBeginW,
+ tileH2, tileW2);
+
+ //printf("Tile: %d %d -> %d %d\n", w+overlapBeginW, h+overlapBeginH, w+overlapBeginW+tileW2, h+overlapBeginH+tileH2);
+
+ // Denoise the tile
+ net->execute(progress, tileIndex);
+
+ // Next tile
+ tileIndex++;
+ }
+ }
+ }
+ }
+
+ void AutoencoderFilter::computeTileSize()
+ {
+ const int minTileSize = 3*overlap;
+ const int estimatedBytesPerPixel = mayiuse(avx512_common) ? estimatedBytesPerPixel16 : estimatedBytesPerPixel8;
+ const int64_t maxTilePixels = (int64_t(maxMemoryMB)*1024*1024 - estimatedBytesBase) / estimatedBytesPerPixel;
+
+ tileCountH = 1;
+ tileCountW = 1;
+ tileH = roundUp(H, alignment);
+ tileW = roundUp(W, alignment);
+
+ // Divide the image into tiles until the tile size gets below the threshold
+ while (int64_t(tileH) * tileW > maxTilePixels)
+ {
+ if (tileH > minTileSize && tileH > tileW)
+ {
+ tileCountH++;
+ tileH = max(roundUp(ceilDiv(H - 2*overlap, tileCountH), alignment) + 2*overlap, minTileSize);
+ }
+ else if (tileW > minTileSize)
+ {
+ tileCountW++;
+ tileW = max(roundUp(ceilDiv(W - 2*overlap, tileCountW), alignment) + 2*overlap, minTileSize);
+ }
+ else
+ break;
+ }
+
+ // Compute the final number of tiles
+ tileCountH = (H > tileH) ? ceilDiv(H - 2*overlap, tileH - 2*overlap) : 1;
+ tileCountW = (W > tileW) ? ceilDiv(W - 2*overlap, tileW - 2*overlap) : 1;
+
+ if (device->isVerbose(2))
+ {
+ std::cout << "Tile size : " << tileW << "x" << tileH << std::endl;
+ std::cout << "Tile count: " << tileCountW << "x" << tileCountH << std::endl;
+ }
+ }
+
+ template<int K>
+ std::shared_ptr<Executable> AutoencoderFilter::buildNet()
+ {
+ H = color.height;
+ W = color.width;
+
+ // Configure the network
+ int inputC;
+ void* weightPtr;
+
+ if (srgb && hdr)
+ throw Exception(Error::InvalidOperation, "srgb and hdr modes cannot be enabled at the same time");
+
+ if (color && !albedo && !normal && weightData.hdr)
+ {
+ inputC = 3;
+ weightPtr = hdr ? weightData.hdr : weightData.ldr;
+ }
+ else if (color && albedo && !normal && weightData.hdr_alb)
+ {
+ inputC = 6;
+ weightPtr = hdr ? weightData.hdr_alb : weightData.ldr_alb;
+ }
+ else if (color && albedo && normal && weightData.hdr_alb_nrm)
+ {
+ inputC = 9;
+ weightPtr = hdr ? weightData.hdr_alb_nrm : weightData.ldr_alb_nrm;
+ }
+ else
+ {
+ throw Exception(Error::InvalidOperation, "unsupported combination of input features");
+ }
+
+ if (!output)
+ throw Exception(Error::InvalidOperation, "output image not specified");
+
+ if ((color.format != Format::Float3)
+ || (albedo && albedo.format != Format::Float3)
+ || (normal && normal.format != Format::Float3)
+ || (output.format != Format::Float3))
+ throw Exception(Error::InvalidOperation, "unsupported image format");
+
+ if ((albedo && (albedo.width != W || albedo.height != H))
+ || (normal && (normal.width != W || normal.height != H))
+ || (output.width != W || output.height != H))
+ throw Exception(Error::InvalidOperation, "image size mismatch");
+
+ // Compute the tile size
+ computeTileSize();
+
+ // If the image size is zero, there is nothing else to do
+ if (H <= 0 || W <= 0)
+ return nullptr;
+
+ // Parse the weights
+ const auto weightMap = parseTensors(weightPtr);
+
+ // Create the network
+ std::shared_ptr<Network<K>> net = std::make_shared<Network<K>>(device, weightMap);
+
+ // Compute the tensor sizes
+ const auto inputDims = memory::dims({1, inputC, tileH, tileW});
+ const auto inputReorderDims = net->getInputReorderDims(inputDims, alignment); //-> concat0
+
+ const auto conv1Dims = net->getConvDims("conv1", inputReorderDims); //-> temp0
+ const auto conv1bDims = net->getConvDims("conv1b", conv1Dims); //-> temp1
+ const auto pool1Dims = net->getPoolDims(conv1bDims); //-> concat1
+ const auto conv2Dims = net->getConvDims("conv2", pool1Dims); //-> temp0
+ const auto pool2Dims = net->getPoolDims(conv2Dims); //-> concat2
+ const auto conv3Dims = net->getConvDims("conv3", pool2Dims); //-> temp0
+ const auto pool3Dims = net->getPoolDims(conv3Dims); //-> concat3
+ const auto conv4Dims = net->getConvDims("conv4", pool3Dims); //-> temp0
+ const auto pool4Dims = net->getPoolDims(conv4Dims); //-> concat4
+ const auto conv5Dims = net->getConvDims("conv5", pool4Dims); //-> temp0
+ const auto pool5Dims = net->getPoolDims(conv5Dims); //-> temp1
+ const auto upsample4Dims = net->getUpsampleDims(pool5Dims); //-> concat4
+ const auto concat4Dims = net->getConcatDims(upsample4Dims, pool4Dims);
+ const auto conv6Dims = net->getConvDims("conv6", concat4Dims); //-> temp0
+ const auto conv6bDims = net->getConvDims("conv6b", conv6Dims); //-> temp1
+ const auto upsample3Dims = net->getUpsampleDims(conv6bDims); //-> concat3
+ const auto concat3Dims = net->getConcatDims(upsample3Dims, pool3Dims);
+ const auto conv7Dims = net->getConvDims("conv7", concat3Dims); //-> temp0
+ const auto conv7bDims = net->getConvDims("conv7b", conv7Dims); //-> temp1
+ const auto upsample2Dims = net->getUpsampleDims(conv7bDims); //-> concat2
+ const auto concat2Dims = net->getConcatDims(upsample2Dims, pool2Dims);
+ const auto conv8Dims = net->getConvDims("conv8", concat2Dims); //-> temp0
+ const auto conv8bDims = net->getConvDims("conv8b", conv8Dims); //-> temp1
+ const auto upsample1Dims = net->getUpsampleDims(conv8bDims); //-> concat1
+ const auto concat1Dims = net->getConcatDims(upsample1Dims, pool1Dims);
+ const auto conv9Dims = net->getConvDims("conv9", concat1Dims); //-> temp0
+ const auto conv9bDims = net->getConvDims("conv9b", conv9Dims); //-> temp1
+ const auto upsample0Dims = net->getUpsampleDims(conv9bDims); //-> concat0
+ const auto concat0Dims = net->getConcatDims(upsample0Dims, inputReorderDims);
+ const auto conv10Dims = net->getConvDims("conv10", concat0Dims); //-> temp0
+ const auto conv10bDims = net->getConvDims("conv10b", conv10Dims); //-> temp1
+ const auto conv11Dims = net->getConvDims("conv11", conv10bDims); //-> temp0
+
+ const auto outputDims = memory::dims({1, 3, tileH, tileW});
+
+ // Allocate two temporary ping-pong buffers to decrease memory usage
+ const auto temp0Dims = getMaxTensorDims({
+ conv1Dims,
+ conv2Dims,
+ conv3Dims,
+ conv4Dims,
+ conv5Dims,
+ conv6Dims,
+ conv7Dims,
+ conv8Dims,
+ conv9Dims,
+ conv10Dims,
+ conv11Dims
+ });
+
+ const auto temp1Dims = getMaxTensorDims({
+ conv1bDims,
+ pool5Dims,
+ conv6bDims,
+ conv7bDims,
+ conv8bDims,
+ conv9bDims,
+ conv10bDims,
+ });
+
+ auto temp0 = net->allocTensor(temp0Dims);
+ auto temp1 = net->allocTensor(temp1Dims);
+
+ // Allocate enough memory to hold the concat outputs. Then use the first
+ // half to hold the previous conv output and the second half to hold the
+ // pool/orig image output. This works because everything is C dimension
+ // outermost, padded to K floats, and all the concats are on the C dimension.
+ auto concat0Dst = net->allocTensor(concat0Dims);
+ auto concat1Dst = net->allocTensor(concat1Dims);
+ auto concat2Dst = net->allocTensor(concat2Dims);
+ auto concat3Dst = net->allocTensor(concat3Dims);
+ auto concat4Dst = net->allocTensor(concat4Dims);
+
+ // Transfer function
+ std::shared_ptr<TransferFunction> transferFunc = makeTransferFunc();
+
+ // Autoexposure
+ if (auto tf = std::dynamic_pointer_cast<HDRTransferFunction>(transferFunc))
+ {
+ if (isnan(hdrScale))
+ net->addAutoexposure(color, tf);
+ else
+ tf->setExposure(hdrScale);
+ }
+
+ // Input reorder
+ auto inputReorderDst = net->castTensor(inputReorderDims, concat0Dst, upsample0Dims);
+ inputReorder = net->addInputReorder(color, albedo, normal,
+ transferFunc,
+ alignment, inputReorderDst);
+
+ // conv1
+ auto conv1 = net->addConv("conv1", inputReorder->getDst(), temp0);
+
+ // conv1b
+ auto conv1b = net->addConv("conv1b", conv1->getDst(), temp1);
+
+ // pool1
+ // Adjust pointer for pool1 to eliminate concat1
+ auto pool1Dst = net->castTensor(pool1Dims, concat1Dst, upsample1Dims);
+ auto pool1 = net->addPool(conv1b->getDst(), pool1Dst);
+
+ // conv2
+ auto conv2 = net->addConv("conv2", pool1->getDst(), temp0);
+
+ // pool2
+ // Adjust pointer for pool2 to eliminate concat2
+ auto pool2Dst = net->castTensor(pool2Dims, concat2Dst, upsample2Dims);
+ auto pool2 = net->addPool(conv2->getDst(), pool2Dst);
+
+ // conv3
+ auto conv3 = net->addConv("conv3", pool2->getDst(), temp0);
+
+ // pool3
+ // Adjust pointer for pool3 to eliminate concat3
+ auto pool3Dst = net->castTensor(pool3Dims, concat3Dst, upsample3Dims);
+ auto pool3 = net->addPool(conv3->getDst(), pool3Dst);
+
+ // conv4
+ auto conv4 = net->addConv("conv4", pool3->getDst(), temp0);
+
+ // pool4
+ // Adjust pointer for pool4 to eliminate concat4
+ auto pool4Dst = net->castTensor(pool4Dims, concat4Dst, upsample4Dims);
+ auto pool4 = net->addPool(conv4->getDst(), pool4Dst);
+
+ // conv5
+ auto conv5 = net->addConv("conv5", pool4->getDst(), temp0);
+
+ // pool5
+ auto pool5 = net->addPool(conv5->getDst(), temp1);
+
+ // upsample4
+ auto upsample4Dst = net->castTensor(upsample4Dims, concat4Dst);
+ auto upsample4 = net->addUpsample(pool5->getDst(), upsample4Dst);
+
+ // conv6
+ auto conv6 = net->addConv("conv6", concat4Dst, temp0);
+
+ // conv6b
+ auto conv6b = net->addConv("conv6b", conv6->getDst(), temp1);
+
+ // upsample3
+ auto upsample3Dst = net->castTensor(upsample3Dims, concat3Dst);
+ auto upsample3 = net->addUpsample(conv6b->getDst(), upsample3Dst);
+
+ // conv7
+ auto conv7 = net->addConv("conv7", concat3Dst, temp0);
+
+ // conv7b
+ auto conv7b = net->addConv("conv7b", conv7->getDst(), temp1);
+
+ // upsample2
+ auto upsample2Dst = net->castTensor(upsample2Dims, concat2Dst);
+ auto upsample2 = net->addUpsample(conv7b->getDst(), upsample2Dst);
+
+ // conv8
+ auto conv8 = net->addConv("conv8", concat2Dst, temp0);
+
+ // conv8b
+ auto conv8b = net->addConv("conv8b", conv8->getDst(), temp1);
+
+ // upsample1
+ auto upsample1Dst = net->castTensor(upsample1Dims, concat1Dst);
+ auto upsample1 = net->addUpsample(conv8b->getDst(), upsample1Dst);
+
+ // conv9
+ auto conv9 = net->addConv("conv9", concat1Dst, temp0);
+
+ // conv9b
+ auto conv9b = net->addConv("conv9b", conv9->getDst(), temp1);
+
+ // upsample0
+ auto upsample0Dst = net->castTensor(upsample0Dims, concat0Dst);
+ auto upsample0 = net->addUpsample(conv9b->getDst(), upsample0Dst);
+
+ // conv10
+ auto conv10 = net->addConv("conv10", concat0Dst, temp0);
+
+ // conv10b
+ auto conv10b = net->addConv("conv10b", conv10->getDst(), temp1);
+
+ // conv11
+ auto conv11 = net->addConv("conv11", conv10b->getDst(), temp0, false /* no relu */);
+
+ // Output reorder
+ outputReorder = net->addOutputReorder(conv11->getDst(), transferFunc, output);
+
+ net->finalize();
+ return net;
+ }
+
+ std::shared_ptr<TransferFunction> AutoencoderFilter::makeTransferFunc()
+ {
+ if (hdr)
+ return std::make_shared<PQXTransferFunction>();
+ else if (srgb)
+ return std::make_shared<LinearTransferFunction>();
+ else
+ return std::make_shared<GammaTransferFunction>();
+ }
+
+// Godot doesn't need Raytracing filters. Removing them saves space in the weights files.
+#if 0
+ // --------------------------------------------------------------------------
+ // RTFilter
+ // --------------------------------------------------------------------------
+
+ namespace weights
+ {
+ // LDR
+ extern unsigned char rt_ldr[]; // color
+ extern unsigned char rt_ldr_alb[]; // color, albedo
+ extern unsigned char rt_ldr_alb_nrm[]; // color, albedo, normal
+
+ // HDR
+ extern unsigned char rt_hdr[]; // color
+ extern unsigned char rt_hdr_alb[]; // color, albedo
+ extern unsigned char rt_hdr_alb_nrm[]; // color, albedo, normal
+ }
+
+ RTFilter::RTFilter(const Ref<Device>& device)
+ : AutoencoderFilter(device)
+ {
+ weightData.ldr = weights::rt_ldr;
+ weightData.ldr_alb = weights::rt_ldr_alb;
+ weightData.ldr_alb_nrm = weights::rt_ldr_alb_nrm;
+ weightData.hdr = weights::rt_hdr;
+ weightData.hdr_alb = weights::rt_hdr_alb;
+ weightData.hdr_alb_nrm = weights::rt_hdr_alb_nrm;
+ }
+#endif
+
+ // --------------------------------------------------------------------------
+ // RTLightmapFilter
+ // --------------------------------------------------------------------------
+
+ namespace weights
+ {
+ // HDR
+ extern unsigned char rtlightmap_hdr[]; // color
+ }
+
+ RTLightmapFilter::RTLightmapFilter(const Ref<Device>& device)
+ : AutoencoderFilter(device)
+ {
+ weightData.hdr = weights::rtlightmap_hdr;
+
+ hdr = true;
+ }
+
+ std::shared_ptr<TransferFunction> RTLightmapFilter::makeTransferFunc()
+ {
+ return std::make_shared<LogTransferFunction>();
+ }
+
+} // namespace oidn
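
The tiling in execute() and computeTileSize() is easier to see in isolation: each tile reads up to tileH rows, including 2*overlap rows shared with its neighbours, but only writes its non-overlapping middle. A one-dimensional sketch of that arithmetic, assuming roundUp/ceilDiv helpers equivalent to the ones used above:

#include <algorithm>
#include <cstdio>

// Stand-ins for the helpers used by AutoencoderFilter (assumed equivalents).
static int roundUp(int a, int b) { return ((a + b - 1) / b) * b; }
static int ceilDiv(int a, int b) { return (a + b - 1) / b; }

// Restates the per-tile arithmetic of AutoencoderFilter::execute() in 1D.
// Assumes tileH > 2*overlap, which computeTileSize() guarantees.
void printTiles1D(int H, int tileH, int overlap, int alignment)
{
  const int tileCountH = (H > tileH) ? ceilDiv(H - 2*overlap, tileH - 2*overlap) : 1;

  for (int i = 0; i < tileCountH; ++i)
  {
    const int h            = i * (tileH - 2*overlap);             // input tile start (with overlap)
    const int overlapBegin = (i > 0)            ? overlap : 0;    // overlap on the top
    const int overlapEnd   = (i < tileCountH-1) ? overlap : 0;    // overlap on the bottom
    const int tileH1       = std::min(H - h, tileH);              // input tile size
    const int tileH2       = tileH1 - overlapBegin - overlapEnd;  // output (written) size
    const int alignOffsetH = tileH - roundUp(tileH1, alignment);  // align to tile buffer end

    std::printf("tile %d: reads rows [%d, %d), writes rows [%d, %d), align offset %d\n",
                i, h, h + tileH1, h + overlapBegin, h + overlapBegin + tileH2, alignOffsetH);
  }
}

For example, with H = 1000, tileH = 448 and the overlap = 128 implied by the constants in autoencoder.h below, this produces four tiles whose written rows [0,320), [320,512), [512,704) and [704,1000) cover the image exactly once.
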
diff --git a/thirdparty/oidn/core/autoencoder.h b/thirdparty/oidn/core/autoencoder.h
new file mode 100644
index 0000000000..97432f2bbd
--- /dev/null
+++ b/thirdparty/oidn/core/autoencoder.h
@@ -0,0 +1,116 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "filter.h"
+#include "network.h"
+#include "transfer_function.h"
+
+namespace oidn {
+
+ // --------------------------------------------------------------------------
+ // AutoencoderFilter - Direct-predicting autoencoder
+ // --------------------------------------------------------------------------
+
+ class AutoencoderFilter : public Filter
+ {
+ protected:
+ static constexpr int alignment = 32; // required spatial alignment in pixels (padding may be necessary)
+ static constexpr int receptiveField = 222; // receptive field in pixels
+ static constexpr int overlap = roundUp(receptiveField / 2, alignment); // required spatial overlap between tiles in pixels
+
+ static constexpr int estimatedBytesBase = 16*1024*1024; // estimated base memory usage
+ static constexpr int estimatedBytesPerPixel8 = 889; // estimated memory usage per pixel for K=8
+ static constexpr int estimatedBytesPerPixel16 = 2185; // estimated memory usage per pixel for K=16
+
+ Image color;
+ Image albedo;
+ Image normal;
+ Image output;
+ bool hdr = false;
+ float hdrScale = std::numeric_limits<float>::quiet_NaN();
+ bool srgb = false;
+ int maxMemoryMB = 6000; // approximate maximum memory usage in MBs
+
+ int H = 0; // image height
+ int W = 0; // image width
+ int tileH = 0; // tile height
+ int tileW = 0; // tile width
+ int tileCountH = 1; // number of tiles in H dimension
+ int tileCountW = 1; // number of tiles in W dimension
+
+ std::shared_ptr<Executable> net;
+ std::shared_ptr<Node> inputReorder;
+ std::shared_ptr<Node> outputReorder;
+
+ struct
+ {
+ void* ldr = nullptr;
+ void* ldr_alb = nullptr;
+ void* ldr_alb_nrm = nullptr;
+ void* hdr = nullptr;
+ void* hdr_alb = nullptr;
+ void* hdr_alb_nrm = nullptr;
+ } weightData;
+
+ explicit AutoencoderFilter(const Ref<Device>& device);
+ virtual std::shared_ptr<TransferFunction> makeTransferFunc();
+
+ public:
+ void setImage(const std::string& name, const Image& data) override;
+ void set1i(const std::string& name, int value) override;
+ int get1i(const std::string& name) override;
+ void set1f(const std::string& name, float value) override;
+ float get1f(const std::string& name) override;
+
+ void commit() override;
+ void execute() override;
+
+ private:
+ void computeTileSize();
+
+ template<int K>
+ std::shared_ptr<Executable> buildNet();
+
+ bool isCommitted() const { return bool(net); }
+ };
+
+ // --------------------------------------------------------------------------
+ // RTFilter - Generic ray tracing denoiser
+ // --------------------------------------------------------------------------
+
+// Godot doesn't need Raytracing filters. Removing them saves space in the weights files.
+#if 0
+ class RTFilter : public AutoencoderFilter
+ {
+ public:
+ explicit RTFilter(const Ref<Device>& device);
+ };
+#endif
+
+ // --------------------------------------------------------------------------
+ // RTLightmapFilter - Ray traced lightmap denoiser
+ // --------------------------------------------------------------------------
+
+ class RTLightmapFilter : public AutoencoderFilter
+ {
+ public:
+ explicit RTLightmapFilter(const Ref<Device>& device);
+ std::shared_ptr<TransferFunction> makeTransferFunc() override;
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/buffer.h b/thirdparty/oidn/core/buffer.h
new file mode 100644
index 0000000000..b95109152e
--- /dev/null
+++ b/thirdparty/oidn/core/buffer.h
@@ -0,0 +1,75 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+#include "device.h"
+
+namespace oidn {
+
+ class Device;
+
+ // Buffer which may or may not own its data
+ class Buffer : public RefCount
+ {
+ private:
+ char* ptr;
+ size_t byteSize;
+ bool shared;
+ Ref<Device> device;
+
+ public:
+ __forceinline Buffer(const Ref<Device>& device, size_t size)
+ : ptr((char*)alignedMalloc(size, 64)),
+ byteSize(size),
+ shared(false),
+ device(device) {}
+
+ __forceinline Buffer(const Ref<Device>& device, void* data, size_t size)
+ : ptr((char*)data),
+ byteSize(size),
+ shared(true),
+ device(device)
+ {
+ if (data == nullptr)
+ throw Exception(Error::InvalidArgument, "buffer pointer null");
+ }
+
+ __forceinline ~Buffer()
+ {
+ if (!shared)
+ alignedFree(ptr);
+ }
+
+ __forceinline char* data() { return ptr; }
+ __forceinline const char* data() const { return ptr; }
+ __forceinline size_t size() const { return byteSize; }
+
+ void* map(size_t offset, size_t size)
+ {
+ if (offset + size > byteSize)
+ throw Exception(Error::InvalidArgument, "buffer region out of range");
+
+ return ptr + offset;
+ }
+
+ void unmap(void* mappedPtr) {}
+
+ Device* getDevice() { return device.get(); }
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/common.h b/thirdparty/oidn/core/common.h
new file mode 100644
index 0000000000..6c87f377bc
--- /dev/null
+++ b/thirdparty/oidn/core/common.h
@@ -0,0 +1,133 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common/platform.h"
+
+#include "mkl-dnn/include/mkldnn.hpp"
+#include "mkl-dnn/include/mkldnn_debug.h"
+#include "mkl-dnn/src/common/mkldnn_thread.hpp"
+#include "mkl-dnn/src/common/type_helpers.hpp"
+#include "mkl-dnn/src/cpu/jit_generator.hpp"
+
+#include "common/ref.h"
+#include "common/exception.h"
+#include "common/thread.h"
+#include "math.h"
+
+namespace oidn {
+
+ using namespace mkldnn;
+ using namespace mkldnn::impl::cpu;
+ using mkldnn::impl::parallel_nd;
+ using mkldnn::impl::memory_desc_matches_tag;
+
+
+ inline size_t getFormatBytes(Format format)
+ {
+ switch (format)
+ {
+ case Format::Undefined: return 1;
+ case Format::Float: return sizeof(float);
+ case Format::Float2: return sizeof(float)*2;
+ case Format::Float3: return sizeof(float)*3;
+ case Format::Float4: return sizeof(float)*4;
+ }
+ assert(0);
+ return 0;
+ }
+
+
+ inline memory::dims getTensorDims(const std::shared_ptr<memory>& mem)
+ {
+ const mkldnn_memory_desc_t& desc = mem->get_desc().data;
+ return memory::dims(&desc.dims[0], &desc.dims[desc.ndims]);
+ }
+
+ inline memory::data_type getTensorType(const std::shared_ptr<memory>& mem)
+ {
+ const mkldnn_memory_desc_t& desc = mem->get_desc().data;
+ return memory::data_type(desc.data_type);
+ }
+
+ // Returns the number of values in a tensor
+ inline size_t getTensorSize(const memory::dims& dims)
+ {
+ size_t res = 1;
+ for (int i = 0; i < (int)dims.size(); ++i)
+ res *= dims[i];
+ return res;
+ }
+
+ inline memory::dims getMaxTensorDims(const std::vector<memory::dims>& dims)
+ {
+ memory::dims result;
+ size_t maxSize = 0;
+
+ for (const auto& d : dims)
+ {
+ const size_t size = getTensorSize(d);
+ if (size > maxSize)
+ {
+ result = d;
+ maxSize = size;
+ }
+ }
+
+ return result;
+ }
+
+ inline size_t getTensorSize(const std::shared_ptr<memory>& mem)
+ {
+ return getTensorSize(getTensorDims(mem));
+ }
+
+
+ template<int K>
+ inline int getPadded(int dim)
+ {
+ return (dim + (K-1)) & ~(K-1);
+ }
+
+ template<int K>
+ inline memory::dims getPadded_nchw(const memory::dims& dims)
+ {
+ assert(dims.size() == 4);
+ memory::dims padDims = dims;
+ padDims[1] = getPadded<K>(dims[1]); // pad C
+ return padDims;
+ }
+
+
+ template<int K>
+ struct BlockedFormat;
+
+ template<>
+ struct BlockedFormat<8>
+ {
+ static constexpr memory::format_tag nChwKc = memory::format_tag::nChw8c;
+ static constexpr memory::format_tag OIhwKiKo = memory::format_tag::OIhw8i8o;
+ };
+
+ template<>
+ struct BlockedFormat<16>
+ {
+ static constexpr memory::format_tag nChwKc = memory::format_tag::nChw16c;
+ static constexpr memory::format_tag OIhwKiKo = memory::format_tag::OIhw16i16o;
+ };
+
+} // namespace oidn
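
getPadded<K>() rounds a channel count up to the next multiple of K with the usual power-of-two mask trick, and getPadded_nchw<K>() applies it to the C dimension only. A few worked values, as a standalone sketch (the helper below merely restates the formula; it is not the OIDN function itself):

// Restates the rounding used by getPadded<K>() above; valid only for power-of-two K.
template<int K>
constexpr int padded(int dim) { return (dim + (K - 1)) & ~(K - 1); }

static_assert(padded<8>(3)   == 8,  "3 input channels pad to one 8-wide block");
static_assert(padded<8>(9)   == 16, "9 channels need two blocks of 8");
static_assert(padded<16>(9)  == 16, "with K = 16 (AVX-512) they fit in one block");
static_assert(padded<16>(32) == 32, "already aligned values are unchanged");
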
diff --git a/thirdparty/oidn/core/device.cpp b/thirdparty/oidn/core/device.cpp
new file mode 100644
index 0000000000..0812624bb5
--- /dev/null
+++ b/thirdparty/oidn/core/device.cpp
@@ -0,0 +1,205 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "device.h"
+#include "autoencoder.h"
+
+namespace oidn {
+
+ thread_local Device::ErrorState Device::globalError;
+
+ Device::Device()
+ {
+ if (!mayiuse(sse41))
+ throw Exception(Error::UnsupportedHardware, "SSE4.1 support is required at minimum");
+ }
+
+ Device::~Device()
+ {
+ }
+
+ void Device::setError(Device* device, Error code, const std::string& message)
+ {
+ // Update the stored error only if the previous error was queried
+ if (device)
+ {
+ ErrorState& curError = device->error.get();
+
+ if (curError.code == Error::None)
+ {
+ curError.code = code;
+ curError.message = message;
+ }
+
+ // Print the error message in verbose mode
+ if (device->isVerbose())
+ std::cerr << "Error: " << message << std::endl;
+
+ // Call the error callback function
+ ErrorFunction errorFunc;
+ void* errorUserPtr;
+
+ {
+ std::lock_guard<std::mutex> lock(device->mutex);
+ errorFunc = device->errorFunc;
+ errorUserPtr = device->errorUserPtr;
+ }
+
+ if (errorFunc)
+ errorFunc(errorUserPtr, code, (code == Error::None) ? nullptr : message.c_str());
+ }
+ else
+ {
+ if (globalError.code == Error::None)
+ {
+ globalError.code = code;
+ globalError.message = message;
+ }
+ }
+ }
+
+ Error Device::getError(Device* device, const char** outMessage)
+ {
+ // Return and clear the stored error code, but keep the error message so pointers to it will
+ // remain valid until the next getError call
+ if (device)
+ {
+ ErrorState& curError = device->error.get();
+ const Error code = curError.code;
+ if (outMessage)
+ *outMessage = (code == Error::None) ? nullptr : curError.message.c_str();
+ curError.code = Error::None;
+ return code;
+ }
+ else
+ {
+ const Error code = globalError.code;
+ if (outMessage)
+ *outMessage = (code == Error::None) ? nullptr : globalError.message.c_str();
+ globalError.code = Error::None;
+ return code;
+ }
+ }
+
+ void Device::setErrorFunction(ErrorFunction func, void* userPtr)
+ {
+ errorFunc = func;
+ errorUserPtr = userPtr;
+ }
+
+ int Device::get1i(const std::string& name)
+ {
+ if (name == "numThreads")
+ return numThreads;
+ else if (name == "setAffinity")
+ return setAffinity;
+ else if (name == "verbose")
+ return verbose;
+ else if (name == "version")
+ return OIDN_VERSION;
+ else if (name == "versionMajor")
+ return OIDN_VERSION_MAJOR;
+ else if (name == "versionMinor")
+ return OIDN_VERSION_MINOR;
+ else if (name == "versionPatch")
+ return OIDN_VERSION_PATCH;
+ else
+ throw Exception(Error::InvalidArgument, "invalid parameter");
+ }
+
+ void Device::set1i(const std::string& name, int value)
+ {
+ if (name == "numThreads")
+ numThreads = value;
+ else if (name == "setAffinity")
+ setAffinity = value;
+ else if (name == "verbose")
+ {
+ verbose = value;
+ error.verbose = value;
+ }
+
+ dirty = true;
+ }
+
+ void Device::commit()
+ {
+ if (isCommitted())
+ throw Exception(Error::InvalidOperation, "device can be committed only once");
+
+ // Create the task arena
+ const int maxNumThreads = 1; //affinity ? affinity->getNumThreads() : tbb::this_task_arena::max_concurrency();
+ numThreads = (numThreads > 0) ? min(numThreads, maxNumThreads) : maxNumThreads;
+
+ dirty = false;
+
+ if (isVerbose())
+ print();
+ }
+
+ void Device::checkCommitted()
+ {
+ if (dirty)
+ throw Exception(Error::InvalidOperation, "changes to the device are not committed");
+ }
+
+ Ref<Buffer> Device::newBuffer(size_t byteSize)
+ {
+ checkCommitted();
+ return makeRef<Buffer>(Ref<Device>(this), byteSize);
+ }
+
+ Ref<Buffer> Device::newBuffer(void* ptr, size_t byteSize)
+ {
+ checkCommitted();
+ return makeRef<Buffer>(Ref<Device>(this), ptr, byteSize);
+ }
+
+ Ref<Filter> Device::newFilter(const std::string& type)
+ {
+ checkCommitted();
+
+ if (isVerbose())
+ std::cout << "Filter: " << type << std::endl;
+
+ Ref<Filter> filter;
+
+// Godot doesn't need Raytracing filters. Removing them saves space in the weights files.
+#if 0
+ if (type == "RT")
+ filter = makeRef<RTFilter>(Ref<Device>(this));
+#endif
+ if (type == "RTLightmap")
+ filter = makeRef<RTLightmapFilter>(Ref<Device>(this));
+ else
+ throw Exception(Error::InvalidArgument, "unknown filter type");
+
+ return filter;
+ }
+
+ void Device::print()
+ {
+ std::cout << std::endl;
+
+ std::cout << "Intel(R) Open Image Denoise " << OIDN_VERSION_STRING << std::endl;
+ std::cout << " Compiler: " << getCompilerName() << std::endl;
+ std::cout << " Build : " << getBuildName() << std::endl;
+ std::cout << " Platform: " << getPlatformName() << std::endl;
+
+ std::cout << std::endl;
+ }
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/device.h b/thirdparty/oidn/core/device.h
new file mode 100644
index 0000000000..93a83eb731
--- /dev/null
+++ b/thirdparty/oidn/core/device.h
@@ -0,0 +1,78 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+
+namespace oidn {
+
+ class Buffer;
+ class Filter;
+
+ class Device : public RefCount, public Verbose
+ {
+ private:
+ // Thread-safety
+ std::mutex mutex;
+
+ // Error handling
+ struct ErrorState
+ {
+ Error code = Error::None;
+ std::string message;
+ };
+
+ static thread_local ErrorState globalError;
+ ThreadLocal<ErrorState> error;
+ ErrorFunction errorFunc = nullptr;
+ void* errorUserPtr = nullptr;
+
+ // Parameters
+ int numThreads = 0; // autodetect by default
+ bool setAffinity = true;
+
+ bool dirty = true;
+
+ public:
+ Device();
+ ~Device();
+
+ static void setError(Device* device, Error code, const std::string& message);
+ static Error getError(Device* device, const char** outMessage);
+
+ void setErrorFunction(ErrorFunction func, void* userPtr);
+
+ int get1i(const std::string& name);
+ void set1i(const std::string& name, int value);
+
+ void commit();
+
+ Ref<Buffer> newBuffer(size_t byteSize);
+ Ref<Buffer> newBuffer(void* ptr, size_t byteSize);
+ Ref<Filter> newFilter(const std::string& type);
+
+ __forceinline Device* getDevice() { return this; }
+ __forceinline std::mutex& getMutex() { return mutex; }
+
+ private:
+ bool isCommitted() const { return false; }
+ void checkCommitted();
+
+ void print();
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/filter.cpp b/thirdparty/oidn/core/filter.cpp
new file mode 100644
index 0000000000..ec1f10af87
--- /dev/null
+++ b/thirdparty/oidn/core/filter.cpp
@@ -0,0 +1,27 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "filter.h"
+
+namespace oidn {
+
+ void Filter::setProgressMonitorFunction(ProgressMonitorFunction func, void* userPtr)
+ {
+ progressFunc = func;
+ progressUserPtr = userPtr;
+ }
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/filter.h b/thirdparty/oidn/core/filter.h
new file mode 100644
index 0000000000..935fa202f4
--- /dev/null
+++ b/thirdparty/oidn/core/filter.h
@@ -0,0 +1,52 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+#include "device.h"
+#include "image.h"
+
+namespace oidn {
+
+ class Filter : public RefCount
+ {
+ protected:
+ Ref<Device> device;
+
+ ProgressMonitorFunction progressFunc = nullptr;
+ void* progressUserPtr = nullptr;
+
+ bool dirty = true;
+
+ public:
+ explicit Filter(const Ref<Device>& device) : device(device) {}
+
+ virtual void setImage(const std::string& name, const Image& data) = 0;
+ virtual void set1i(const std::string& name, int value) = 0;
+ virtual int get1i(const std::string& name) = 0;
+ virtual void set1f(const std::string& name, float value) = 0;
+ virtual float get1f(const std::string& name) = 0;
+
+ void setProgressMonitorFunction(ProgressMonitorFunction func, void* userPtr);
+
+ virtual void commit() = 0;
+ virtual void execute() = 0;
+
+ Device* getDevice() { return device.get(); }
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/image.h b/thirdparty/oidn/core/image.h
new file mode 100644
index 0000000000..748f49c4e5
--- /dev/null
+++ b/thirdparty/oidn/core/image.h
@@ -0,0 +1,111 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+#include "buffer.h"
+
+namespace oidn {
+
+ struct Image
+ {
+ static constexpr int maxSize = 65536;
+
+ char* ptr; // pointer to the first pixel
+ int width; // width in number of pixels
+ int height; // height in number of pixels
+ size_t bytePixelStride; // pixel stride in number of *bytes*
+ size_t rowStride; // row stride in number of *pixel strides*
+ Format format; // pixel format
+ Ref<Buffer> buffer; // buffer containing the image data
+
+ Image() : ptr(nullptr), width(0), height(0), bytePixelStride(0), rowStride(0), format(Format::Undefined) {}
+
+ Image(void* ptr, Format format, int width, int height, size_t byteOffset, size_t inBytePixelStride, size_t inByteRowStride)
+ {
+ if (ptr == nullptr)
+ throw Exception(Error::InvalidArgument, "buffer pointer null");
+
+ init((char*)ptr + byteOffset, format, width, height, inBytePixelStride, inByteRowStride);
+ }
+
+ Image(const Ref<Buffer>& buffer, Format format, int width, int height, size_t byteOffset, size_t inBytePixelStride, size_t inByteRowStride)
+ {
+ init(buffer->data() + byteOffset, format, width, height, inBytePixelStride, inByteRowStride);
+
+ if (byteOffset + height * rowStride * bytePixelStride > buffer->size())
+ throw Exception(Error::InvalidArgument, "buffer region out of range");
+ }
+
+ void init(char* ptr, Format format, int width, int height, size_t inBytePixelStride, size_t inByteRowStride)
+ {
+ assert(width >= 0);
+ assert(height >= 0);
+ if (width > maxSize || height > maxSize)
+ throw Exception(Error::InvalidArgument, "image size too large");
+
+ this->ptr = ptr;
+ this->width = width;
+ this->height = height;
+
+ const size_t pixelSize = getFormatBytes(format);
+ if (inBytePixelStride != 0)
+ {
+ if (inBytePixelStride < pixelSize)
+ throw Exception(Error::InvalidArgument, "pixel stride smaller than pixel size");
+
+ this->bytePixelStride = inBytePixelStride;
+ }
+ else
+ {
+ this->bytePixelStride = pixelSize;
+ }
+
+ if (inByteRowStride != 0)
+ {
+ if (inByteRowStride < width * this->bytePixelStride)
+ throw Exception(Error::InvalidArgument, "row stride smaller than width * pixel stride");
+ if (inByteRowStride % this->bytePixelStride != 0)
+ throw Exception(Error::InvalidArgument, "row stride not integer multiple of pixel stride");
+
+ this->rowStride = inByteRowStride / this->bytePixelStride;
+ }
+ else
+ {
+ this->rowStride = width;
+ }
+
+ this->format = format;
+ }
+
+ __forceinline char* get(int y, int x)
+ {
+ return ptr + ((size_t(y) * rowStride + size_t(x)) * bytePixelStride);
+ }
+
+ __forceinline const char* get(int y, int x) const
+ {
+ return ptr + ((size_t(y) * rowStride + size_t(x)) * bytePixelStride);
+ }
+
+ operator bool() const
+ {
+ return ptr != nullptr;
+ }
+ };
+
+} // namespace oidn
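
Image stores rowStride in units of pixel strides, so get(y, x) reduces to a single multiply-add chain, and init() fills in both strides when the caller passes 0. A standalone restatement of that addressing with illustrative numbers (not the OIDN API itself):

#include <cassert>
#include <cstddef>

// Restates the Image::get() addressing above (illustration only).
static size_t imageOffset(size_t rowStride, size_t bytePixelStride, int y, int x)
{
  return (size_t(y) * rowStride + size_t(x)) * bytePixelStride;
}

int main()
{
  // A 640-wide Float3 image with 16-byte padded pixels and 16384-byte rows:
  // rowStride = byteRowStride / bytePixelStride, i.e. it is stored in *pixel strides*,
  // exactly as Image::init() computes it.
  const size_t bytePixelStride = 16;
  const size_t rowStride       = 16384 / 16; // = 1024 pixel strides per row, padding included

  // Pixel (y = 2, x = 3) therefore starts 2*16384 + 3*16 = 32816 bytes into the image.
  assert(imageOffset(rowStride, bytePixelStride, 2, 3) == 32816);
  return 0;
}
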
diff --git a/thirdparty/oidn/core/input_reorder.h b/thirdparty/oidn/core/input_reorder.h
new file mode 100644
index 0000000000..966856afe9
--- /dev/null
+++ b/thirdparty/oidn/core/input_reorder.h
@@ -0,0 +1,232 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "node.h"
+#include "image.h"
+
+namespace oidn {
+
+ // Input reorder node
+ template<int K, class TransferFunction>
+ class InputReorderNode : public Node
+ {
+ private:
+ // Source
+ Image color;
+ Image albedo;
+ Image normal;
+
+ // Destination
+ std::shared_ptr<memory> dst;
+ float* dstPtr;
+ int C2;
+ int H2;
+ int W2;
+
+ // Tile
+ int h1Begin;
+ int w1Begin;
+ int h2Begin;
+ int w2Begin;
+ int H;
+ int W;
+
+ std::shared_ptr<TransferFunction> transferFunc;
+
+ public:
+ InputReorderNode(const Image& color,
+ const Image& albedo,
+ const Image& normal,
+ const std::shared_ptr<memory>& dst,
+ const std::shared_ptr<TransferFunction>& transferFunc)
+ : color(color), albedo(albedo), normal(normal),
+ dst(dst),
+ h1Begin(0), w1Begin(0),
+ H(color.height), W(color.width),
+ transferFunc(transferFunc)
+ {
+ const mkldnn_memory_desc_t& dstDesc = dst->get_desc().data;
+ assert(memory_desc_matches_tag(dstDesc, mkldnn_format_tag_t(BlockedFormat<K>::nChwKc)));
+ assert(dstDesc.ndims == 4);
+ assert(dstDesc.data_type == memory::data_type::f32);
+ assert(dstDesc.dims[0] == 1);
+ //assert(dstDesc.dims[1] >= getPadded<K>(C1));
+
+ dstPtr = (float*)dst->get_data_handle();
+ C2 = dstDesc.dims[1];
+ H2 = dstDesc.dims[2];
+ W2 = dstDesc.dims[3];
+ }
+
+ void setTile(int h1, int w1, int h2, int w2, int H, int W) override
+ {
+ h1Begin = h1;
+ w1Begin = w1;
+ h2Begin = h2;
+ w2Begin = w2;
+ this->H = H;
+ this->W = W;
+ }
+
+ void execute(stream& sm) override
+ {
+ assert(H + h1Begin <= color.height);
+ assert(W + w1Begin <= color.width);
+ assert(H + h2Begin <= H2);
+ assert(W + w2Begin <= W2);
+
+ parallel_nd(H2, [&](int h2)
+ {
+ const int h = h2 - h2Begin;
+
+ if (h >= 0 && h < H)
+ {
+ const int h1 = h + h1Begin;
+
+ // Zero pad
+ for (int w2 = 0; w2 < w2Begin; ++w2)
+ {
+ int c = 0;
+ while (c < C2)
+ store(h2, w2, c, 0.f);
+ }
+
+ // Reorder
+ for (int w = 0; w < W; ++w)
+ {
+ const int w1 = w + w1Begin;
+ const int w2 = w + w2Begin;
+
+ int c = 0;
+ storeColor(h2, w2, c, (float*)color.get(h1, w1));
+ if (albedo)
+ storeAlbedo(h2, w2, c, (float*)albedo.get(h1, w1));
+ if (normal)
+ storeNormal(h2, w2, c, (float*)normal.get(h1, w1));
+ while (c < C2)
+ store(h2, w2, c, 0.f);
+ }
+
+ // Zero pad
+ for (int w2 = W + w2Begin; w2 < W2; ++w2)
+ {
+ int c = 0;
+ while (c < C2)
+ store(h2, w2, c, 0.f);
+ }
+ }
+ else
+ {
+ // Zero pad
+ for (int w2 = 0; w2 < W2; ++w2)
+ {
+ int c = 0;
+ while (c < C2)
+ store(h2, w2, c, 0.f);
+ }
+ }
+ });
+ }
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+
+ private:
+ // Stores a single value
+ __forceinline void store(int h, int w, int& c, float value)
+ {
+ // Destination is in nChwKc format
+ float* dst_c = dstPtr + (H2*W2*K*(c/K)) + h*W2*K + w*K + (c%K);
+ *dst_c = value;
+ c++;
+ }
+
+ // Stores a color
+ __forceinline void storeColor(int h, int w, int& c, const float* values)
+ {
+ #pragma unroll
+ for (int i = 0; i < 3; ++i)
+ {
+ // Load the value
+ float x = values[i];
+
+ // Sanitize the value
+ x = maxSafe(x, 0.f);
+
+ // Apply the transfer function
+ x = transferFunc->forward(x);
+
+ // Store the value
+ store(h, w, c, x);
+ }
+ }
+
+ // Stores an albedo
+ __forceinline void storeAlbedo(int h, int w, int& c, const float* values)
+ {
+ #pragma unroll
+ for (int i = 0; i < 3; ++i)
+ {
+ // Load the value
+ float x = values[i];
+
+ // Sanitize the value
+ x = clampSafe(x, 0.f, 1.f);
+
+ // Store the value
+ store(h, w, c, x);
+ }
+ }
+
+ // Stores a normal
+ __forceinline void storeNormal(int h, int w, int& c, const float* values)
+ {
+ // Load the normal
+ float x = values[0];
+ float y = values[1];
+ float z = values[2];
+
+ // Compute the length of the normal
+ const float lengthSqr = sqr(x) + sqr(y) + sqr(z);
+
+ // Normalize the normal and transform it to [0..1]
+ if (isfinite(lengthSqr))
+ {
+ const float invLength = (lengthSqr > minVectorLengthSqr) ? rsqrt(lengthSqr) : 1.f;
+
+ const float scale = invLength * 0.5f;
+ const float offset = 0.5f;
+
+ x = x * scale + offset;
+ y = y * scale + offset;
+ z = z * scale + offset;
+ }
+ else
+ {
+ x = 0.f;
+ y = 0.f;
+ z = 0.f;
+ }
+
+ // Store the normal
+ store(h, w, c, x);
+ store(h, w, c, y);
+ store(h, w, c, z);
+ }
+ };
+
+} // namespace oidn
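InputReorderNode::store() above writes into the blocked nChwKc layout: channel c goes to block c/K at lane c%K, so the K channels of one block are contiguous per pixel. A small self-contained sketch of that offset computation (K and the tensor sizes below are arbitrary example values):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const int K = 8;                   // channel block size
    const int C2 = 16, H2 = 3, W2 = 5; // padded channels, height, width

    std::vector<float> dst(size_t(C2) * H2 * W2, 0.f);

    // Same offset as store() above: (c/K)-th block, then h, then w, then lane c%K.
    auto offset = [&](int c, int h, int w) {
        return size_t(H2) * W2 * K * (c / K) + size_t(h) * W2 * K + size_t(w) * K + (c % K);
    };

    dst[offset(11, 2, 4)] = 1.f;            // write channel 11 of pixel (h=2, w=4)
    assert(dst[offset(11, 2, 4)] == 1.f);   // read it back through the same mapping

    // Lanes of one block are adjacent in memory, which the aligned SSE loads in
    // upsample.h rely on.
    assert(offset(1, 0, 0) == offset(0, 0, 0) + 1);
    return 0;
}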
diff --git a/thirdparty/oidn/core/math.h b/thirdparty/oidn/core/math.h
new file mode 100644
index 0000000000..a844ef0d1d
--- /dev/null
+++ b/thirdparty/oidn/core/math.h
@@ -0,0 +1,78 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common/platform.h"
+
+namespace oidn {
+
+ constexpr float minVectorLength = 1e-10f;
+ constexpr float minVectorLengthSqr = minVectorLength * minVectorLength;
+
+ using std::log;
+ using std::log2;
+ using std::exp;
+ using std::exp2;
+ using std::pow;
+ using std::isfinite;
+ using std::isnan;
+
+ __forceinline float sqr(float x)
+ {
+ return x * x;
+ }
+
+ __forceinline float rcp(float x)
+ {
+ __m128 r = _mm_rcp_ss(_mm_set_ss(x));
+ return _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(r, r), _mm_mul_ss(_mm_mul_ss(r, r), _mm_set_ss(x))));
+ }
+
+ __forceinline float rsqrt(float x)
+ {
+ __m128 r = _mm_rsqrt_ss(_mm_set_ss(x));
+ return _mm_cvtss_f32(_mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
+ _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(_mm_set_ss(x), _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))));
+ }
+
+ __forceinline float maxSafe(float value, float minValue)
+ {
+ return isfinite(value) ? max(value, minValue) : minValue;
+ }
+
+ __forceinline float clampSafe(float value, float minValue, float maxValue)
+ {
+ return isfinite(value) ? clamp(value, minValue, maxValue) : minValue;
+ }
+
+ // Returns ceil(a / b) for non-negative integers
+ template<class Int>
+ __forceinline constexpr Int ceilDiv(Int a, Int b)
+ {
+ //assert(a >= 0);
+ //assert(b > 0);
+ return (a + b - 1) / b;
+ }
+
+ // Returns a rounded up to multiple of b
+ template<class Int>
+ __forceinline constexpr Int roundUp(Int a, Int b)
+ {
+ return ceilDiv(a, b) * b;
+ }
+
+} // namespace oidn
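rcp and rsqrt above refine the hardware reciprocal and reciprocal-square-root estimates with one Newton-Raphson step; ceilDiv and roundUp are the integer rounding used when padding channel counts and image tiles. A quick standalone check of the rounding helpers with concrete values (the numbers are only examples):

template<class Int>
constexpr Int ceilDiv(Int a, Int b) { return (a + b - 1) / b; } // as defined above

template<class Int>
constexpr Int roundUp(Int a, Int b) { return ceilDiv(a, b) * b; }

int main()
{
    static_assert(ceilDiv(9, 8) == 2,        "9 channels need 2 blocks of 8");
    static_assert(roundUp(9, 8) == 16,       "9 rounds up to the next multiple of 8");
    static_assert(roundUp(16, 8) == 16,      "exact multiples are left unchanged");
    static_assert(roundUp(1080, 16) == 1088, "e.g. a 1080-row image padded to 16-row tiles");
    return 0;
}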
diff --git a/thirdparty/oidn/core/network.cpp b/thirdparty/oidn/core/network.cpp
new file mode 100644
index 0000000000..4da32073cd
--- /dev/null
+++ b/thirdparty/oidn/core/network.cpp
@@ -0,0 +1,434 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "network.h"
+#include "upsample.h"
+#include "weights_reorder.h"
+#include <cstring>
+
+namespace oidn {
+
+ template<int K>
+ Network<K>::Network(const Ref<Device>& device, const std::map<std::string, Tensor>& weightMap)
+ : device(device),
+ eng(engine::cpu, 0),
+ sm(eng),
+ weightMap(weightMap)
+ {
+ }
+
+ template<int K>
+ void Network<K>::execute(const Progress& progress, int taskIndex)
+ {
+ if (progress.func)
+ {
+ const double value = double(taskIndex) / double(progress.taskCount);
+ if (!progress.func(progress.userPtr, value))
+ throw Exception(Error::Cancelled, "execution was cancelled");
+ }
+
+ for (size_t i = 0; i < nodes.size(); ++i)
+ {
+ nodes[i]->execute(sm);
+
+ if (progress.func)
+ {
+ const double value = (double(taskIndex) + double(i+1) / double(nodes.size())) / double(progress.taskCount);
+ if (!progress.func(progress.userPtr, value))
+ throw Exception(Error::Cancelled, "execution was cancelled");
+ }
+ }
+ }
+
+ template<int K>
+ std::shared_ptr<memory> Network<K>::allocTensor(const memory::dims& dims,
+ memory::format_tag format,
+ void* data)
+ {
+ if (format == memory::format_tag::any)
+ {
+ if (dims.size() == 4)
+ format = BlockedFormat<K>::nChwKc;
+ else if (dims.size() == 1)
+ format = memory::format_tag::x;
+ else
+ assert(0);
+ }
+ memory::desc desc(dims, memory::data_type::f32, format);
+ if (data == nullptr)
+ {
+ const size_t bytes = getTensorSize(dims) * sizeof(float);
+ if (format == BlockedFormat<K>::nChwKc)
+ activationAllocBytes += bytes;
+ totalAllocBytes += bytes;
+
+ return std::make_shared<memory>(desc, eng);
+ }
+ else
+ {
+ return std::make_shared<memory>(desc, eng, data);
+ }
+ }
+
+ template<int K>
+ std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
+ const std::shared_ptr<memory>& src,
+ size_t srcOffset,
+ memory::format_tag format)
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ MAYBE_UNUSED(srcDesc);
+ assert(srcDesc.data_type == memory::data_type::f32);
+ assert(getTensorSize(src) >= srcOffset + getTensorSize(dims));
+
+ if (format == memory::format_tag::any)
+ {
+ if (dims.size() == 4)
+ format = BlockedFormat<K>::nChwKc;
+ else if (dims.size() == 1)
+ format = memory::format_tag::x;
+ else
+ assert(0);
+ }
+ memory::desc desc(dims, memory::data_type::f32, format);
+ float* srcPtr = (float*)src->get_data_handle() + srcOffset;
+ return std::make_shared<memory>(desc, eng, srcPtr);
+ }
+
+ template<int K>
+ std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
+ const std::shared_ptr<memory>& src,
+ const memory::dims& srcOffset)
+ {
+ return castTensor(dims, src, getTensorSize(srcOffset));
+ }
+
+ template<int K>
+ void Network<K>::zeroTensor(const std::shared_ptr<memory>& dst)
+ {
+ assert(getTensorType(dst) == memory::data_type::f32);
+ memset(dst->get_data_handle(), 0, getTensorSize(dst)*sizeof(float));
+ }
+
+ template<int K>
+ memory::dims Network<K>::getInputReorderDims(const memory::dims& srcDims, int alignment)
+ {
+ memory::dims dstDims = srcDims;
+ dstDims[1] = getPadded<K>(srcDims[1]); // round up C
+ dstDims[2] = roundUp(srcDims[2], memory::dim(alignment)); // round up H
+ dstDims[3] = roundUp(srcDims[3], memory::dim(alignment)); // round up W
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addInputReorder(const Image& color,
+ const Image& albedo,
+ const Image& normal,
+ const std::shared_ptr<TransferFunction>& transferFunc,
+ int alignment,
+ const std::shared_ptr<memory>& userDst)
+ {
+ assert(color);
+ int inputC = 3;
+ if (albedo) inputC += 3;
+ if (normal) inputC += 3;
+
+ memory::dims srcDims = {1, inputC, color.height, color.width};
+ memory::dims dstDims = getInputReorderDims(srcDims, alignment);
+
+ // Allocate padded memory
+ auto dst = userDst;
+ if (!dst)
+ dst = allocTensor(dstDims);
+
+ // Push node
+ std::shared_ptr<Node> node;
+
+ if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
+ node = std::make_shared<InputReorderNode<K, LinearTransferFunction>>(color, albedo, normal, dst, tf);
+ else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
+ node = std::make_shared<InputReorderNode<K, GammaTransferFunction>>(color, albedo, normal, dst, tf);
+ else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
+ node = std::make_shared<InputReorderNode<K, LogTransferFunction>>(color, albedo, normal, dst, tf);
+ else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
+ node = std::make_shared<InputReorderNode<K, PQXTransferFunction>>(color, albedo, normal, dst, tf);
+ else
+ assert(0);
+
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addOutputReorder(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<TransferFunction>& transferFunc,
+ const Image& output)
+ {
+ memory::dims srcDims = getTensorDims(src);
+ assert(srcDims[1] == K);
+
+ // Push node
+ std::shared_ptr<Node> node;
+
+ if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
+ node = std::make_shared<OutputReorderNode<K, LinearTransferFunction>>(src, output, tf);
+ else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
+ node = std::make_shared<OutputReorderNode<K, GammaTransferFunction>>(src, output, tf);
+ else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
+ node = std::make_shared<OutputReorderNode<K, LogTransferFunction>>(src, output, tf);
+ else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
+ node = std::make_shared<OutputReorderNode<K, PQXTransferFunction>>(src, output, tf);
+ else
+ assert(0);
+
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ memory::dims Network<K>::getConvDims(const std::string& name, const memory::dims& srcDims)
+ {
+ auto b = weightMap[name + "/b"];
+ memory::dims dstDims = srcDims;
+ dstDims[1] = getPadded<K>(b.dims[0]); // dstDims[C] = getPadded(OC)
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addConv(const std::string& name,
+ const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst,
+ bool relu)
+ {
+ const memory::dims strides = {1, 1};
+ const memory::dims padding = {1, 1};
+
+ memory::dims srcDims = getTensorDims(src);
+
+ // Get the weights
+ const auto& W = weightMap[name + "/W"];
+ if (W.ndims() != 4 || W.format != "oihw")
+ throw Exception(Error::InvalidOperation, "invalid convolution weights");
+ memory::dims weightsDims = W.dims;
+ auto userWeights = allocTensor(weightsDims, memory::format_tag::oihw, W.data);
+
+ // Pad the weights
+ memory::dims weightsPadDims = weightsDims;
+ weightsPadDims[1] = getPadded<K>(weightsDims[1]); // IC
+ weightsPadDims[0] = getPadded<K>(weightsDims[0]); // OC
+ assert(srcDims[1] == weightsPadDims[1]); // srcDims[C] == weightsPadDims[IC]
+ auto weightsPad = allocTensor(weightsPadDims, memory::format_tag::oihw);
+ WeightsReorderNode<K>(userWeights, weightsPad).execute(sm);
+
+ // Get the biases
+ const auto& b = weightMap[name + "/b"];
+ if (b.ndims() != 1)
+ throw Exception(Error::InvalidOperation, "invalid convolution biases");
+ memory::dims biasDims = b.dims;
+
+ // Copy/pad the biases
+ memory::dims biasPadDims = {getPadded<K>(biasDims[0])};
+ auto bias = allocTensor(biasPadDims);
+ if (biasDims[0] != biasPadDims[0])
+ memset(bias->get_data_handle(), 0, biasPadDims[0]*sizeof(float));
+ memcpy(bias->get_data_handle(), b.data, biasDims[0]*sizeof(float));
+
+ // Allocate memory for destination
+ memory::dims dstDims = srcDims;
+ dstDims[1] = weightsPadDims[0]; // dstDims[C] = weightsPadDims[OC]
+
+ std::shared_ptr<memory> dst;
+ if (!userDst)
+ dst = allocTensor(dstDims);
+ else if (getTensorDims(userDst) == dstDims)
+ dst = userDst;
+ else
+ dst = castTensor(dstDims, userDst);
+
+ // Create a convolution
+ // Let the convolution primitive choose the weights format
+ auto weightsDesc = memory::desc({ weightsPadDims }, memory::data_type::f32, memory::format_tag::any);
+
+ auto convAlgo = (K == 16) ? convolution_winograd : convolution_direct;
+ auto convDesc = convolution_forward::desc(
+ prop_kind::forward_inference, convAlgo,
+ src->get_desc(),
+ weightsDesc,
+ bias->get_desc(),
+ dst->get_desc(),
+ strides, padding, padding, padding_kind::zero);
+
+ // Incorporate relu
+ mkldnn::primitive_attr convAttr;
+ if (relu)
+ {
+ mkldnn::post_ops ops;
+ ops.append_eltwise(
+ 1.f, // scale factor, not used
+ algorithm::eltwise_relu,
+        0.f,   // negative slope (alpha); 0 gives the standard max(x, 0) ReLU
+ 0.f // unused
+ );
+ convAttr.set_post_ops(ops);
+ }
+ convAttr.set_scratchpad_mode(scratchpad_mode_user);
+
+ auto convPrimDesc = convolution_forward::primitive_desc(convDesc, convAttr, eng);
+
+ // Reorder the weights to the final format, if necessary
+ auto weights = weightsPad;
+ if (convPrimDesc.weights_desc() != weightsPad->get_desc())
+ {
+ weights = std::make_shared<memory>(convPrimDesc.weights_desc(), eng);
+ ReorderNode(weightsPad, weights).execute(sm);
+ }
+
+ // Create convolution node and add it to the net
+ auto node = std::make_shared<ConvNode>(convPrimDesc, src, weights, bias, dst);
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ memory::dims Network<K>::getPoolDims(const memory::dims& srcDims)
+ {
+ memory::dims dstDims = srcDims;
+ dstDims[2] /= 2; // H/2
+ dstDims[3] /= 2; // W/2
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addPool(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst)
+ {
+ const memory::dims kernel = {2, 2};
+ const memory::dims strides = {2, 2};
+ const memory::dims padding = {0, 0};
+
+ memory::dims srcDims = getTensorDims(src);
+ memory::dims dstDims = getPoolDims(srcDims);
+
+ std::shared_ptr<memory> dst;
+ if (!userDst)
+ dst = allocTensor(dstDims);
+ else if (getTensorDims(userDst) == dstDims)
+ dst = userDst;
+ else
+ dst = castTensor(dstDims, userDst);
+
+ auto poolDesc = pooling_forward::desc(
+ prop_kind::forward_inference, pooling_max,
+ src->get_desc(),
+ dst->get_desc(),
+ strides, kernel, padding, padding, padding_kind::zero);
+
+ mkldnn::primitive_attr poolAttr;
+ poolAttr.set_scratchpad_mode(scratchpad_mode_user);
+
+ auto poolPrimDesc = pooling_forward::primitive_desc(poolDesc, poolAttr, eng);
+
+ auto node = std::make_shared<PoolNode>(poolPrimDesc, src, dst);
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ memory::dims Network<K>::getUpsampleDims(const memory::dims& srcDims)
+ {
+ memory::dims dstDims = srcDims;
+ dstDims[2] *= 2; // H*2
+ dstDims[3] *= 2; // W*2
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addUpsample(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst)
+ {
+ memory::dims srcDims = getTensorDims(src);
+ memory::dims dstDims = getUpsampleDims(srcDims);
+
+ std::shared_ptr<memory> dst;
+ if (!userDst)
+ dst = allocTensor(dstDims);
+ else if (getTensorDims(userDst) == dstDims)
+ dst = userDst;
+ else
+ dst = castTensor(dstDims, userDst);
+
+ // Create upsampling node and add it to net
+ auto node = std::make_shared<UpsampleNode<K>>(src, dst);
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ memory::dims Network<K>::getConcatDims(const memory::dims& src1Dims, const memory::dims& src2Dims)
+ {
+ assert(src1Dims[0] == src2Dims[0]); // N
+ assert(src1Dims[2] == src2Dims[2]); // H
+ assert(src1Dims[3] == src2Dims[3]); // W
+
+ memory::dims dstDims = src1Dims;
+ dstDims[1] += src2Dims[1]; // C
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addAutoexposure(const Image& color,
+ const std::shared_ptr<HDRTransferFunction>& transferFunc)
+ {
+ auto node = std::make_shared<AutoexposureNode>(color, transferFunc);
+ nodes.push_back(node);
+ return node;
+ }
+
+ template <int K>
+ void Network<K>::finalize()
+ {
+ // Compute the size of the scratchpad
+ size_t scratchpadSize = 0;
+ for (const auto& node : nodes)
+ scratchpadSize = max(scratchpadSize, node->getScratchpadSize());
+
+ // Allocate the scratchpad
+ memory::dims scratchpadDims = { memory::dim(scratchpadSize) };
+ memory::desc scratchpadDesc(scratchpadDims, memory::data_type::u8, memory::format_tag::x);
+ auto scratchpad = std::make_shared<memory>(scratchpadDesc, eng);
+ activationAllocBytes += scratchpadSize;
+ totalAllocBytes += scratchpadSize;
+
+ // Set the scratchpad for the nodes
+ for (auto& node : nodes)
+ node->setScratchpad(scratchpad);
+
+ // Free the weights
+ weightMap.clear();
+
+ // Print statistics
+ if (device->isVerbose(2))
+ {
+ std::cout << "Activation bytes: " << activationAllocBytes << std::endl;
+ std::cout << "Scratchpad bytes: " << scratchpadSize << std::endl;
+ std::cout << "Total bytes : " << totalAllocBytes << std::endl;
+ }
+ }
+
+ template class Network<8>;
+ template class Network<16>;
+
+} // namespace oidn
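getInputReorderDims() above pads the channel count to a multiple of K and rounds the spatial extent up to the tile alignment. A worked standalone example of that padding, assuming getPadded<K> rounds up to the next multiple of K (which matches how it is used here) and picking an alignment of 16 purely for illustration:

#include <cstdio>

int main()
{
    const int K = 8;          // channel block size (Network<8>)
    const int alignment = 16; // example tile alignment, not taken from the source

    auto roundUp = [](int a, int b) { return (a + b - 1) / b * b; };

    // N, C, H, W of a color+albedo+normal input: 3+3+3 = 9 channels.
    const int src[4] = {1, 9, 1080, 1920};
    const int dst[4] = {
        src[0],
        roundUp(src[1], K),         // C: 9    -> 16
        roundUp(src[2], alignment), // H: 1080 -> 1088
        roundUp(src[3], alignment)  // W: 1920 -> 1920
    };

    std::printf("{%d, %d, %d, %d} -> {%d, %d, %d, %d}\n",
                src[0], src[1], src[2], src[3], dst[0], dst[1], dst[2], dst[3]);
    return 0;
}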
diff --git a/thirdparty/oidn/core/network.h b/thirdparty/oidn/core/network.h
new file mode 100644
index 0000000000..7a696fd355
--- /dev/null
+++ b/thirdparty/oidn/core/network.h
@@ -0,0 +1,112 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "common/tensor.h"
+#include "image.h"
+#include "node.h"
+#include "input_reorder.h"
+#include "output_reorder.h"
+#include "transfer_function.h"
+
+#pragma once
+
+namespace oidn {
+
+ // Progress state
+ struct Progress
+ {
+ ProgressMonitorFunction func;
+ void* userPtr;
+ int taskCount;
+ };
+
+ class Executable
+ {
+ public:
+ virtual ~Executable() {}
+ virtual void execute(const Progress& progress, int taskIndex) = 0;
+ };
+
+ template<int K>
+ class Network : public Executable
+ {
+ public:
+ Network(const Ref<Device>& device, const std::map<std::string, Tensor>& weightMap);
+
+ void execute(const Progress& progress, int taskIndex) override;
+
+ std::shared_ptr<memory> allocTensor(const memory::dims& dims,
+ memory::format_tag format = memory::format_tag::any,
+ void* data = nullptr);
+
+ std::shared_ptr<memory> castTensor(const memory::dims& dims,
+ const std::shared_ptr<memory>& src,
+ size_t srcOffset = 0,
+ memory::format_tag format = memory::format_tag::any);
+
+ std::shared_ptr<memory> castTensor(const memory::dims& dims,
+ const std::shared_ptr<memory>& src,
+ const memory::dims& srcOffset);
+
+ void zeroTensor(const std::shared_ptr<memory>& dst);
+
+ memory::dims getInputReorderDims(const memory::dims& srcDims, int alignment);
+
+ std::shared_ptr<Node> addInputReorder(const Image& color,
+ const Image& albedo,
+ const Image& normal,
+ const std::shared_ptr<TransferFunction>& transferFunc,
+ int alignment,
+ const std::shared_ptr<memory>& userDst = nullptr);
+
+ std::shared_ptr<Node> addOutputReorder(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<TransferFunction>& transferFunc,
+ const Image& output);
+
+ memory::dims getConvDims(const std::string& name, const memory::dims& srcDims);
+ std::shared_ptr<Node> addConv(const std::string& name,
+ const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst = nullptr,
+ bool relu = true);
+
+ memory::dims getPoolDims(const memory::dims& srcDims);
+ std::shared_ptr<Node> addPool(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst = nullptr);
+
+ memory::dims getUpsampleDims(const memory::dims& srcDims);
+ std::shared_ptr<Node> addUpsample(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst = nullptr);
+
+ memory::dims getConcatDims(const memory::dims& src1Dims, const memory::dims& src2Dims);
+
+ std::shared_ptr<Node> addAutoexposure(const Image& color,
+ const std::shared_ptr<HDRTransferFunction>& transferFunc);
+
+ void finalize();
+
+ private:
+ Ref<Device> device;
+ engine eng;
+ stream sm;
+ std::vector<std::shared_ptr<Node>> nodes;
+ std::map<std::string, Tensor> weightMap;
+
+ // Memory allocation statistics
+ size_t activationAllocBytes = 0; // number of allocated activation bytes
+ size_t totalAllocBytes = 0; // total number of allocated bytes
+ };
+
+} // namespace oidn
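Network<K>::execute() in network.cpp combines the task index with the per-node index into one monotonically increasing fraction that is reported through Progress::func. A standalone sketch of that mapping (the task and node counts below are made up):

#include <cstdio>

int main()
{
    const int taskCount = 3; // e.g. number of tiles the filter processes
    const int nodeCount = 4; // nodes executed per task

    for (int taskIndex = 0; taskIndex < taskCount; ++taskIndex)
        for (int i = 0; i < nodeCount; ++i)
        {
            // Same formula as Network<K>::execute() above.
            const double value =
                (double(taskIndex) + double(i + 1) / double(nodeCount)) / double(taskCount);
            std::printf("task %d, node %d -> %.3f\n", taskIndex, i, value);
        }
    return 0;
}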
diff --git a/thirdparty/oidn/core/node.h b/thirdparty/oidn/core/node.h
new file mode 100644
index 0000000000..b9ffe906df
--- /dev/null
+++ b/thirdparty/oidn/core/node.h
@@ -0,0 +1,142 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+#include <vector>
+
+namespace oidn {
+
+ class Node
+ {
+ public:
+ virtual ~Node() = default;
+
+ virtual void execute(stream& sm) = 0;
+
+ virtual std::shared_ptr<memory> getDst() const { return nullptr; }
+
+ virtual size_t getScratchpadSize() const { return 0; }
+ virtual void setScratchpad(const std::shared_ptr<memory>& mem) {}
+
+ virtual void setTile(int h1, int w1, int h2, int w2, int H, int W)
+ {
+ assert(0); // not supported
+ }
+ };
+
+ // Node wrapping an MKL-DNN primitive
+ class MklNode : public Node
+ {
+ private:
+ primitive prim;
+ std::unordered_map<int, memory> args;
+ std::shared_ptr<memory> scratchpad;
+
+ public:
+ MklNode(const primitive& prim, const std::unordered_map<int, memory>& args)
+ : prim(prim),
+ args(args)
+ {}
+
+ size_t getScratchpadSize() const override
+ {
+ const auto primDesc = prim.get_primitive_desc();
+ const mkldnn_memory_desc_t* scratchpadDesc = mkldnn_primitive_desc_query_md(primDesc, mkldnn_query_scratchpad_md, 0);
+ if (scratchpadDesc == nullptr)
+ return 0;
+ return mkldnn_memory_desc_get_size(scratchpadDesc);
+ }
+
+ void setScratchpad(const std::shared_ptr<memory>& mem) override
+ {
+ scratchpad = mem;
+ args.insert(std::make_pair(MKLDNN_ARG_SCRATCHPAD, *scratchpad));
+ }
+
+ void execute(stream& sm) override
+ {
+ prim.execute(sm, args);
+ }
+ };
+
+ // Convolution node
+ class ConvNode : public MklNode
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> weights;
+ std::shared_ptr<memory> bias;
+ std::shared_ptr<memory> dst;
+
+ public:
+ ConvNode(const convolution_forward::primitive_desc& desc,
+ const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& weights,
+ const std::shared_ptr<memory>& bias,
+ const std::shared_ptr<memory>& dst)
+ : MklNode(convolution_forward(desc),
+ { { MKLDNN_ARG_SRC, *src },
+ { MKLDNN_ARG_WEIGHTS, *weights },
+ { MKLDNN_ARG_BIAS, *bias },
+ { MKLDNN_ARG_DST, *dst } }),
+ src(src), weights(weights), bias(bias), dst(dst)
+ {}
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+ // Pooling node
+ class PoolNode : public MklNode
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> dst;
+
+ public:
+ PoolNode(const pooling_forward::primitive_desc& desc,
+ const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& dst)
+ : MklNode(pooling_forward(desc),
+ { { MKLDNN_ARG_SRC, *src },
+ { MKLDNN_ARG_DST, *dst } }),
+ src(src), dst(dst)
+ {}
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+ // Reorder node
+ class ReorderNode : public MklNode
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> dst;
+
+ public:
+ ReorderNode(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& dst)
+ : MklNode(reorder(reorder::primitive_desc(*src, *dst)),
+ { { MKLDNN_ARG_SRC, *src },
+ { MKLDNN_ARG_DST, *dst } }),
+ src(src), dst(dst)
+ {}
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+} // namespace oidn
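Nodes advertise their temporary-memory needs through getScratchpadSize(), and Network<K>::finalize() allocates one buffer sized for the largest requirement and hands it to every node via setScratchpad(); sharing is safe because the nodes execute sequentially. A minimal sketch of that sizing step (the per-node sizes are invented):

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical per-node scratchpad requirements in bytes.
    const std::vector<size_t> perNode = {0, 4096, size_t(1) << 20, 512};

    size_t scratchpadSize = 0;
    for (size_t s : perNode)
        scratchpadSize = std::max(scratchpadSize, s);

    // One allocation of this size serves all nodes.
    std::printf("shared scratchpad: %zu bytes\n", scratchpadSize);
    return 0;
}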
diff --git a/thirdparty/oidn/core/output_reorder.h b/thirdparty/oidn/core/output_reorder.h
new file mode 100644
index 0000000000..7918d48e15
--- /dev/null
+++ b/thirdparty/oidn/core/output_reorder.h
@@ -0,0 +1,126 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "node.h"
+#include "image.h"
+
+namespace oidn {
+
+ // Output reorder node
+ template<int K, class TransferFunction>
+ class OutputReorderNode : public Node
+ {
+ private:
+ // Source
+ std::shared_ptr<memory> src;
+ const float* srcPtr;
+ int H1;
+ int W1;
+
+ // Destination
+ Image output;
+
+ // Tile
+ int h1Begin;
+ int w1Begin;
+ int h2Begin;
+ int w2Begin;
+ int H;
+ int W;
+
+ std::shared_ptr<TransferFunction> transferFunc;
+
+ public:
+ OutputReorderNode(const std::shared_ptr<memory>& src,
+ const Image& output,
+ const std::shared_ptr<TransferFunction>& transferFunc)
+ : src(src),
+ output(output),
+ h1Begin(0), w1Begin(0),
+ h2Begin(0), w2Begin(0),
+ H(output.height), W(output.width),
+ transferFunc(transferFunc)
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ MAYBE_UNUSED(srcDesc);
+ assert(memory_desc_matches_tag(srcDesc, mkldnn_format_tag_t(BlockedFormat<K>::nChwKc)));
+ assert(srcDesc.ndims == 4);
+ assert(srcDesc.data_type == memory::data_type::f32);
+ assert(srcDesc.dims[0] == 1);
+      // We assume the output has at most K channels, so the padded channel count is exactly K
+ assert(srcDesc.dims[1] == K);
+
+ srcPtr = (float*)src->get_data_handle();
+ H1 = srcDesc.dims[2];
+ W1 = srcDesc.dims[3];
+ }
+
+ void setTile(int h1, int w1, int h2, int w2, int H, int W) override
+ {
+ h1Begin = h1;
+ w1Begin = w1;
+ h2Begin = h2;
+ w2Begin = w2;
+ this->H = H;
+ this->W = W;
+ }
+
+ void execute(stream& sm) override
+ {
+ assert(h1Begin + H <= H1);
+ assert(w1Begin + W <= W1);
+ assert(h2Begin + H <= output.height);
+ assert(w2Begin + W <= output.width);
+
+ const int C1 = K;
+
+ parallel_nd(H, [&](int h)
+ {
+ const int h1 = h + h1Begin;
+ const int h2 = h + h2Begin;
+
+ for (int w = 0; w < W; ++w)
+ {
+ const int w1 = w + w1Begin;
+ const int w2 = w + w2Begin;
+ float* dstPtr_C = (float*)output.get(h2, w2);
+
+          // Source is in nChwKc format; since C1 == K there is a single channel block, so the layout is effectively nhwc
+ const float* srcPtr_C = srcPtr + h1*W1*C1 + w1*C1;
+
+ #pragma unroll
+ for (int i = 0; i < 3; ++i)
+ {
+ // Load the value
+ float x = srcPtr_C[i];
+
+ // The CNN output may contain negative values or even NaNs, so it must be sanitized
+ x = maxSafe(x, 0.f);
+
+ // Apply the inverse transfer function
+ x = transferFunc->inverse(x);
+
+ // Sanitize and store the final value
+ dstPtr_C[i] = max(x, 0.f);
+ }
+ }
+ });
+ }
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/transfer_function.cpp b/thirdparty/oidn/core/transfer_function.cpp
new file mode 100644
index 0000000000..a33e3c84bc
--- /dev/null
+++ b/thirdparty/oidn/core/transfer_function.cpp
@@ -0,0 +1,95 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "transfer_function.h"
+
+namespace oidn {
+
+ const float LogTransferFunction::xScale = 1.f / log(LogTransferFunction::yMax + 1.f);
+ const float PQXTransferFunction::xScale = 1.f / PQXTransferFunction::pqxForward(PQXTransferFunction::yMax * PQXTransferFunction::yScale);
+
+ float AutoexposureNode::autoexposure(const Image& color)
+ {
+ assert(color.format == Format::Float3);
+ return 1.0f;
+
+ /*constexpr float key = 0.18f;
+ constexpr float eps = 1e-8f;
+ constexpr int K = 16; // downsampling amount
+
+ // Downsample the image to minimize sensitivity to noise
+ const int H = color.height; // original height
+ const int W = color.width; // original width
+ const int HK = (H + K/2) / K; // downsampled height
+ const int WK = (W + K/2) / K; // downsampled width
+
+ // Compute the average log luminance of the downsampled image
+ using Sum = std::pair<float, int>;
+
+ Sum sum =
+ tbb::parallel_reduce(
+ tbb::blocked_range2d<int>(0, HK, 0, WK),
+ Sum(0.f, 0),
+ [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum
+ {
+ // Iterate over blocks
+ for (int i = r.rows().begin(); i != r.rows().end(); ++i)
+ {
+ for (int j = r.cols().begin(); j != r.cols().end(); ++j)
+ {
+ // Compute the average luminance in the current block
+ const int beginH = int(ptrdiff_t(i) * H / HK);
+ const int beginW = int(ptrdiff_t(j) * W / WK);
+ const int endH = int(ptrdiff_t(i+1) * H / HK);
+ const int endW = int(ptrdiff_t(j+1) * W / WK);
+
+ float L = 0.f;
+
+ for (int h = beginH; h < endH; ++h)
+ {
+ for (int w = beginW; w < endW; ++w)
+ {
+ const float* rgb = (const float*)color.get(h, w);
+
+ const float r = maxSafe(rgb[0], 0.f);
+ const float g = maxSafe(rgb[1], 0.f);
+ const float b = maxSafe(rgb[2], 0.f);
+
+ L += luminance(r, g, b);
+ }
+ }
+
+ L /= (endH - beginH) * (endW - beginW);
+
+ // Accumulate the log luminance
+ if (L > eps)
+ {
+ sum.first += log2(L);
+ sum.second++;
+ }
+ }
+ }
+
+ return sum;
+ },
+ [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); },
+ tbb::static_partitioner()
+ );
+
+ return (sum.second > 0) ? (key / exp2(sum.first / float(sum.second))) : 1.f;*/
+ }
+
+} // namespace oidn
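The disabled code above documents the intended auto-exposure: downsample the image, average the log2 luminance of the bright-enough blocks, and scale so that the average maps to a mid-grey key of 0.18. A standalone sketch of the final formula, exposure = key / 2^(mean log2 luminance), on invented block luminances:

#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const float key = 0.18f;
    const float eps = 1e-8f;

    // Made-up average luminances of downsampled blocks.
    const std::vector<float> L = {0.02f, 0.5f, 4.f, 30.f};

    float sum = 0.f;
    int count = 0;
    for (float l : L)
        if (l > eps) { sum += std::log2(l); ++count; }

    const float exposure = (count > 0) ? key / std::exp2(sum / count) : 1.f;
    std::printf("exposure = %f\n", exposure);
    return 0;
}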
diff --git a/thirdparty/oidn/core/transfer_function.h b/thirdparty/oidn/core/transfer_function.h
new file mode 100644
index 0000000000..35f2833092
--- /dev/null
+++ b/thirdparty/oidn/core/transfer_function.h
@@ -0,0 +1,201 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "image.h"
+#include "node.h"
+
+namespace oidn {
+
+ __forceinline float luminance(float r, float g, float b)
+ {
+ return 0.212671f * r + 0.715160f * g + 0.072169f * b;
+ }
+
+ // Color transfer function base class
+ class TransferFunction
+ {
+ public:
+ virtual ~TransferFunction() = default;
+
+ virtual float forward(float y) const = 0;
+ virtual float inverse(float x) const = 0;
+ };
+
+ // HDR transfer function base class
+ class HDRTransferFunction : public TransferFunction
+ {
+ protected:
+ static constexpr float yMax = 65504.f;
+
+ float exposure;
+ float rcpExposure;
+
+ public:
+ HDRTransferFunction(float exposure = 1.f)
+ {
+ setExposure(exposure);
+ }
+
+ void setExposure(float exposure)
+ {
+ this->exposure = exposure;
+ this->rcpExposure = (exposure != 0.f) ? (1.f / exposure) : 0.f;
+ }
+ };
+
+ // Linear transfer function (LDR)
+ class LinearTransferFunction : public TransferFunction
+ {
+ public:
+ __forceinline float forward(float y) const override
+ {
+ return min(y, 1.f);
+ }
+
+ __forceinline float inverse(float x) const override
+ {
+ return min(x, 1.f);
+ }
+ };
+
+ // 2.2 gamma transfer function (LDR)
+ class GammaTransferFunction : public TransferFunction
+ {
+ public:
+ __forceinline float forward(float y) const override
+ {
+ return min(pow(y, 1.f/2.2f), 1.f);
+ }
+
+ __forceinline float inverse(float x) const override
+ {
+ return min(pow(x, 2.2f), 1.f);
+ }
+ };
+
+ // Logarithmic transfer function (HDR)
+ // Compresses [0..65504] to [0..1]
+ class LogTransferFunction : public HDRTransferFunction
+ {
+ private:
+ static const float xScale;
+
+ public:
+ LogTransferFunction(float exposure = 1.f)
+ : HDRTransferFunction(exposure)
+ {
+ }
+
+ __forceinline float forward(float y) const override
+ {
+ return log(y * exposure + 1.f) * xScale;
+ }
+
+ __forceinline float inverse(float x) const override
+ {
+ return (exp(x * (1.f/xScale)) - 1.f) * rcpExposure;
+ }
+ };
+
+ // PQX transfer function (HDR)
+ // Compresses [0..65504] to [0..1]
+ class PQXTransferFunction : public HDRTransferFunction
+ {
+ private:
+ static constexpr float m1 = 2610.f / 4096.f / 4.f;
+ static constexpr float m2 = 2523.f / 4096.f * 128.f;
+ static constexpr float c1 = 3424.f / 4096.f;
+ static constexpr float c2 = 2413.f / 4096.f * 32.f;
+ static constexpr float c3 = 2392.f / 4096.f * 32.f;
+ static constexpr float a = 3711.f / 4096.f / 8.f;
+
+ static constexpr float yScale = 100.f / 10000.f;
+ static const float xScale;
+
+ public:
+ PQXTransferFunction(float exposure = 1.f)
+ : HDRTransferFunction(exposure)
+ {
+ }
+
+ __forceinline float forward(float y) const override
+ {
+ return pqxForward(y * exposure * yScale) * xScale;
+ }
+
+ __forceinline float inverse(float x) const override
+ {
+ return pqxInverse(x * (1.f/xScale)) * (1.f/yScale) * rcpExposure;
+ }
+
+ private:
+ static __forceinline float pqForward(float y)
+ {
+ const float yp = pow(y, m1);
+ return pow((c1 + c2 * yp) * rcp(1.f + c3 * yp), m2);
+ }
+
+ static __forceinline float pqxForward(float y)
+ {
+ if (y <= 1.f)
+ return pqForward(y);
+ else
+ return a * log(y) + 1.f;
+ }
+
+ static __forceinline float pqInverse(float x)
+ {
+ const float xp = pow(x, 1.f/m2);
+ return pow(max((xp - c1) * rcp(c2 - c3 * xp), 0.f), 1.f/m1);
+ }
+
+ static __forceinline float pqxInverse(float x)
+ {
+ if (x <= 1.f)
+ return pqInverse(x);
+ else
+ return exp((x - 1.f) * (1.f/a));
+ }
+ };
+
+ // Autoexposure node
+ class AutoexposureNode : public Node
+ {
+ private:
+ Image color;
+ std::shared_ptr<HDRTransferFunction> transferFunc;
+
+ public:
+ AutoexposureNode(const Image& color,
+ const std::shared_ptr<HDRTransferFunction>& transferFunc)
+ : color(color),
+ transferFunc(transferFunc)
+ {}
+
+ void execute(stream& sm) override
+ {
+ const float exposure = autoexposure(color);
+ //printf("exposure = %f\n", exposure);
+ transferFunc->setExposure(exposure);
+ }
+
+ private:
+ static float autoexposure(const Image& color);
+ };
+
+} // namespace oidn
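LogTransferFunction above compresses [0..65504] into [0..1] with a scaled log and undoes it with the matching exp. A standalone round-trip check of those two formulas with the exposure fixed to 1 (the sample values are arbitrary):

#include <cassert>
#include <cmath>

int main()
{
    const float yMax   = 65504.f;                   // largest half-precision value, as above
    const float xScale = 1.f / std::log(yMax + 1.f);

    auto forward = [&](float y) { return std::log(y + 1.f) * xScale; };
    auto inverse = [&](float x) { return std::exp(x / xScale) - 1.f; };

    for (float y : {0.f, 0.5f, 10.f, 1000.f, yMax})
    {
        const float x = forward(y);
        assert(x >= 0.f && x <= 1.0001f);                        // compressed to (roughly) [0, 1]
        assert(std::fabs(inverse(x) - y) <= 1e-3f * (y + 1.f));  // round-trips within float error
    }
    return 0;
}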
diff --git a/thirdparty/oidn/core/upsample.h b/thirdparty/oidn/core/upsample.h
new file mode 100644
index 0000000000..f6cace44cd
--- /dev/null
+++ b/thirdparty/oidn/core/upsample.h
@@ -0,0 +1,92 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "node.h"
+
+namespace oidn {
+
+ // 2x2 nearest-neighbor upsampling node
+ template<int K>
+ class UpsampleNode : public Node
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> dst;
+
+ public:
+ UpsampleNode(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& dst)
+ : src(src),
+ dst(dst)
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ const mkldnn_memory_desc_t& dstDesc = dst->get_desc().data;
+ MAYBE_UNUSED(srcDesc);
+ MAYBE_UNUSED(dstDesc);
+ assert(memory_desc_matches_tag(srcDesc, mkldnn_format_tag_t(BlockedFormat<K>::nChwKc)));
+ assert(memory_desc_matches_tag(dstDesc, mkldnn_format_tag_t(BlockedFormat<K>::nChwKc)));
+ assert(srcDesc.ndims == 4);
+ assert(dstDesc.ndims == 4);
+ assert(srcDesc.data_type == memory::data_type::f32);
+ assert(dstDesc.data_type == memory::data_type::f32);
+ assert(srcDesc.dims[0] == 1);
+ assert(dstDesc.dims[0] == 1);
+ // 2x2 upsampling
+ assert(dstDesc.dims[2] == srcDesc.dims[2] * 2);
+ assert(dstDesc.dims[3] == srcDesc.dims[3] * 2);
+ }
+
+ void execute(stream& sm) override
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+
+ const float* srcPtr = (float*)src->get_data_handle();
+ float* dstPtr = (float*)dst->get_data_handle();
+
+ const int C = srcDesc.dims[1];
+ const int H = srcDesc.dims[2];
+ const int W = srcDesc.dims[3];
+ const int CK = C / K;
+
+ parallel_nd(CK, H, [&](int ck, int h)
+ {
+ const size_t offset = ck*H*W*K + h*W*K;
+ const float* srcPtr_line = srcPtr + offset;
+ float* dstPtr_line0 = dstPtr + offset * 4;
+ float* dstPtr_line1 = dstPtr_line0 + W*2*K; // next line
+
+ for (int w = 0; w < W; ++w)
+ {
+ #pragma unroll
+ for (int k = 0; k < K; k += 4)
+ {
+ const __m128 m = _mm_load_ps(&srcPtr_line[w*K + k]);
+
+ _mm_stream_ps(&dstPtr_line0[w*2*K + k], m);
+ _mm_stream_ps(&dstPtr_line0[w*2*K+K + k], m);
+ _mm_stream_ps(&dstPtr_line1[w*2*K + k], m);
+ _mm_stream_ps(&dstPtr_line1[w*2*K+K + k], m);
+ }
+ }
+ });
+ }
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+} // namespace oidn
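UpsampleNode above replicates each source pixel into a 2x2 block of the destination, streaming K channels at a time with SSE. A scalar standalone sketch of the same mapping on one K-channel block in the blocked layout (the sizes are toy values):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const int K = 8, H = 2, W = 3;
    std::vector<float> src(size_t(H) * W * K), dst(size_t(H) * 2 * W * 2 * K);

    for (size_t i = 0; i < src.size(); ++i)
        src[i] = float(i); // arbitrary test pattern

    for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
            for (int k = 0; k < K; ++k)
            {
                const float v = src[(size_t(h) * W + w) * K + k];
                // Each source pixel feeds a 2x2 block of destination pixels.
                for (int dy = 0; dy < 2; ++dy)
                    for (int dx = 0; dx < 2; ++dx)
                        dst[((size_t(h) * 2 + dy) * (W * 2) + (w * 2 + dx)) * K + k] = v;
            }

    // Spot-check: destination pixel (3, 5) comes from source pixel (1, 2).
    assert(dst[(size_t(3) * (W * 2) + 5) * K] == src[(size_t(1) * W + 2) * K]);
    return 0;
}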
diff --git a/thirdparty/oidn/core/weights_reorder.h b/thirdparty/oidn/core/weights_reorder.h
new file mode 100644
index 0000000000..6c5dacb8aa
--- /dev/null
+++ b/thirdparty/oidn/core/weights_reorder.h
@@ -0,0 +1,99 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "node.h"
+
+namespace oidn {
+
+ // Reorders weights from oihw to padded oihw format
+ template<int K>
+ class WeightsReorderNode : public Node
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> dst;
+
+ public:
+ WeightsReorderNode(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& dst)
+ : src(src),
+ dst(dst)
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ const mkldnn_memory_desc_t& dstDesc = dst->get_desc().data;
+ MAYBE_UNUSED(srcDesc);
+ MAYBE_UNUSED(dstDesc);
+ assert(memory_desc_matches_tag(srcDesc, mkldnn_format_tag_t(memory::format_tag::oihw)));
+ assert(memory_desc_matches_tag(dstDesc, mkldnn_format_tag_t(memory::format_tag::oihw)));
+ assert(srcDesc.ndims == 4);
+ assert(dstDesc.ndims == 4);
+ assert(srcDesc.data_type == memory::data_type::f32);
+ assert(dstDesc.data_type == memory::data_type::f32);
+ assert(getPadded<K>(srcDesc.dims[0]) == dstDesc.dims[0]); // OC
+ assert(getPadded<K>(srcDesc.dims[1]) == dstDesc.dims[1]); // IC
+ assert(srcDesc.dims[2] == dstDesc.dims[2]);
+ assert(srcDesc.dims[3] == dstDesc.dims[3]);
+ }
+
+ void execute(stream& sm) override
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ const mkldnn_memory_desc_t& dstDesc = dst->get_desc().data;
+
+ const float* srcPtr = (float*)src->get_data_handle();
+ float* dstPtr = (float*)dst->get_data_handle();
+
+ const int OC1 = srcDesc.dims[0];
+ const int OC2 = dstDesc.dims[0];
+ const int IC1 = srcDesc.dims[1];
+ const int IC2 = dstDesc.dims[1];
+ const int H = dstDesc.dims[2];
+ const int W = dstDesc.dims[3];
+
+ for (int oc = 0; oc < OC2; ++oc)
+ {
+ for (int ic = 0; ic < IC2; ++ic)
+ {
+ for (int h = 0; h < H; ++h)
+ {
+ for (int w = 0; w < W; ++w)
+ {
+ // Output is in oihw format
+ float* dstPtr_c = dstPtr + oc*IC2*H*W + ic*H*W + h*W + w;
+
+ if (oc < OC1 && ic < IC1)
+ {
+ // Input is in oihw format
+ const float* srcPtr_c = srcPtr + oc*IC1*H*W + ic*H*W + h*W + w;
+ *dstPtr_c = *srcPtr_c;
+ }
+ else
+ {
+ // padding
+ *dstPtr_c = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+} // namespace oidn