path: root/thirdparty/oidn/core
Diffstat (limited to 'thirdparty/oidn/core')
-rw-r--r--  thirdparty/oidn/core/api.cpp                 408
-rw-r--r--  thirdparty/oidn/core/autoencoder.cpp         519
-rw-r--r--  thirdparty/oidn/core/autoencoder.h           116
-rw-r--r--  thirdparty/oidn/core/buffer.h                 75
-rw-r--r--  thirdparty/oidn/core/common.h                133
-rw-r--r--  thirdparty/oidn/core/device.cpp              205
-rw-r--r--  thirdparty/oidn/core/device.h                 78
-rw-r--r--  thirdparty/oidn/core/filter.cpp               27
-rw-r--r--  thirdparty/oidn/core/filter.h                 52
-rw-r--r--  thirdparty/oidn/core/image.h                 111
-rw-r--r--  thirdparty/oidn/core/input_reorder.h         232
-rw-r--r--  thirdparty/oidn/core/math.h                   78
-rw-r--r--  thirdparty/oidn/core/network.cpp             434
-rw-r--r--  thirdparty/oidn/core/network.h               112
-rw-r--r--  thirdparty/oidn/core/node.h                  142
-rw-r--r--  thirdparty/oidn/core/output_reorder.h        126
-rw-r--r--  thirdparty/oidn/core/transfer_function.cpp    95
-rw-r--r--  thirdparty/oidn/core/transfer_function.h     201
-rw-r--r--  thirdparty/oidn/core/upsample.h               92
-rw-r--r--  thirdparty/oidn/core/weights_reorder.h        99
20 files changed, 3335 insertions, 0 deletions
diff --git a/thirdparty/oidn/core/api.cpp b/thirdparty/oidn/core/api.cpp
new file mode 100644
index 0000000000..7353fe4e25
--- /dev/null
+++ b/thirdparty/oidn/core/api.cpp
@@ -0,0 +1,408 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#ifdef _WIN32
+# define OIDN_API extern "C" __declspec(dllexport)
+#else
+# define OIDN_API extern "C" __attribute__ ((visibility ("default")))
+#endif
+
+// Locks the device that owns the specified object
+// Use *only* inside OIDN_TRY/CATCH!
+#define OIDN_LOCK(obj) \
+ std::lock_guard<std::mutex> lock(obj->getDevice()->getMutex());
+
+// Try/catch for converting exceptions to errors
+#define OIDN_TRY \
+ try {
+
+#define OIDN_CATCH(obj) \
+ } catch (Exception& e) { \
+ Device::setError(obj ? obj->getDevice() : nullptr, e.code(), e.what()); \
+ } catch (std::bad_alloc&) { \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::OutOfMemory, "out of memory"); \
+ } catch (mkldnn::error& e) { \
+ if (e.status == mkldnn_out_of_memory) \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::OutOfMemory, "out of memory"); \
+ else \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::Unknown, e.message); \
+ } catch (std::exception& e) { \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::Unknown, e.what()); \
+ } catch (...) { \
+ Device::setError(obj ? obj->getDevice() : nullptr, Error::Unknown, "unknown exception caught"); \
+ }
+
+#include "device.h"
+#include "filter.h"
+#include <mutex>
+
+namespace oidn {
+
+ namespace
+ {
+ __forceinline void checkHandle(void* handle)
+ {
+ if (handle == nullptr)
+ throw Exception(Error::InvalidArgument, "invalid handle");
+ }
+
+ template<typename T>
+ __forceinline void retainObject(T* obj)
+ {
+ if (obj)
+ {
+ obj->incRef();
+ }
+ else
+ {
+ OIDN_TRY
+ checkHandle(obj);
+ OIDN_CATCH(obj)
+ }
+ }
+
+ template<typename T>
+ __forceinline void releaseObject(T* obj)
+ {
+ if (obj == nullptr || obj->decRefKeep() == 0)
+ {
+ OIDN_TRY
+ checkHandle(obj);
+ OIDN_LOCK(obj);
+ obj->destroy();
+ OIDN_CATCH(obj)
+ }
+ }
+
+ template<>
+ __forceinline void releaseObject(Device* obj)
+ {
+ if (obj == nullptr || obj->decRefKeep() == 0)
+ {
+ OIDN_TRY
+ checkHandle(obj);
+ // Do NOT lock the device because it owns the mutex
+ obj->destroy();
+ OIDN_CATCH(obj)
+ }
+ }
+ }
+
+ OIDN_API OIDNDevice oidnNewDevice(OIDNDeviceType type)
+ {
+ Ref<Device> device = nullptr;
+ OIDN_TRY
+ if (type == OIDN_DEVICE_TYPE_CPU || type == OIDN_DEVICE_TYPE_DEFAULT)
+ device = makeRef<Device>();
+ else
+ throw Exception(Error::InvalidArgument, "invalid device type");
+ OIDN_CATCH(device)
+ return (OIDNDevice)device.detach();
+ }
+
+ OIDN_API void oidnRetainDevice(OIDNDevice hDevice)
+ {
+ Device* device = (Device*)hDevice;
+ retainObject(device);
+ }
+
+ OIDN_API void oidnReleaseDevice(OIDNDevice hDevice)
+ {
+ Device* device = (Device*)hDevice;
+ releaseObject(device);
+ }
+
+ OIDN_API void oidnSetDevice1b(OIDNDevice hDevice, const char* name, bool value)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ device->set1i(name, value);
+ OIDN_CATCH(device)
+ }
+
+ OIDN_API void oidnSetDevice1i(OIDNDevice hDevice, const char* name, int value)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ device->set1i(name, value);
+ OIDN_CATCH(device)
+ }
+
+ OIDN_API bool oidnGetDevice1b(OIDNDevice hDevice, const char* name)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ return device->get1i(name);
+ OIDN_CATCH(device)
+ return false;
+ }
+
+ OIDN_API int oidnGetDevice1i(OIDNDevice hDevice, const char* name)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ return device->get1i(name);
+ OIDN_CATCH(device)
+ return 0;
+ }
+
+ OIDN_API void oidnSetDeviceErrorFunction(OIDNDevice hDevice, OIDNErrorFunction func, void* userPtr)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ device->setErrorFunction((ErrorFunction)func, userPtr);
+ OIDN_CATCH(device)
+ }
+
+ OIDN_API OIDNError oidnGetDeviceError(OIDNDevice hDevice, const char** outMessage)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ return (OIDNError)Device::getError(device, outMessage);
+ OIDN_CATCH(device)
+ if (outMessage) *outMessage = "";
+ return OIDN_ERROR_UNKNOWN;
+ }
+
+ OIDN_API void oidnCommitDevice(OIDNDevice hDevice)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ device->commit();
+ OIDN_CATCH(device)
+ }
+
+ OIDN_API OIDNBuffer oidnNewBuffer(OIDNDevice hDevice, size_t byteSize)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ Ref<Buffer> buffer = device->newBuffer(byteSize);
+ return (OIDNBuffer)buffer.detach();
+ OIDN_CATCH(device)
+ return nullptr;
+ }
+
+ OIDN_API OIDNBuffer oidnNewSharedBuffer(OIDNDevice hDevice, void* ptr, size_t byteSize)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ Ref<Buffer> buffer = device->newBuffer(ptr, byteSize);
+ return (OIDNBuffer)buffer.detach();
+ OIDN_CATCH(device)
+ return nullptr;
+ }
+
+ OIDN_API void oidnRetainBuffer(OIDNBuffer hBuffer)
+ {
+ Buffer* buffer = (Buffer*)hBuffer;
+ retainObject(buffer);
+ }
+
+ OIDN_API void oidnReleaseBuffer(OIDNBuffer hBuffer)
+ {
+ Buffer* buffer = (Buffer*)hBuffer;
+ releaseObject(buffer);
+ }
+
+ OIDN_API void* oidnMapBuffer(OIDNBuffer hBuffer, OIDNAccess access, size_t byteOffset, size_t byteSize)
+ {
+ Buffer* buffer = (Buffer*)hBuffer;
+ OIDN_TRY
+ checkHandle(hBuffer);
+ OIDN_LOCK(buffer);
+ return buffer->map(byteOffset, byteSize);
+ OIDN_CATCH(buffer)
+ return nullptr;
+ }
+
+ OIDN_API void oidnUnmapBuffer(OIDNBuffer hBuffer, void* mappedPtr)
+ {
+ Buffer* buffer = (Buffer*)hBuffer;
+ OIDN_TRY
+ checkHandle(hBuffer);
+ OIDN_LOCK(buffer);
+ return buffer->unmap(mappedPtr);
+ OIDN_CATCH(buffer)
+ }
+
+ OIDN_API OIDNFilter oidnNewFilter(OIDNDevice hDevice, const char* type)
+ {
+ Device* device = (Device*)hDevice;
+ OIDN_TRY
+ checkHandle(hDevice);
+ OIDN_LOCK(device);
+ Ref<Filter> filter = device->newFilter(type);
+ return (OIDNFilter)filter.detach();
+ OIDN_CATCH(device)
+ return nullptr;
+ }
+
+ OIDN_API void oidnRetainFilter(OIDNFilter hFilter)
+ {
+ Filter* filter = (Filter*)hFilter;
+ retainObject(filter);
+ }
+
+ OIDN_API void oidnReleaseFilter(OIDNFilter hFilter)
+ {
+ Filter* filter = (Filter*)hFilter;
+ releaseObject(filter);
+ }
+
+ OIDN_API void oidnSetFilterImage(OIDNFilter hFilter, const char* name,
+ OIDNBuffer hBuffer, OIDNFormat format,
+ size_t width, size_t height,
+ size_t byteOffset,
+ size_t bytePixelStride, size_t byteRowStride)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ checkHandle(hBuffer);
+ OIDN_LOCK(filter);
+ Ref<Buffer> buffer = (Buffer*)hBuffer;
+ if (buffer->getDevice() != filter->getDevice())
+ throw Exception(Error::InvalidArgument, "the specified objects are bound to different devices");
+ Image data(buffer, (Format)format, (int)width, (int)height, byteOffset, bytePixelStride, byteRowStride);
+ filter->setImage(name, data);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API void oidnSetSharedFilterImage(OIDNFilter hFilter, const char* name,
+ void* ptr, OIDNFormat format,
+ size_t width, size_t height,
+ size_t byteOffset,
+ size_t bytePixelStride, size_t byteRowStride)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ Image data(ptr, (Format)format, (int)width, (int)height, byteOffset, bytePixelStride, byteRowStride);
+ filter->setImage(name, data);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API void oidnSetFilter1b(OIDNFilter hFilter, const char* name, bool value)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->set1i(name, int(value));
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API bool oidnGetFilter1b(OIDNFilter hFilter, const char* name)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ return filter->get1i(name);
+ OIDN_CATCH(filter)
+ return false;
+ }
+
+ OIDN_API void oidnSetFilter1i(OIDNFilter hFilter, const char* name, int value)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->set1i(name, value);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API int oidnGetFilter1i(OIDNFilter hFilter, const char* name)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ return filter->get1i(name);
+ OIDN_CATCH(filter)
+ return 0;
+ }
+
+ OIDN_API void oidnSetFilter1f(OIDNFilter hFilter, const char* name, float value)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->set1f(name, value);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API float oidnGetFilter1f(OIDNFilter hFilter, const char* name)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ return filter->get1f(name);
+ OIDN_CATCH(filter)
+ return 0;
+ }
+
+ OIDN_API void oidnSetFilterProgressMonitorFunction(OIDNFilter hFilter, OIDNProgressMonitorFunction func, void* userPtr)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->setProgressMonitorFunction(func, userPtr);
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API void oidnCommitFilter(OIDNFilter hFilter)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->commit();
+ OIDN_CATCH(filter)
+ }
+
+ OIDN_API void oidnExecuteFilter(OIDNFilter hFilter)
+ {
+ Filter* filter = (Filter*)hFilter;
+ OIDN_TRY
+ checkHandle(hFilter);
+ OIDN_LOCK(filter);
+ filter->execute();
+ OIDN_CATCH(filter)
+ }
+
+} // namespace oidn
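
For orientation, a minimal sketch of how the C API above is typically driven from application code. It assumes the public OpenImageDenoise/oidn.h header (not part of this directory) declares these entry points and the OIDN_* constants; only calls that appear in api.cpp are used.

#include <OpenImageDenoise/oidn.h> // assumed public header declaring the API above
#include <cstdio>

// Denoise a tightly packed float3 image (sketch; error handling kept minimal).
void denoise(float* color, float* output, int width, int height)
{
  // Create and commit a CPU device (the only type oidnNewDevice accepts here).
  OIDNDevice device = oidnNewDevice(OIDN_DEVICE_TYPE_DEFAULT);
  oidnCommitDevice(device);

  // Create the lightmap filter; it is the only filter type Godot keeps (see device.cpp below).
  OIDNFilter filter = oidnNewFilter(device, "RTLightmap");

  // Bind shared, user-owned images; zero byte offset/strides mean tightly packed float3 pixels.
  oidnSetSharedFilterImage(filter, "color",  color,  OIDN_FORMAT_FLOAT3, width, height, 0, 0, 0);
  oidnSetSharedFilterImage(filter, "output", output, OIDN_FORMAT_FLOAT3, width, height, 0, 0, 0);

  // Commit the parameter changes, then run the filter.
  oidnCommitFilter(filter);
  oidnExecuteFilter(filter);

  // Exceptions caught by OIDN_TRY/OIDN_CATCH surface here as device errors.
  const char* message;
  if (oidnGetDeviceError(device, &message) != OIDN_ERROR_NONE)
    std::printf("OIDN error: %s\n", message);

  oidnReleaseFilter(filter);
  oidnReleaseDevice(device);
}
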
diff --git a/thirdparty/oidn/core/autoencoder.cpp b/thirdparty/oidn/core/autoencoder.cpp
new file mode 100644
index 0000000000..8ae2421fa6
--- /dev/null
+++ b/thirdparty/oidn/core/autoencoder.cpp
@@ -0,0 +1,519 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "autoencoder.h"
+
+namespace oidn {
+
+ // --------------------------------------------------------------------------
+ // AutoencoderFilter
+ // --------------------------------------------------------------------------
+
+ AutoencoderFilter::AutoencoderFilter(const Ref<Device>& device)
+ : Filter(device)
+ {
+ }
+
+ void AutoencoderFilter::setImage(const std::string& name, const Image& data)
+ {
+ if (name == "color")
+ color = data;
+ else if (name == "albedo")
+ albedo = data;
+ else if (name == "normal")
+ normal = data;
+ else if (name == "output")
+ output = data;
+
+ dirty = true;
+ }
+
+ void AutoencoderFilter::set1i(const std::string& name, int value)
+ {
+ if (name == "hdr")
+ hdr = value;
+ else if (name == "srgb")
+ srgb = value;
+ else if (name == "maxMemoryMB")
+ maxMemoryMB = value;
+
+ dirty = true;
+ }
+
+ int AutoencoderFilter::get1i(const std::string& name)
+ {
+ if (name == "hdr")
+ return hdr;
+ else if (name == "srgb")
+ return srgb;
+ else if (name == "maxMemoryMB")
+ return maxMemoryMB;
+ else if (name == "alignment")
+ return alignment;
+ else if (name == "overlap")
+ return overlap;
+ else
+ throw Exception(Error::InvalidArgument, "invalid parameter");
+ }
+
+ void AutoencoderFilter::set1f(const std::string& name, float value)
+ {
+ if (name == "hdrScale")
+ hdrScale = value;
+
+ dirty = true;
+ }
+
+ float AutoencoderFilter::get1f(const std::string& name)
+ {
+ if (name == "hdrScale")
+ return hdrScale;
+ else
+ throw Exception(Error::InvalidArgument, "invalid parameter");
+ }
+
+ void AutoencoderFilter::commit()
+ {
+ if (!dirty)
+ return;
+
+ {
+ if (mayiuse(avx512_common))
+ net = buildNet<16>();
+ else
+ net = buildNet<8>();
+ }
+
+ dirty = false;
+ }
+
+ void AutoencoderFilter::execute()
+ {
+ if (dirty)
+ throw Exception(Error::InvalidOperation, "changes to the filter are not committed");
+
+ if (!net)
+ return;
+
+ {
+ Progress progress;
+ progress.func = progressFunc;
+ progress.userPtr = progressUserPtr;
+ progress.taskCount = tileCountH * tileCountW;
+
+ // Iterate over the tiles
+ int tileIndex = 0;
+
+ for (int i = 0; i < tileCountH; ++i)
+ {
+ const int h = i * (tileH - 2*overlap); // input tile position (including overlap)
+ const int overlapBeginH = i > 0 ? overlap : 0; // overlap on the top
+ const int overlapEndH = i < tileCountH-1 ? overlap : 0; // overlap on the bottom
+ const int tileH1 = min(H - h, tileH); // input tile size (including overlap)
+ const int tileH2 = tileH1 - overlapBeginH - overlapEndH; // output tile size
+ const int alignOffsetH = tileH - roundUp(tileH1, alignment); // align to the bottom in the tile buffer
+
+ for (int j = 0; j < tileCountW; ++j)
+ {
+ const int w = j * (tileW - 2*overlap); // input tile position (including overlap)
+ const int overlapBeginW = j > 0 ? overlap : 0; // overlap on the left
+ const int overlapEndW = j < tileCountW-1 ? overlap : 0; // overlap on the right
+ const int tileW1 = min(W - w, tileW); // input tile size (including overlap)
+ const int tileW2 = tileW1 - overlapBeginW - overlapEndW; // output tile size
+ const int alignOffsetW = tileW - roundUp(tileW1, alignment); // align to the right in the tile buffer
+
+ // Set the input tile
+ inputReorder->setTile(h, w,
+ alignOffsetH, alignOffsetW,
+ tileH1, tileW1);
+
+ // Set the output tile
+ outputReorder->setTile(alignOffsetH + overlapBeginH, alignOffsetW + overlapBeginW,
+ h + overlapBeginH, w + overlapBeginW,
+ tileH2, tileW2);
+
+ //printf("Tile: %d %d -> %d %d\n", w+overlapBeginW, h+overlapBeginH, w+overlapBeginW+tileW2, h+overlapBeginH+tileH2);
+
+ // Denoise the tile
+ net->execute(progress, tileIndex);
+
+ // Next tile
+ tileIndex++;
+ }
+ }
+ }
+ }
+
+ void AutoencoderFilter::computeTileSize()
+ {
+ const int minTileSize = 3*overlap;
+ const int estimatedBytesPerPixel = mayiuse(avx512_common) ? estimatedBytesPerPixel16 : estimatedBytesPerPixel8;
+ const int64_t maxTilePixels = (int64_t(maxMemoryMB)*1024*1024 - estimatedBytesBase) / estimatedBytesPerPixel;
+
+ tileCountH = 1;
+ tileCountW = 1;
+ tileH = roundUp(H, alignment);
+ tileW = roundUp(W, alignment);
+
+ // Divide the image into tiles until the tile size gets below the threshold
+ while (int64_t(tileH) * tileW > maxTilePixels)
+ {
+ if (tileH > minTileSize && tileH > tileW)
+ {
+ tileCountH++;
+ tileH = max(roundUp(ceilDiv(H - 2*overlap, tileCountH), alignment) + 2*overlap, minTileSize);
+ }
+ else if (tileW > minTileSize)
+ {
+ tileCountW++;
+ tileW = max(roundUp(ceilDiv(W - 2*overlap, tileCountW), alignment) + 2*overlap, minTileSize);
+ }
+ else
+ break;
+ }
+
+ // Compute the final number of tiles
+ tileCountH = (H > tileH) ? ceilDiv(H - 2*overlap, tileH - 2*overlap) : 1;
+ tileCountW = (W > tileW) ? ceilDiv(W - 2*overlap, tileW - 2*overlap) : 1;
+
+ if (device->isVerbose(2))
+ {
+ std::cout << "Tile size : " << tileW << "x" << tileH << std::endl;
+ std::cout << "Tile count: " << tileCountW << "x" << tileCountH << std::endl;
+ }
+ }
+
+ template<int K>
+ std::shared_ptr<Executable> AutoencoderFilter::buildNet()
+ {
+ H = color.height;
+ W = color.width;
+
+ // Configure the network
+ int inputC;
+ void* weightPtr;
+
+ if (srgb && hdr)
+ throw Exception(Error::InvalidOperation, "srgb and hdr modes cannot be enabled at the same time");
+
+ if (color && !albedo && !normal && weightData.hdr)
+ {
+ inputC = 3;
+ weightPtr = hdr ? weightData.hdr : weightData.ldr;
+ }
+ else if (color && albedo && !normal && weightData.hdr_alb)
+ {
+ inputC = 6;
+ weightPtr = hdr ? weightData.hdr_alb : weightData.ldr_alb;
+ }
+ else if (color && albedo && normal && weightData.hdr_alb_nrm)
+ {
+ inputC = 9;
+ weightPtr = hdr ? weightData.hdr_alb_nrm : weightData.ldr_alb_nrm;
+ }
+ else
+ {
+ throw Exception(Error::InvalidOperation, "unsupported combination of input features");
+ }
+
+ if (!output)
+ throw Exception(Error::InvalidOperation, "output image not specified");
+
+ if ((color.format != Format::Float3)
+ || (albedo && albedo.format != Format::Float3)
+ || (normal && normal.format != Format::Float3)
+ || (output.format != Format::Float3))
+ throw Exception(Error::InvalidOperation, "unsupported image format");
+
+ if ((albedo && (albedo.width != W || albedo.height != H))
+ || (normal && (normal.width != W || normal.height != H))
+ || (output.width != W || output.height != H))
+ throw Exception(Error::InvalidOperation, "image size mismatch");
+
+ // Compute the tile size
+ computeTileSize();
+
+ // If the image size is zero, there is nothing else to do
+ if (H <= 0 || W <= 0)
+ return nullptr;
+
+ // Parse the weights
+ const auto weightMap = parseTensors(weightPtr);
+
+ // Create the network
+ std::shared_ptr<Network<K>> net = std::make_shared<Network<K>>(device, weightMap);
+
+ // Compute the tensor sizes
+ const auto inputDims = memory::dims({1, inputC, tileH, tileW});
+ const auto inputReorderDims = net->getInputReorderDims(inputDims, alignment); //-> concat0
+
+ const auto conv1Dims = net->getConvDims("conv1", inputReorderDims); //-> temp0
+ const auto conv1bDims = net->getConvDims("conv1b", conv1Dims); //-> temp1
+ const auto pool1Dims = net->getPoolDims(conv1bDims); //-> concat1
+ const auto conv2Dims = net->getConvDims("conv2", pool1Dims); //-> temp0
+ const auto pool2Dims = net->getPoolDims(conv2Dims); //-> concat2
+ const auto conv3Dims = net->getConvDims("conv3", pool2Dims); //-> temp0
+ const auto pool3Dims = net->getPoolDims(conv3Dims); //-> concat3
+ const auto conv4Dims = net->getConvDims("conv4", pool3Dims); //-> temp0
+ const auto pool4Dims = net->getPoolDims(conv4Dims); //-> concat4
+ const auto conv5Dims = net->getConvDims("conv5", pool4Dims); //-> temp0
+ const auto pool5Dims = net->getPoolDims(conv5Dims); //-> temp1
+ const auto upsample4Dims = net->getUpsampleDims(pool5Dims); //-> concat4
+ const auto concat4Dims = net->getConcatDims(upsample4Dims, pool4Dims);
+ const auto conv6Dims = net->getConvDims("conv6", concat4Dims); //-> temp0
+ const auto conv6bDims = net->getConvDims("conv6b", conv6Dims); //-> temp1
+ const auto upsample3Dims = net->getUpsampleDims(conv6bDims); //-> concat3
+ const auto concat3Dims = net->getConcatDims(upsample3Dims, pool3Dims);
+ const auto conv7Dims = net->getConvDims("conv7", concat3Dims); //-> temp0
+ const auto conv7bDims = net->getConvDims("conv7b", conv7Dims); //-> temp1
+ const auto upsample2Dims = net->getUpsampleDims(conv7bDims); //-> concat2
+ const auto concat2Dims = net->getConcatDims(upsample2Dims, pool2Dims);
+ const auto conv8Dims = net->getConvDims("conv8", concat2Dims); //-> temp0
+ const auto conv8bDims = net->getConvDims("conv8b", conv8Dims); //-> temp1
+ const auto upsample1Dims = net->getUpsampleDims(conv8bDims); //-> concat1
+ const auto concat1Dims = net->getConcatDims(upsample1Dims, pool1Dims);
+ const auto conv9Dims = net->getConvDims("conv9", concat1Dims); //-> temp0
+ const auto conv9bDims = net->getConvDims("conv9b", conv9Dims); //-> temp1
+ const auto upsample0Dims = net->getUpsampleDims(conv9bDims); //-> concat0
+ const auto concat0Dims = net->getConcatDims(upsample0Dims, inputReorderDims);
+ const auto conv10Dims = net->getConvDims("conv10", concat0Dims); //-> temp0
+ const auto conv10bDims = net->getConvDims("conv10b", conv10Dims); //-> temp1
+ const auto conv11Dims = net->getConvDims("conv11", conv10bDims); //-> temp0
+
+ const auto outputDims = memory::dims({1, 3, tileH, tileW});
+
+ // Allocate two temporary ping-pong buffers to decrease memory usage
+ const auto temp0Dims = getMaxTensorDims({
+ conv1Dims,
+ conv2Dims,
+ conv3Dims,
+ conv4Dims,
+ conv5Dims,
+ conv6Dims,
+ conv7Dims,
+ conv8Dims,
+ conv9Dims,
+ conv10Dims,
+ conv11Dims
+ });
+
+ const auto temp1Dims = getMaxTensorDims({
+ conv1bDims,
+ pool5Dims,
+ conv6bDims,
+ conv7bDims,
+ conv8bDims,
+ conv9bDims,
+ conv10bDims,
+ });
+
+ auto temp0 = net->allocTensor(temp0Dims);
+ auto temp1 = net->allocTensor(temp1Dims);
+
+ // Allocate enough memory to hold the concat outputs. Then use the first
+ // half to hold the previous conv output and the second half to hold the
+ // pool/orig image output. This works because everything is C dimension
+ // outermost, padded to K floats, and all the concats are on the C dimension.
+ auto concat0Dst = net->allocTensor(concat0Dims);
+ auto concat1Dst = net->allocTensor(concat1Dims);
+ auto concat2Dst = net->allocTensor(concat2Dims);
+ auto concat3Dst = net->allocTensor(concat3Dims);
+ auto concat4Dst = net->allocTensor(concat4Dims);
+
+ // Transfer function
+ std::shared_ptr<TransferFunction> transferFunc = makeTransferFunc();
+
+ // Autoexposure
+ if (auto tf = std::dynamic_pointer_cast<HDRTransferFunction>(transferFunc))
+ {
+ if (isnan(hdrScale))
+ net->addAutoexposure(color, tf);
+ else
+ tf->setExposure(hdrScale);
+ }
+
+ // Input reorder
+ auto inputReorderDst = net->castTensor(inputReorderDims, concat0Dst, upsample0Dims);
+ inputReorder = net->addInputReorder(color, albedo, normal,
+ transferFunc,
+ alignment, inputReorderDst);
+
+ // conv1
+ auto conv1 = net->addConv("conv1", inputReorder->getDst(), temp0);
+
+ // conv1b
+ auto conv1b = net->addConv("conv1b", conv1->getDst(), temp1);
+
+ // pool1
+ // Adjust pointer for pool1 to eliminate concat1
+ auto pool1Dst = net->castTensor(pool1Dims, concat1Dst, upsample1Dims);
+ auto pool1 = net->addPool(conv1b->getDst(), pool1Dst);
+
+ // conv2
+ auto conv2 = net->addConv("conv2", pool1->getDst(), temp0);
+
+ // pool2
+ // Adjust pointer for pool2 to eliminate concat2
+ auto pool2Dst = net->castTensor(pool2Dims, concat2Dst, upsample2Dims);
+ auto pool2 = net->addPool(conv2->getDst(), pool2Dst);
+
+ // conv3
+ auto conv3 = net->addConv("conv3", pool2->getDst(), temp0);
+
+ // pool3
+ // Adjust pointer for pool3 to eliminate concat3
+ auto pool3Dst = net->castTensor(pool3Dims, concat3Dst, upsample3Dims);
+ auto pool3 = net->addPool(conv3->getDst(), pool3Dst);
+
+ // conv4
+ auto conv4 = net->addConv("conv4", pool3->getDst(), temp0);
+
+ // pool4
+ // Adjust pointer for pool4 to eliminate concat4
+ auto pool4Dst = net->castTensor(pool4Dims, concat4Dst, upsample4Dims);
+ auto pool4 = net->addPool(conv4->getDst(), pool4Dst);
+
+ // conv5
+ auto conv5 = net->addConv("conv5", pool4->getDst(), temp0);
+
+ // pool5
+ auto pool5 = net->addPool(conv5->getDst(), temp1);
+
+ // upsample4
+ auto upsample4Dst = net->castTensor(upsample4Dims, concat4Dst);
+ auto upsample4 = net->addUpsample(pool5->getDst(), upsample4Dst);
+
+ // conv6
+ auto conv6 = net->addConv("conv6", concat4Dst, temp0);
+
+ // conv6b
+ auto conv6b = net->addConv("conv6b", conv6->getDst(), temp1);
+
+ // upsample3
+ auto upsample3Dst = net->castTensor(upsample3Dims, concat3Dst);
+ auto upsample3 = net->addUpsample(conv6b->getDst(), upsample3Dst);
+
+ // conv7
+ auto conv7 = net->addConv("conv7", concat3Dst, temp0);
+
+ // conv7b
+ auto conv7b = net->addConv("conv7b", conv7->getDst(), temp1);
+
+ // upsample2
+ auto upsample2Dst = net->castTensor(upsample2Dims, concat2Dst);
+ auto upsample2 = net->addUpsample(conv7b->getDst(), upsample2Dst);
+
+ // conv8
+ auto conv8 = net->addConv("conv8", concat2Dst, temp0);
+
+ // conv8b
+ auto conv8b = net->addConv("conv8b", conv8->getDst(), temp1);
+
+ // upsample1
+ auto upsample1Dst = net->castTensor(upsample1Dims, concat1Dst);
+ auto upsample1 = net->addUpsample(conv8b->getDst(), upsample1Dst);
+
+ // conv9
+ auto conv9 = net->addConv("conv9", concat1Dst, temp0);
+
+ // conv9b
+ auto conv9b = net->addConv("conv9b", conv9->getDst(), temp1);
+
+ // upsample0
+ auto upsample0Dst = net->castTensor(upsample0Dims, concat0Dst);
+ auto upsample0 = net->addUpsample(conv9b->getDst(), upsample0Dst);
+
+ // conv10
+ auto conv10 = net->addConv("conv10", concat0Dst, temp0);
+
+ // conv10b
+ auto conv10b = net->addConv("conv10b", conv10->getDst(), temp1);
+
+ // conv11
+ auto conv11 = net->addConv("conv11", conv10b->getDst(), temp0, false /* no relu */);
+
+ // Output reorder
+ outputReorder = net->addOutputReorder(conv11->getDst(), transferFunc, output);
+
+ net->finalize();
+ return net;
+ }
+
+ std::shared_ptr<TransferFunction> AutoencoderFilter::makeTransferFunc()
+ {
+ if (hdr)
+ return std::make_shared<PQXTransferFunction>();
+ else if (srgb)
+ return std::make_shared<LinearTransferFunction>();
+ else
+ return std::make_shared<GammaTransferFunction>();
+ }
+
+// Godot doesn't need Raytracing filters. Removing them saves space in the weights files.
+#if 0
+ // --------------------------------------------------------------------------
+ // RTFilter
+ // --------------------------------------------------------------------------
+
+ namespace weights
+ {
+ // LDR
+ extern unsigned char rt_ldr[]; // color
+ extern unsigned char rt_ldr_alb[]; // color, albedo
+ extern unsigned char rt_ldr_alb_nrm[]; // color, albedo, normal
+
+ // HDR
+ extern unsigned char rt_hdr[]; // color
+ extern unsigned char rt_hdr_alb[]; // color, albedo
+ extern unsigned char rt_hdr_alb_nrm[]; // color, albedo, normal
+ }
+
+ RTFilter::RTFilter(const Ref<Device>& device)
+ : AutoencoderFilter(device)
+ {
+ weightData.ldr = weights::rt_ldr;
+ weightData.ldr_alb = weights::rt_ldr_alb;
+ weightData.ldr_alb_nrm = weights::rt_ldr_alb_nrm;
+ weightData.hdr = weights::rt_hdr;
+ weightData.hdr_alb = weights::rt_hdr_alb;
+ weightData.hdr_alb_nrm = weights::rt_hdr_alb_nrm;
+ }
+#endif
+
+ // --------------------------------------------------------------------------
+ // RTLightmapFilter
+ // --------------------------------------------------------------------------
+
+ namespace weights
+ {
+ // HDR
+ extern unsigned char rtlightmap_hdr[]; // color
+ }
+
+ RTLightmapFilter::RTLightmapFilter(const Ref<Device>& device)
+ : AutoencoderFilter(device)
+ {
+ weightData.hdr = weights::rtlightmap_hdr;
+
+ hdr = true;
+ }
+
+ std::shared_ptr<TransferFunction> RTLightmapFilter::makeTransferFunc()
+ {
+ return std::make_shared<LogTransferFunction>();
+ }
+
+} // namespace oidn
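
The tiling in execute() and computeTileSize() is easier to see in isolation: each tile reads up to tileH rows, including 2*overlap rows shared with its neighbours, but only writes its non-overlapping middle. A one-dimensional sketch of that arithmetic, assuming roundUp/ceilDiv helpers equivalent to the ones used above:

#include <algorithm>
#include <cstdio>

// Stand-ins for the helpers used by AutoencoderFilter (assumed equivalents).
static int roundUp(int a, int b) { return ((a + b - 1) / b) * b; }
static int ceilDiv(int a, int b) { return (a + b - 1) / b; }

// Restates the per-tile arithmetic of AutoencoderFilter::execute() in 1D.
// Assumes tileH > 2*overlap, which computeTileSize() guarantees.
void printTiles1D(int H, int tileH, int overlap, int alignment)
{
  const int tileCountH = (H > tileH) ? ceilDiv(H - 2*overlap, tileH - 2*overlap) : 1;

  for (int i = 0; i < tileCountH; ++i)
  {
    const int h            = i * (tileH - 2*overlap);             // input tile start (with overlap)
    const int overlapBegin = (i > 0)            ? overlap : 0;    // overlap on the top
    const int overlapEnd   = (i < tileCountH-1) ? overlap : 0;    // overlap on the bottom
    const int tileH1       = std::min(H - h, tileH);              // input tile size
    const int tileH2       = tileH1 - overlapBegin - overlapEnd;  // output (written) size
    const int alignOffsetH = tileH - roundUp(tileH1, alignment);  // align to tile buffer end

    std::printf("tile %d: reads rows [%d, %d), writes rows [%d, %d), align offset %d\n",
                i, h, h + tileH1, h + overlapBegin, h + overlapBegin + tileH2, alignOffsetH);
  }
}

For example, with H = 1000, tileH = 448 and the overlap = 128 implied by the constants in autoencoder.h below, this produces four tiles whose written rows [0,320), [320,512), [512,704) and [704,1000) cover the image exactly once.
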
diff --git a/thirdparty/oidn/core/autoencoder.h b/thirdparty/oidn/core/autoencoder.h
new file mode 100644
index 0000000000..97432f2bbd
--- /dev/null
+++ b/thirdparty/oidn/core/autoencoder.h
@@ -0,0 +1,116 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "filter.h"
+#include "network.h"
+#include "transfer_function.h"
+
+namespace oidn {
+
+ // --------------------------------------------------------------------------
+ // AutoencoderFilter - Direct-predicting autoencoder
+ // --------------------------------------------------------------------------
+
+ class AutoencoderFilter : public Filter
+ {
+ protected:
+ static constexpr int alignment = 32; // required spatial alignment in pixels (padding may be necessary)
+ static constexpr int receptiveField = 222; // receptive field in pixels
+ static constexpr int overlap = roundUp(receptiveField / 2, alignment); // required spatial overlap between tiles in pixels
+
+ static constexpr int estimatedBytesBase = 16*1024*1024; // estimated base memory usage
+ static constexpr int estimatedBytesPerPixel8 = 889; // estimated memory usage per pixel for K=8
+ static constexpr int estimatedBytesPerPixel16 = 2185; // estimated memory usage per pixel for K=16
+
+ Image color;
+ Image albedo;
+ Image normal;
+ Image output;
+ bool hdr = false;
+ float hdrScale = std::numeric_limits<float>::quiet_NaN();
+ bool srgb = false;
+ int maxMemoryMB = 6000; // approximate maximum memory usage in MBs
+
+ int H = 0; // image height
+ int W = 0; // image width
+ int tileH = 0; // tile height
+ int tileW = 0; // tile width
+ int tileCountH = 1; // number of tiles in H dimension
+ int tileCountW = 1; // number of tiles in W dimension
+
+ std::shared_ptr<Executable> net;
+ std::shared_ptr<Node> inputReorder;
+ std::shared_ptr<Node> outputReorder;
+
+ struct
+ {
+ void* ldr = nullptr;
+ void* ldr_alb = nullptr;
+ void* ldr_alb_nrm = nullptr;
+ void* hdr = nullptr;
+ void* hdr_alb = nullptr;
+ void* hdr_alb_nrm = nullptr;
+ } weightData;
+
+ explicit AutoencoderFilter(const Ref<Device>& device);
+ virtual std::shared_ptr<TransferFunction> makeTransferFunc();
+
+ public:
+ void setImage(const std::string& name, const Image& data) override;
+ void set1i(const std::string& name, int value) override;
+ int get1i(const std::string& name) override;
+ void set1f(const std::string& name, float value) override;
+ float get1f(const std::string& name) override;
+
+ void commit() override;
+ void execute() override;
+
+ private:
+ void computeTileSize();
+
+ template<int K>
+ std::shared_ptr<Executable> buildNet();
+
+ bool isCommitted() const { return bool(net); }
+ };
+
+ // --------------------------------------------------------------------------
+ // RTFilter - Generic ray tracing denoiser
+ // --------------------------------------------------------------------------
+
+// Godot doesn't need Raytracing filters. Removing them saves space in the weights files.
+#if 0
+ class RTFilter : public AutoencoderFilter
+ {
+ public:
+ explicit RTFilter(const Ref<Device>& device);
+ };
+#endif
+
+ // --------------------------------------------------------------------------
+ // RTLightmapFilter - Ray traced lightmap denoiser
+ // --------------------------------------------------------------------------
+
+ class RTLightmapFilter : public AutoencoderFilter
+ {
+ public:
+ explicit RTLightmapFilter(const Ref<Device>& device);
+ std::shared_ptr<TransferFunction> makeTransferFunc() override;
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/buffer.h b/thirdparty/oidn/core/buffer.h
new file mode 100644
index 0000000000..b95109152e
--- /dev/null
+++ b/thirdparty/oidn/core/buffer.h
@@ -0,0 +1,75 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+#include "device.h"
+
+namespace oidn {
+
+ class Device;
+
+ // Buffer which may or may not own its data
+ class Buffer : public RefCount
+ {
+ private:
+ char* ptr;
+ size_t byteSize;
+ bool shared;
+ Ref<Device> device;
+
+ public:
+ __forceinline Buffer(const Ref<Device>& device, size_t size)
+ : ptr((char*)alignedMalloc(size, 64)),
+ byteSize(size),
+ shared(false),
+ device(device) {}
+
+ __forceinline Buffer(const Ref<Device>& device, void* data, size_t size)
+ : ptr((char*)data),
+ byteSize(size),
+ shared(true),
+ device(device)
+ {
+ if (data == nullptr)
+ throw Exception(Error::InvalidArgument, "buffer pointer null");
+ }
+
+ __forceinline ~Buffer()
+ {
+ if (!shared)
+ alignedFree(ptr);
+ }
+
+ __forceinline char* data() { return ptr; }
+ __forceinline const char* data() const { return ptr; }
+ __forceinline size_t size() const { return byteSize; }
+
+ void* map(size_t offset, size_t size)
+ {
+ if (offset + size > byteSize)
+ throw Exception(Error::InvalidArgument, "buffer region out of range");
+
+ return ptr + offset;
+ }
+
+ void unmap(void* mappedPtr) {}
+
+ Device* getDevice() { return device.get(); }
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/common.h b/thirdparty/oidn/core/common.h
new file mode 100644
index 0000000000..6c87f377bc
--- /dev/null
+++ b/thirdparty/oidn/core/common.h
@@ -0,0 +1,133 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common/platform.h"
+
+#include "mkl-dnn/include/mkldnn.hpp"
+#include "mkl-dnn/include/mkldnn_debug.h"
+#include "mkl-dnn/src/common/mkldnn_thread.hpp"
+#include "mkl-dnn/src/common/type_helpers.hpp"
+#include "mkl-dnn/src/cpu/jit_generator.hpp"
+
+#include "common/ref.h"
+#include "common/exception.h"
+#include "common/thread.h"
+#include "math.h"
+
+namespace oidn {
+
+ using namespace mkldnn;
+ using namespace mkldnn::impl::cpu;
+ using mkldnn::impl::parallel_nd;
+ using mkldnn::impl::memory_desc_matches_tag;
+
+
+ inline size_t getFormatBytes(Format format)
+ {
+ switch (format)
+ {
+ case Format::Undefined: return 1;
+ case Format::Float: return sizeof(float);
+ case Format::Float2: return sizeof(float)*2;
+ case Format::Float3: return sizeof(float)*3;
+ case Format::Float4: return sizeof(float)*4;
+ }
+ assert(0);
+ return 0;
+ }
+
+
+ inline memory::dims getTensorDims(const std::shared_ptr<memory>& mem)
+ {
+ const mkldnn_memory_desc_t& desc = mem->get_desc().data;
+ return memory::dims(&desc.dims[0], &desc.dims[desc.ndims]);
+ }
+
+ inline memory::data_type getTensorType(const std::shared_ptr<memory>& mem)
+ {
+ const mkldnn_memory_desc_t& desc = mem->get_desc().data;
+ return memory::data_type(desc.data_type);
+ }
+
+ // Returns the number of values in a tensor
+ inline size_t getTensorSize(const memory::dims& dims)
+ {
+ size_t res = 1;
+ for (int i = 0; i < (int)dims.size(); ++i)
+ res *= dims[i];
+ return res;
+ }
+
+ inline memory::dims getMaxTensorDims(const std::vector<memory::dims>& dims)
+ {
+ memory::dims result;
+ size_t maxSize = 0;
+
+ for (const auto& d : dims)
+ {
+ const size_t size = getTensorSize(d);
+ if (size > maxSize)
+ {
+ result = d;
+ maxSize = size;
+ }
+ }
+
+ return result;
+ }
+
+ inline size_t getTensorSize(const std::shared_ptr<memory>& mem)
+ {
+ return getTensorSize(getTensorDims(mem));
+ }
+
+
+ template<int K>
+ inline int getPadded(int dim)
+ {
+ return (dim + (K-1)) & ~(K-1);
+ }
+
+ template<int K>
+ inline memory::dims getPadded_nchw(const memory::dims& dims)
+ {
+ assert(dims.size() == 4);
+ memory::dims padDims = dims;
+ padDims[1] = getPadded<K>(dims[1]); // pad C
+ return padDims;
+ }
+
+
+ template<int K>
+ struct BlockedFormat;
+
+ template<>
+ struct BlockedFormat<8>
+ {
+ static constexpr memory::format_tag nChwKc = memory::format_tag::nChw8c;
+ static constexpr memory::format_tag OIhwKiKo = memory::format_tag::OIhw8i8o;
+ };
+
+ template<>
+ struct BlockedFormat<16>
+ {
+ static constexpr memory::format_tag nChwKc = memory::format_tag::nChw16c;
+ static constexpr memory::format_tag OIhwKiKo = memory::format_tag::OIhw16i16o;
+ };
+
+} // namespace oidn
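
getPadded<K>() rounds a channel count up to the next multiple of K with the usual power-of-two mask trick, and getPadded_nchw<K>() applies it to the C dimension only. A few worked values, as a standalone sketch (the helper below merely restates the formula; it is not the OIDN function itself):

// Restates the rounding used by getPadded<K>() above; valid only for power-of-two K.
template<int K>
constexpr int padded(int dim) { return (dim + (K - 1)) & ~(K - 1); }

static_assert(padded<8>(3)   == 8,  "3 input channels pad to one 8-wide block");
static_assert(padded<8>(9)   == 16, "9 channels need two blocks of 8");
static_assert(padded<16>(9)  == 16, "with K = 16 (AVX-512) they fit in one block");
static_assert(padded<16>(32) == 32, "already aligned values are unchanged");
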
diff --git a/thirdparty/oidn/core/device.cpp b/thirdparty/oidn/core/device.cpp
new file mode 100644
index 0000000000..0812624bb5
--- /dev/null
+++ b/thirdparty/oidn/core/device.cpp
@@ -0,0 +1,205 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "device.h"
+#include "autoencoder.h"
+
+namespace oidn {
+
+ thread_local Device::ErrorState Device::globalError;
+
+ Device::Device()
+ {
+ if (!mayiuse(sse41))
+ throw Exception(Error::UnsupportedHardware, "SSE4.1 support is required at minimum");
+ }
+
+ Device::~Device()
+ {
+ }
+
+ void Device::setError(Device* device, Error code, const std::string& message)
+ {
+ // Update the stored error only if the previous error was queried
+ if (device)
+ {
+ ErrorState& curError = device->error.get();
+
+ if (curError.code == Error::None)
+ {
+ curError.code = code;
+ curError.message = message;
+ }
+
+ // Print the error message in verbose mode
+ if (device->isVerbose())
+ std::cerr << "Error: " << message << std::endl;
+
+ // Call the error callback function
+ ErrorFunction errorFunc;
+ void* errorUserPtr;
+
+ {
+ std::lock_guard<std::mutex> lock(device->mutex);
+ errorFunc = device->errorFunc;
+ errorUserPtr = device->errorUserPtr;
+ }
+
+ if (errorFunc)
+ errorFunc(errorUserPtr, code, (code == Error::None) ? nullptr : message.c_str());
+ }
+ else
+ {
+ if (globalError.code == Error::None)
+ {
+ globalError.code = code;
+ globalError.message = message;
+ }
+ }
+ }
+
+ Error Device::getError(Device* device, const char** outMessage)
+ {
+ // Return and clear the stored error code, but keep the error message so pointers to it will
+ // remain valid until the next getError call
+ if (device)
+ {
+ ErrorState& curError = device->error.get();
+ const Error code = curError.code;
+ if (outMessage)
+ *outMessage = (code == Error::None) ? nullptr : curError.message.c_str();
+ curError.code = Error::None;
+ return code;
+ }
+ else
+ {
+ const Error code = globalError.code;
+ if (outMessage)
+ *outMessage = (code == Error::None) ? nullptr : globalError.message.c_str();
+ globalError.code = Error::None;
+ return code;
+ }
+ }
+
+ void Device::setErrorFunction(ErrorFunction func, void* userPtr)
+ {
+ errorFunc = func;
+ errorUserPtr = userPtr;
+ }
+
+ int Device::get1i(const std::string& name)
+ {
+ if (name == "numThreads")
+ return numThreads;
+ else if (name == "setAffinity")
+ return setAffinity;
+ else if (name == "verbose")
+ return verbose;
+ else if (name == "version")
+ return OIDN_VERSION;
+ else if (name == "versionMajor")
+ return OIDN_VERSION_MAJOR;
+ else if (name == "versionMinor")
+ return OIDN_VERSION_MINOR;
+ else if (name == "versionPatch")
+ return OIDN_VERSION_PATCH;
+ else
+ throw Exception(Error::InvalidArgument, "invalid parameter");
+ }
+
+ void Device::set1i(const std::string& name, int value)
+ {
+ if (name == "numThreads")
+ numThreads = value;
+ else if (name == "setAffinity")
+ setAffinity = value;
+ else if (name == "verbose")
+ {
+ verbose = value;
+ error.verbose = value;
+ }
+
+ dirty = true;
+ }
+
+ void Device::commit()
+ {
+ if (isCommitted())
+ throw Exception(Error::InvalidOperation, "device can be committed only once");
+
+ // Create the task arena
+ const int maxNumThreads = 1; //affinity ? affinity->getNumThreads() : tbb::this_task_arena::max_concurrency();
+ numThreads = (numThreads > 0) ? min(numThreads, maxNumThreads) : maxNumThreads;
+
+ dirty = false;
+
+ if (isVerbose())
+ print();
+ }
+
+ void Device::checkCommitted()
+ {
+ if (dirty)
+ throw Exception(Error::InvalidOperation, "changes to the device are not committed");
+ }
+
+ Ref<Buffer> Device::newBuffer(size_t byteSize)
+ {
+ checkCommitted();
+ return makeRef<Buffer>(Ref<Device>(this), byteSize);
+ }
+
+ Ref<Buffer> Device::newBuffer(void* ptr, size_t byteSize)
+ {
+ checkCommitted();
+ return makeRef<Buffer>(Ref<Device>(this), ptr, byteSize);
+ }
+
+ Ref<Filter> Device::newFilter(const std::string& type)
+ {
+ checkCommitted();
+
+ if (isVerbose())
+ std::cout << "Filter: " << type << std::endl;
+
+ Ref<Filter> filter;
+
+// Godot doesn't need Raytracing filters. Removing them saves space in the weights files.
+#if 0
+ if (type == "RT")
+ filter = makeRef<RTFilter>(Ref<Device>(this));
+#endif
+ if (type == "RTLightmap")
+ filter = makeRef<RTLightmapFilter>(Ref<Device>(this));
+ else
+ throw Exception(Error::InvalidArgument, "unknown filter type");
+
+ return filter;
+ }
+
+ void Device::print()
+ {
+ std::cout << std::endl;
+
+ std::cout << "Intel(R) Open Image Denoise " << OIDN_VERSION_STRING << std::endl;
+ std::cout << " Compiler: " << getCompilerName() << std::endl;
+ std::cout << " Build : " << getBuildName() << std::endl;
+ std::cout << " Platform: " << getPlatformName() << std::endl;
+
+ std::cout << std::endl;
+ }
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/device.h b/thirdparty/oidn/core/device.h
new file mode 100644
index 0000000000..93a83eb731
--- /dev/null
+++ b/thirdparty/oidn/core/device.h
@@ -0,0 +1,78 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+
+namespace oidn {
+
+ class Buffer;
+ class Filter;
+
+ class Device : public RefCount, public Verbose
+ {
+ private:
+ // Thread-safety
+ std::mutex mutex;
+
+ // Error handling
+ struct ErrorState
+ {
+ Error code = Error::None;
+ std::string message;
+ };
+
+ static thread_local ErrorState globalError;
+ ThreadLocal<ErrorState> error;
+ ErrorFunction errorFunc = nullptr;
+ void* errorUserPtr = nullptr;
+
+ // Parameters
+ int numThreads = 0; // autodetect by default
+ bool setAffinity = true;
+
+ bool dirty = true;
+
+ public:
+ Device();
+ ~Device();
+
+ static void setError(Device* device, Error code, const std::string& message);
+ static Error getError(Device* device, const char** outMessage);
+
+ void setErrorFunction(ErrorFunction func, void* userPtr);
+
+ int get1i(const std::string& name);
+ void set1i(const std::string& name, int value);
+
+ void commit();
+
+ Ref<Buffer> newBuffer(size_t byteSize);
+ Ref<Buffer> newBuffer(void* ptr, size_t byteSize);
+ Ref<Filter> newFilter(const std::string& type);
+
+ __forceinline Device* getDevice() { return this; }
+ __forceinline std::mutex& getMutex() { return mutex; }
+
+ private:
+ bool isCommitted() const { return false; }
+ void checkCommitted();
+
+ void print();
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/filter.cpp b/thirdparty/oidn/core/filter.cpp
new file mode 100644
index 0000000000..ec1f10af87
--- /dev/null
+++ b/thirdparty/oidn/core/filter.cpp
@@ -0,0 +1,27 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "filter.h"
+
+namespace oidn {
+
+ void Filter::setProgressMonitorFunction(ProgressMonitorFunction func, void* userPtr)
+ {
+ progressFunc = func;
+ progressUserPtr = userPtr;
+ }
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/filter.h b/thirdparty/oidn/core/filter.h
new file mode 100644
index 0000000000..935fa202f4
--- /dev/null
+++ b/thirdparty/oidn/core/filter.h
@@ -0,0 +1,52 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+#include "device.h"
+#include "image.h"
+
+namespace oidn {
+
+ class Filter : public RefCount
+ {
+ protected:
+ Ref<Device> device;
+
+ ProgressMonitorFunction progressFunc = nullptr;
+ void* progressUserPtr = nullptr;
+
+ bool dirty = true;
+
+ public:
+ explicit Filter(const Ref<Device>& device) : device(device) {}
+
+ virtual void setImage(const std::string& name, const Image& data) = 0;
+ virtual void set1i(const std::string& name, int value) = 0;
+ virtual int get1i(const std::string& name) = 0;
+ virtual void set1f(const std::string& name, float value) = 0;
+ virtual float get1f(const std::string& name) = 0;
+
+ void setProgressMonitorFunction(ProgressMonitorFunction func, void* userPtr);
+
+ virtual void commit() = 0;
+ virtual void execute() = 0;
+
+ Device* getDevice() { return device.get(); }
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/image.h b/thirdparty/oidn/core/image.h
new file mode 100644
index 0000000000..748f49c4e5
--- /dev/null
+++ b/thirdparty/oidn/core/image.h
@@ -0,0 +1,111 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+#include "buffer.h"
+
+namespace oidn {
+
+ struct Image
+ {
+ static constexpr int maxSize = 65536;
+
+ char* ptr; // pointer to the first pixel
+ int width; // width in number of pixels
+ int height; // height in number of pixels
+ size_t bytePixelStride; // pixel stride in number of *bytes*
+ size_t rowStride; // row stride in number of *pixel strides*
+ Format format; // pixel format
+ Ref<Buffer> buffer; // buffer containing the image data
+
+ Image() : ptr(nullptr), width(0), height(0), bytePixelStride(0), rowStride(0), format(Format::Undefined) {}
+
+ Image(void* ptr, Format format, int width, int height, size_t byteOffset, size_t inBytePixelStride, size_t inByteRowStride)
+ {
+ if (ptr == nullptr)
+ throw Exception(Error::InvalidArgument, "buffer pointer null");
+
+ init((char*)ptr + byteOffset, format, width, height, inBytePixelStride, inByteRowStride);
+ }
+
+ Image(const Ref<Buffer>& buffer, Format format, int width, int height, size_t byteOffset, size_t inBytePixelStride, size_t inByteRowStride)
+ {
+ init(buffer->data() + byteOffset, format, width, height, inBytePixelStride, inByteRowStride);
+
+ if (byteOffset + height * rowStride * bytePixelStride > buffer->size())
+ throw Exception(Error::InvalidArgument, "buffer region out of range");
+ }
+
+ void init(char* ptr, Format format, int width, int height, size_t inBytePixelStride, size_t inByteRowStride)
+ {
+ assert(width >= 0);
+ assert(height >= 0);
+ if (width > maxSize || height > maxSize)
+ throw Exception(Error::InvalidArgument, "image size too large");
+
+ this->ptr = ptr;
+ this->width = width;
+ this->height = height;
+
+ const size_t pixelSize = getFormatBytes(format);
+ if (inBytePixelStride != 0)
+ {
+ if (inBytePixelStride < pixelSize)
+ throw Exception(Error::InvalidArgument, "pixel stride smaller than pixel size");
+
+ this->bytePixelStride = inBytePixelStride;
+ }
+ else
+ {
+ this->bytePixelStride = pixelSize;
+ }
+
+ if (inByteRowStride != 0)
+ {
+ if (inByteRowStride < width * this->bytePixelStride)
+ throw Exception(Error::InvalidArgument, "row stride smaller than width * pixel stride");
+ if (inByteRowStride % this->bytePixelStride != 0)
+ throw Exception(Error::InvalidArgument, "row stride not integer multiple of pixel stride");
+
+ this->rowStride = inByteRowStride / this->bytePixelStride;
+ }
+ else
+ {
+ this->rowStride = width;
+ }
+
+ this->format = format;
+ }
+
+ __forceinline char* get(int y, int x)
+ {
+ return ptr + ((size_t(y) * rowStride + size_t(x)) * bytePixelStride);
+ }
+
+ __forceinline const char* get(int y, int x) const
+ {
+ return ptr + ((size_t(y) * rowStride + size_t(x)) * bytePixelStride);
+ }
+
+ operator bool() const
+ {
+ return ptr != nullptr;
+ }
+ };
+
+} // namespace oidn
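
Image stores rowStride in units of pixel strides, so get(y, x) reduces to a single multiply-add chain, and init() fills in both strides when the caller passes 0. A standalone restatement of that addressing with illustrative numbers (not the OIDN API itself):

#include <cassert>
#include <cstddef>

// Restates the Image::get() addressing above (illustration only).
static size_t imageOffset(size_t rowStride, size_t bytePixelStride, int y, int x)
{
  return (size_t(y) * rowStride + size_t(x)) * bytePixelStride;
}

int main()
{
  // A 640-wide Float3 image with 16-byte padded pixels and 16384-byte rows:
  // rowStride = byteRowStride / bytePixelStride, i.e. it is stored in *pixel strides*,
  // exactly as Image::init() computes it.
  const size_t bytePixelStride = 16;
  const size_t rowStride       = 16384 / 16; // = 1024 pixel strides per row, padding included

  // Pixel (y = 2, x = 3) therefore starts 2*16384 + 3*16 = 32816 bytes into the image.
  assert(imageOffset(rowStride, bytePixelStride, 2, 3) == 32816);
  return 0;
}
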
diff --git a/thirdparty/oidn/core/input_reorder.h b/thirdparty/oidn/core/input_reorder.h
new file mode 100644
index 0000000000..966856afe9
--- /dev/null
+++ b/thirdparty/oidn/core/input_reorder.h
@@ -0,0 +1,232 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "node.h"
+#include "image.h"
+
+namespace oidn {
+
+ // Input reorder node
+ template<int K, class TransferFunction>
+ class InputReorderNode : public Node
+ {
+ private:
+ // Source
+ Image color;
+ Image albedo;
+ Image normal;
+
+ // Destination
+ std::shared_ptr<memory> dst;
+ float* dstPtr;
+ int C2;
+ int H2;
+ int W2;
+
+ // Tile
+ int h1Begin;
+ int w1Begin;
+ int h2Begin;
+ int w2Begin;
+ int H;
+ int W;
+
+ std::shared_ptr<TransferFunction> transferFunc;
+
+ public:
+ InputReorderNode(const Image& color,
+ const Image& albedo,
+ const Image& normal,
+ const std::shared_ptr<memory>& dst,
+ const std::shared_ptr<TransferFunction>& transferFunc)
+ : color(color), albedo(albedo), normal(normal),
+ dst(dst),
+ h1Begin(0), w1Begin(0),
+ H(color.height), W(color.width),
+ transferFunc(transferFunc)
+ {
+ const mkldnn_memory_desc_t& dstDesc = dst->get_desc().data;
+ assert(memory_desc_matches_tag(dstDesc, mkldnn_format_tag_t(BlockedFormat<K>::nChwKc)));
+ assert(dstDesc.ndims == 4);
+ assert(dstDesc.data_type == memory::data_type::f32);
+ assert(dstDesc.dims[0] == 1);
+ //assert(dstDesc.dims[1] >= getPadded<K>(C1));
+
+ dstPtr = (float*)dst->get_data_handle();
+ C2 = dstDesc.dims[1];
+ H2 = dstDesc.dims[2];
+ W2 = dstDesc.dims[3];
+ }
+
+ void setTile(int h1, int w1, int h2, int w2, int H, int W) override
+ {
+ h1Begin = h1;
+ w1Begin = w1;
+ h2Begin = h2;
+ w2Begin = w2;
+ this->H = H;
+ this->W = W;
+ }
+
+ void execute(stream& sm) override
+ {
+ assert(H + h1Begin <= color.height);
+ assert(W + w1Begin <= color.width);
+ assert(H + h2Begin <= H2);
+ assert(W + w2Begin <= W2);
+
+ parallel_nd(H2, [&](int h2)
+ {
+ const int h = h2 - h2Begin;
+
+ if (h >= 0 && h < H)
+ {
+ const int h1 = h + h1Begin;
+
+ // Zero pad
+ for (int w2 = 0; w2 < w2Begin; ++w2)
+ {
+ int c = 0;
+ while (c < C2)
+ store(h2, w2, c, 0.f);
+ }
+
+ // Reorder
+ for (int w = 0; w < W; ++w)
+ {
+ const int w1 = w + w1Begin;
+ const int w2 = w + w2Begin;
+
+ int c = 0;
+ storeColor(h2, w2, c, (float*)color.get(h1, w1));
+ if (albedo)
+ storeAlbedo(h2, w2, c, (float*)albedo.get(h1, w1));
+ if (normal)
+ storeNormal(h2, w2, c, (float*)normal.get(h1, w1));
+ while (c < C2)
+ store(h2, w2, c, 0.f);
+ }
+
+ // Zero pad
+ for (int w2 = W + w2Begin; w2 < W2; ++w2)
+ {
+ int c = 0;
+ while (c < C2)
+ store(h2, w2, c, 0.f);
+ }
+ }
+ else
+ {
+ // Zero pad
+ for (int w2 = 0; w2 < W2; ++w2)
+ {
+ int c = 0;
+ while (c < C2)
+ store(h2, w2, c, 0.f);
+ }
+ }
+ });
+ }
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+
+ private:
+ // Stores a single value
+ __forceinline void store(int h, int w, int& c, float value)
+ {
+ // Destination is in nChwKc format
+ float* dst_c = dstPtr + (H2*W2*K*(c/K)) + h*W2*K + w*K + (c%K);
+ *dst_c = value;
+ c++;
+ }
+
+ // Stores a color
+ __forceinline void storeColor(int h, int w, int& c, const float* values)
+ {
+ #pragma unroll
+ for (int i = 0; i < 3; ++i)
+ {
+ // Load the value
+ float x = values[i];
+
+ // Sanitize the value
+ x = maxSafe(x, 0.f);
+
+ // Apply the transfer function
+ x = transferFunc->forward(x);
+
+ // Store the value
+ store(h, w, c, x);
+ }
+ }
+
+ // Stores an albedo
+ __forceinline void storeAlbedo(int h, int w, int& c, const float* values)
+ {
+ #pragma unroll
+ for (int i = 0; i < 3; ++i)
+ {
+ // Load the value
+ float x = values[i];
+
+ // Sanitize the value
+ x = clampSafe(x, 0.f, 1.f);
+
+ // Store the value
+ store(h, w, c, x);
+ }
+ }
+
+ // Stores a normal
+ __forceinline void storeNormal(int h, int w, int& c, const float* values)
+ {
+ // Load the normal
+ float x = values[0];
+ float y = values[1];
+ float z = values[2];
+
+ // Compute the length of the normal
+ const float lengthSqr = sqr(x) + sqr(y) + sqr(z);
+
+ // Normalize the normal and transform it to [0..1]
+ if (isfinite(lengthSqr))
+ {
+ const float invLength = (lengthSqr > minVectorLengthSqr) ? rsqrt(lengthSqr) : 1.f;
+
+ const float scale = invLength * 0.5f;
+ const float offset = 0.5f;
+
+ x = x * scale + offset;
+ y = y * scale + offset;
+ z = z * scale + offset;
+ }
+ else
+ {
+ x = 0.f;
+ y = 0.f;
+ z = 0.f;
+ }
+
+ // Store the normal
+ store(h, w, c, x);
+ store(h, w, c, y);
+ store(h, w, c, z);
+ }
+ };
+
+} // namespace oidn
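InputReorderNode::store() above writes into the blocked nChwKc layout: channel c goes to block c/K at lane c%K, so the K channels of one block are contiguous per pixel. A small self-contained sketch of that offset computation (K and the tensor sizes below are arbitrary example values):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const int K = 8;                   // channel block size
    const int C2 = 16, H2 = 3, W2 = 5; // padded channels, height, width

    std::vector<float> dst(size_t(C2) * H2 * W2, 0.f);

    // Same offset as store() above: (c/K)-th block, then h, then w, then lane c%K.
    auto offset = [&](int c, int h, int w) {
        return size_t(H2) * W2 * K * (c / K) + size_t(h) * W2 * K + size_t(w) * K + (c % K);
    };

    dst[offset(11, 2, 4)] = 1.f;            // write channel 11 of pixel (h=2, w=4)
    assert(dst[offset(11, 2, 4)] == 1.f);   // read it back through the same mapping

    // Lanes of one block are adjacent in memory, which the aligned SSE loads in
    // upsample.h rely on.
    assert(offset(1, 0, 0) == offset(0, 0, 0) + 1);
    return 0;
}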
diff --git a/thirdparty/oidn/core/math.h b/thirdparty/oidn/core/math.h
new file mode 100644
index 0000000000..a844ef0d1d
--- /dev/null
+++ b/thirdparty/oidn/core/math.h
@@ -0,0 +1,78 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common/platform.h"
+
+namespace oidn {
+
+ constexpr float minVectorLength = 1e-10f;
+ constexpr float minVectorLengthSqr = minVectorLength * minVectorLength;
+
+ using std::log;
+ using std::log2;
+ using std::exp;
+ using std::exp2;
+ using std::pow;
+ using std::isfinite;
+ using std::isnan;
+
+ __forceinline float sqr(float x)
+ {
+ return x * x;
+ }
+
+ __forceinline float rcp(float x)
+ {
+ __m128 r = _mm_rcp_ss(_mm_set_ss(x));
+ return _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(r, r), _mm_mul_ss(_mm_mul_ss(r, r), _mm_set_ss(x))));
+ }
+
+ __forceinline float rsqrt(float x)
+ {
+ __m128 r = _mm_rsqrt_ss(_mm_set_ss(x));
+ return _mm_cvtss_f32(_mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
+ _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(_mm_set_ss(x), _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))));
+ }
+
+ __forceinline float maxSafe(float value, float minValue)
+ {
+ return isfinite(value) ? max(value, minValue) : minValue;
+ }
+
+ __forceinline float clampSafe(float value, float minValue, float maxValue)
+ {
+ return isfinite(value) ? clamp(value, minValue, maxValue) : minValue;
+ }
+
+ // Returns ceil(a / b) for non-negative integers
+ template<class Int>
+ __forceinline constexpr Int ceilDiv(Int a, Int b)
+ {
+ //assert(a >= 0);
+ //assert(b > 0);
+ return (a + b - 1) / b;
+ }
+
+ // Returns a rounded up to multiple of b
+ template<class Int>
+ __forceinline constexpr Int roundUp(Int a, Int b)
+ {
+ return ceilDiv(a, b) * b;
+ }
+
+} // namespace oidn
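rcp and rsqrt above refine the hardware reciprocal and reciprocal-square-root estimates with one Newton-Raphson step; ceilDiv and roundUp are the integer rounding used when padding channel counts and image tiles. A quick standalone check of the rounding helpers with concrete values (the numbers are only examples):

template<class Int>
constexpr Int ceilDiv(Int a, Int b) { return (a + b - 1) / b; } // as defined above

template<class Int>
constexpr Int roundUp(Int a, Int b) { return ceilDiv(a, b) * b; }

int main()
{
    static_assert(ceilDiv(9, 8) == 2,        "9 channels need 2 blocks of 8");
    static_assert(roundUp(9, 8) == 16,       "9 rounds up to the next multiple of 8");
    static_assert(roundUp(16, 8) == 16,      "exact multiples are left unchanged");
    static_assert(roundUp(1080, 16) == 1088, "e.g. a 1080-row image padded to 16-row tiles");
    return 0;
}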
diff --git a/thirdparty/oidn/core/network.cpp b/thirdparty/oidn/core/network.cpp
new file mode 100644
index 0000000000..4da32073cd
--- /dev/null
+++ b/thirdparty/oidn/core/network.cpp
@@ -0,0 +1,434 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "network.h"
+#include "upsample.h"
+#include "weights_reorder.h"
+#include <cstring>
+
+namespace oidn {
+
+ template<int K>
+ Network<K>::Network(const Ref<Device>& device, const std::map<std::string, Tensor>& weightMap)
+ : device(device),
+ eng(engine::cpu, 0),
+ sm(eng),
+ weightMap(weightMap)
+ {
+ }
+
+ template<int K>
+ void Network<K>::execute(const Progress& progress, int taskIndex)
+ {
+ if (progress.func)
+ {
+ const double value = double(taskIndex) / double(progress.taskCount);
+ if (!progress.func(progress.userPtr, value))
+ throw Exception(Error::Cancelled, "execution was cancelled");
+ }
+
+ for (size_t i = 0; i < nodes.size(); ++i)
+ {
+ nodes[i]->execute(sm);
+
+ if (progress.func)
+ {
+ const double value = (double(taskIndex) + double(i+1) / double(nodes.size())) / double(progress.taskCount);
+ if (!progress.func(progress.userPtr, value))
+ throw Exception(Error::Cancelled, "execution was cancelled");
+ }
+ }
+ }
+
+ template<int K>
+ std::shared_ptr<memory> Network<K>::allocTensor(const memory::dims& dims,
+ memory::format_tag format,
+ void* data)
+ {
+ if (format == memory::format_tag::any)
+ {
+ if (dims.size() == 4)
+ format = BlockedFormat<K>::nChwKc;
+ else if (dims.size() == 1)
+ format = memory::format_tag::x;
+ else
+ assert(0);
+ }
+ memory::desc desc(dims, memory::data_type::f32, format);
+ if (data == nullptr)
+ {
+ const size_t bytes = getTensorSize(dims) * sizeof(float);
+ if (format == BlockedFormat<K>::nChwKc)
+ activationAllocBytes += bytes;
+ totalAllocBytes += bytes;
+
+ return std::make_shared<memory>(desc, eng);
+ }
+ else
+ {
+ return std::make_shared<memory>(desc, eng, data);
+ }
+ }
+
+ template<int K>
+ std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
+ const std::shared_ptr<memory>& src,
+ size_t srcOffset,
+ memory::format_tag format)
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ MAYBE_UNUSED(srcDesc);
+ assert(srcDesc.data_type == memory::data_type::f32);
+ assert(getTensorSize(src) >= srcOffset + getTensorSize(dims));
+
+ if (format == memory::format_tag::any)
+ {
+ if (dims.size() == 4)
+ format = BlockedFormat<K>::nChwKc;
+ else if (dims.size() == 1)
+ format = memory::format_tag::x;
+ else
+ assert(0);
+ }
+ memory::desc desc(dims, memory::data_type::f32, format);
+ float* srcPtr = (float*)src->get_data_handle() + srcOffset;
+ return std::make_shared<memory>(desc, eng, srcPtr);
+ }
+
+ template<int K>
+ std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
+ const std::shared_ptr<memory>& src,
+ const memory::dims& srcOffset)
+ {
+ return castTensor(dims, src, getTensorSize(srcOffset));
+ }
+
+ template<int K>
+ void Network<K>::zeroTensor(const std::shared_ptr<memory>& dst)
+ {
+ assert(getTensorType(dst) == memory::data_type::f32);
+ memset(dst->get_data_handle(), 0, getTensorSize(dst)*sizeof(float));
+ }
+
+ template<int K>
+ memory::dims Network<K>::getInputReorderDims(const memory::dims& srcDims, int alignment)
+ {
+ memory::dims dstDims = srcDims;
+ dstDims[1] = getPadded<K>(srcDims[1]); // round up C
+ dstDims[2] = roundUp(srcDims[2], memory::dim(alignment)); // round up H
+ dstDims[3] = roundUp(srcDims[3], memory::dim(alignment)); // round up W
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addInputReorder(const Image& color,
+ const Image& albedo,
+ const Image& normal,
+ const std::shared_ptr<TransferFunction>& transferFunc,
+ int alignment,
+ const std::shared_ptr<memory>& userDst)
+ {
+ assert(color);
+ int inputC = 3;
+ if (albedo) inputC += 3;
+ if (normal) inputC += 3;
+
+ memory::dims srcDims = {1, inputC, color.height, color.width};
+ memory::dims dstDims = getInputReorderDims(srcDims, alignment);
+
+ // Allocate padded memory
+ auto dst = userDst;
+ if (!dst)
+ dst = allocTensor(dstDims);
+
+ // Push node
+ std::shared_ptr<Node> node;
+
+ if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
+ node = std::make_shared<InputReorderNode<K, LinearTransferFunction>>(color, albedo, normal, dst, tf);
+ else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
+ node = std::make_shared<InputReorderNode<K, GammaTransferFunction>>(color, albedo, normal, dst, tf);
+ else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
+ node = std::make_shared<InputReorderNode<K, LogTransferFunction>>(color, albedo, normal, dst, tf);
+ else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
+ node = std::make_shared<InputReorderNode<K, PQXTransferFunction>>(color, albedo, normal, dst, tf);
+ else
+ assert(0);
+
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addOutputReorder(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<TransferFunction>& transferFunc,
+ const Image& output)
+ {
+ memory::dims srcDims = getTensorDims(src);
+ assert(srcDims[1] == K);
+
+ // Push node
+ std::shared_ptr<Node> node;
+
+ if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
+ node = std::make_shared<OutputReorderNode<K, LinearTransferFunction>>(src, output, tf);
+ else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
+ node = std::make_shared<OutputReorderNode<K, GammaTransferFunction>>(src, output, tf);
+ else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
+ node = std::make_shared<OutputReorderNode<K, LogTransferFunction>>(src, output, tf);
+ else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
+ node = std::make_shared<OutputReorderNode<K, PQXTransferFunction>>(src, output, tf);
+ else
+ assert(0);
+
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ memory::dims Network<K>::getConvDims(const std::string& name, const memory::dims& srcDims)
+ {
+ auto b = weightMap[name + "/b"];
+ memory::dims dstDims = srcDims;
+ dstDims[1] = getPadded<K>(b.dims[0]); // dstDims[C] = getPadded(OC)
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addConv(const std::string& name,
+ const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst,
+ bool relu)
+ {
+ const memory::dims strides = {1, 1};
+ const memory::dims padding = {1, 1};
+
+ memory::dims srcDims = getTensorDims(src);
+
+ // Get the weights
+ const auto& W = weightMap[name + "/W"];
+ if (W.ndims() != 4 || W.format != "oihw")
+ throw Exception(Error::InvalidOperation, "invalid convolution weights");
+ memory::dims weightsDims = W.dims;
+ auto userWeights = allocTensor(weightsDims, memory::format_tag::oihw, W.data);
+
+ // Pad the weights
+ memory::dims weightsPadDims = weightsDims;
+ weightsPadDims[1] = getPadded<K>(weightsDims[1]); // IC
+ weightsPadDims[0] = getPadded<K>(weightsDims[0]); // OC
+ assert(srcDims[1] == weightsPadDims[1]); // srcDims[C] == weightsPadDims[IC]
+ auto weightsPad = allocTensor(weightsPadDims, memory::format_tag::oihw);
+ WeightsReorderNode<K>(userWeights, weightsPad).execute(sm);
+
+ // Get the biases
+ const auto& b = weightMap[name + "/b"];
+ if (b.ndims() != 1)
+ throw Exception(Error::InvalidOperation, "invalid convolution biases");
+ memory::dims biasDims = b.dims;
+
+ // Copy/pad the biases
+ memory::dims biasPadDims = {getPadded<K>(biasDims[0])};
+ auto bias = allocTensor(biasPadDims);
+ if (biasDims[0] != biasPadDims[0])
+ memset(bias->get_data_handle(), 0, biasPadDims[0]*sizeof(float));
+ memcpy(bias->get_data_handle(), b.data, biasDims[0]*sizeof(float));
+
+ // Allocate memory for destination
+ memory::dims dstDims = srcDims;
+ dstDims[1] = weightsPadDims[0]; // dstDims[C] = weightsPadDims[OC]
+
+ std::shared_ptr<memory> dst;
+ if (!userDst)
+ dst = allocTensor(dstDims);
+ else if (getTensorDims(userDst) == dstDims)
+ dst = userDst;
+ else
+ dst = castTensor(dstDims, userDst);
+
+ // Create a convolution
+ // Let the convolution primitive choose the weights format
+ auto weightsDesc = memory::desc({ weightsPadDims }, memory::data_type::f32, memory::format_tag::any);
+
+ auto convAlgo = (K == 16) ? convolution_winograd : convolution_direct;
+ auto convDesc = convolution_forward::desc(
+ prop_kind::forward_inference, convAlgo,
+ src->get_desc(),
+ weightsDesc,
+ bias->get_desc(),
+ dst->get_desc(),
+ strides, padding, padding, padding_kind::zero);
+
+ // Incorporate relu
+ mkldnn::primitive_attr convAttr;
+ if (relu)
+ {
+ mkldnn::post_ops ops;
+ ops.append_eltwise(
+ 1.f, // scale factor, not used
+ algorithm::eltwise_relu,
+        0.f,   // negative slope (alpha); 0 gives the standard max(x, 0) ReLU
+ 0.f // unused
+ );
+ convAttr.set_post_ops(ops);
+ }
+ convAttr.set_scratchpad_mode(scratchpad_mode_user);
+
+ auto convPrimDesc = convolution_forward::primitive_desc(convDesc, convAttr, eng);
+
+ // Reorder the weights to the final format, if necessary
+ auto weights = weightsPad;
+ if (convPrimDesc.weights_desc() != weightsPad->get_desc())
+ {
+ weights = std::make_shared<memory>(convPrimDesc.weights_desc(), eng);
+ ReorderNode(weightsPad, weights).execute(sm);
+ }
+
+ // Create convolution node and add it to the net
+ auto node = std::make_shared<ConvNode>(convPrimDesc, src, weights, bias, dst);
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ memory::dims Network<K>::getPoolDims(const memory::dims& srcDims)
+ {
+ memory::dims dstDims = srcDims;
+ dstDims[2] /= 2; // H/2
+ dstDims[3] /= 2; // W/2
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addPool(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst)
+ {
+ const memory::dims kernel = {2, 2};
+ const memory::dims strides = {2, 2};
+ const memory::dims padding = {0, 0};
+
+ memory::dims srcDims = getTensorDims(src);
+ memory::dims dstDims = getPoolDims(srcDims);
+
+ std::shared_ptr<memory> dst;
+ if (!userDst)
+ dst = allocTensor(dstDims);
+ else if (getTensorDims(userDst) == dstDims)
+ dst = userDst;
+ else
+ dst = castTensor(dstDims, userDst);
+
+ auto poolDesc = pooling_forward::desc(
+ prop_kind::forward_inference, pooling_max,
+ src->get_desc(),
+ dst->get_desc(),
+ strides, kernel, padding, padding, padding_kind::zero);
+
+ mkldnn::primitive_attr poolAttr;
+ poolAttr.set_scratchpad_mode(scratchpad_mode_user);
+
+ auto poolPrimDesc = pooling_forward::primitive_desc(poolDesc, poolAttr, eng);
+
+ auto node = std::make_shared<PoolNode>(poolPrimDesc, src, dst);
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ memory::dims Network<K>::getUpsampleDims(const memory::dims& srcDims)
+ {
+ memory::dims dstDims = srcDims;
+ dstDims[2] *= 2; // H*2
+ dstDims[3] *= 2; // W*2
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addUpsample(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst)
+ {
+ memory::dims srcDims = getTensorDims(src);
+ memory::dims dstDims = getUpsampleDims(srcDims);
+
+ std::shared_ptr<memory> dst;
+ if (!userDst)
+ dst = allocTensor(dstDims);
+ else if (getTensorDims(userDst) == dstDims)
+ dst = userDst;
+ else
+ dst = castTensor(dstDims, userDst);
+
+ // Create upsampling node and add it to net
+ auto node = std::make_shared<UpsampleNode<K>>(src, dst);
+ nodes.push_back(node);
+ return node;
+ }
+
+ template<int K>
+ memory::dims Network<K>::getConcatDims(const memory::dims& src1Dims, const memory::dims& src2Dims)
+ {
+ assert(src1Dims[0] == src2Dims[0]); // N
+ assert(src1Dims[2] == src2Dims[2]); // H
+ assert(src1Dims[3] == src2Dims[3]); // W
+
+ memory::dims dstDims = src1Dims;
+ dstDims[1] += src2Dims[1]; // C
+ return dstDims;
+ }
+
+ template<int K>
+ std::shared_ptr<Node> Network<K>::addAutoexposure(const Image& color,
+ const std::shared_ptr<HDRTransferFunction>& transferFunc)
+ {
+ auto node = std::make_shared<AutoexposureNode>(color, transferFunc);
+ nodes.push_back(node);
+ return node;
+ }
+
+ template <int K>
+ void Network<K>::finalize()
+ {
+ // Compute the size of the scratchpad
+ size_t scratchpadSize = 0;
+ for (const auto& node : nodes)
+ scratchpadSize = max(scratchpadSize, node->getScratchpadSize());
+
+ // Allocate the scratchpad
+ memory::dims scratchpadDims = { memory::dim(scratchpadSize) };
+ memory::desc scratchpadDesc(scratchpadDims, memory::data_type::u8, memory::format_tag::x);
+ auto scratchpad = std::make_shared<memory>(scratchpadDesc, eng);
+ activationAllocBytes += scratchpadSize;
+ totalAllocBytes += scratchpadSize;
+
+ // Set the scratchpad for the nodes
+ for (auto& node : nodes)
+ node->setScratchpad(scratchpad);
+
+ // Free the weights
+ weightMap.clear();
+
+ // Print statistics
+ if (device->isVerbose(2))
+ {
+ std::cout << "Activation bytes: " << activationAllocBytes << std::endl;
+ std::cout << "Scratchpad bytes: " << scratchpadSize << std::endl;
+ std::cout << "Total bytes : " << totalAllocBytes << std::endl;
+ }
+ }
+
+ template class Network<8>;
+ template class Network<16>;
+
+} // namespace oidn
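getInputReorderDims() above pads the channel count to a multiple of K and rounds the spatial extent up to the tile alignment. A worked standalone example of that padding, assuming getPadded<K> rounds up to the next multiple of K (which matches how it is used here) and picking an alignment of 16 purely for illustration:

#include <cstdio>

int main()
{
    const int K = 8;          // channel block size (Network<8>)
    const int alignment = 16; // example tile alignment, not taken from the source

    auto roundUp = [](int a, int b) { return (a + b - 1) / b * b; };

    // N, C, H, W of a color+albedo+normal input: 3+3+3 = 9 channels.
    const int src[4] = {1, 9, 1080, 1920};
    const int dst[4] = {
        src[0],
        roundUp(src[1], K),         // C: 9    -> 16
        roundUp(src[2], alignment), // H: 1080 -> 1088
        roundUp(src[3], alignment)  // W: 1920 -> 1920
    };

    std::printf("{%d, %d, %d, %d} -> {%d, %d, %d, %d}\n",
                src[0], src[1], src[2], src[3], dst[0], dst[1], dst[2], dst[3]);
    return 0;
}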
diff --git a/thirdparty/oidn/core/network.h b/thirdparty/oidn/core/network.h
new file mode 100644
index 0000000000..7a696fd355
--- /dev/null
+++ b/thirdparty/oidn/core/network.h
@@ -0,0 +1,112 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "common/tensor.h"
+#include "image.h"
+#include "node.h"
+#include "input_reorder.h"
+#include "output_reorder.h"
+#include "transfer_function.h"
+
+#pragma once
+
+namespace oidn {
+
+ // Progress state
+ struct Progress
+ {
+ ProgressMonitorFunction func;
+ void* userPtr;
+ int taskCount;
+ };
+
+ class Executable
+ {
+ public:
+ virtual ~Executable() {}
+ virtual void execute(const Progress& progress, int taskIndex) = 0;
+ };
+
+ template<int K>
+ class Network : public Executable
+ {
+ public:
+ Network(const Ref<Device>& device, const std::map<std::string, Tensor>& weightMap);
+
+ void execute(const Progress& progress, int taskIndex) override;
+
+ std::shared_ptr<memory> allocTensor(const memory::dims& dims,
+ memory::format_tag format = memory::format_tag::any,
+ void* data = nullptr);
+
+ std::shared_ptr<memory> castTensor(const memory::dims& dims,
+ const std::shared_ptr<memory>& src,
+ size_t srcOffset = 0,
+ memory::format_tag format = memory::format_tag::any);
+
+ std::shared_ptr<memory> castTensor(const memory::dims& dims,
+ const std::shared_ptr<memory>& src,
+ const memory::dims& srcOffset);
+
+ void zeroTensor(const std::shared_ptr<memory>& dst);
+
+ memory::dims getInputReorderDims(const memory::dims& srcDims, int alignment);
+
+ std::shared_ptr<Node> addInputReorder(const Image& color,
+ const Image& albedo,
+ const Image& normal,
+ const std::shared_ptr<TransferFunction>& transferFunc,
+ int alignment,
+ const std::shared_ptr<memory>& userDst = nullptr);
+
+ std::shared_ptr<Node> addOutputReorder(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<TransferFunction>& transferFunc,
+ const Image& output);
+
+ memory::dims getConvDims(const std::string& name, const memory::dims& srcDims);
+ std::shared_ptr<Node> addConv(const std::string& name,
+ const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst = nullptr,
+ bool relu = true);
+
+ memory::dims getPoolDims(const memory::dims& srcDims);
+ std::shared_ptr<Node> addPool(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst = nullptr);
+
+ memory::dims getUpsampleDims(const memory::dims& srcDims);
+ std::shared_ptr<Node> addUpsample(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& userDst = nullptr);
+
+ memory::dims getConcatDims(const memory::dims& src1Dims, const memory::dims& src2Dims);
+
+ std::shared_ptr<Node> addAutoexposure(const Image& color,
+ const std::shared_ptr<HDRTransferFunction>& transferFunc);
+
+ void finalize();
+
+ private:
+ Ref<Device> device;
+ engine eng;
+ stream sm;
+ std::vector<std::shared_ptr<Node>> nodes;
+ std::map<std::string, Tensor> weightMap;
+
+ // Memory allocation statistics
+ size_t activationAllocBytes = 0; // number of allocated activation bytes
+ size_t totalAllocBytes = 0; // total number of allocated bytes
+ };
+
+} // namespace oidn
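Network<K>::execute() in network.cpp combines the task index with the per-node index into one monotonically increasing fraction that is reported through Progress::func. A standalone sketch of that mapping (the task and node counts below are made up):

#include <cstdio>

int main()
{
    const int taskCount = 3; // e.g. number of tiles the filter processes
    const int nodeCount = 4; // nodes executed per task

    for (int taskIndex = 0; taskIndex < taskCount; ++taskIndex)
        for (int i = 0; i < nodeCount; ++i)
        {
            // Same formula as Network<K>::execute() above.
            const double value =
                (double(taskIndex) + double(i + 1) / double(nodeCount)) / double(taskCount);
            std::printf("task %d, node %d -> %.3f\n", taskIndex, i, value);
        }
    return 0;
}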
diff --git a/thirdparty/oidn/core/node.h b/thirdparty/oidn/core/node.h
new file mode 100644
index 0000000000..b9ffe906df
--- /dev/null
+++ b/thirdparty/oidn/core/node.h
@@ -0,0 +1,142 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "common.h"
+#include <vector>
+
+namespace oidn {
+
+ class Node
+ {
+ public:
+ virtual ~Node() = default;
+
+ virtual void execute(stream& sm) = 0;
+
+ virtual std::shared_ptr<memory> getDst() const { return nullptr; }
+
+ virtual size_t getScratchpadSize() const { return 0; }
+ virtual void setScratchpad(const std::shared_ptr<memory>& mem) {}
+
+ virtual void setTile(int h1, int w1, int h2, int w2, int H, int W)
+ {
+ assert(0); // not supported
+ }
+ };
+
+ // Node wrapping an MKL-DNN primitive
+ class MklNode : public Node
+ {
+ private:
+ primitive prim;
+ std::unordered_map<int, memory> args;
+ std::shared_ptr<memory> scratchpad;
+
+ public:
+ MklNode(const primitive& prim, const std::unordered_map<int, memory>& args)
+ : prim(prim),
+ args(args)
+ {}
+
+ size_t getScratchpadSize() const override
+ {
+ const auto primDesc = prim.get_primitive_desc();
+ const mkldnn_memory_desc_t* scratchpadDesc = mkldnn_primitive_desc_query_md(primDesc, mkldnn_query_scratchpad_md, 0);
+ if (scratchpadDesc == nullptr)
+ return 0;
+ return mkldnn_memory_desc_get_size(scratchpadDesc);
+ }
+
+ void setScratchpad(const std::shared_ptr<memory>& mem) override
+ {
+ scratchpad = mem;
+ args.insert(std::make_pair(MKLDNN_ARG_SCRATCHPAD, *scratchpad));
+ }
+
+ void execute(stream& sm) override
+ {
+ prim.execute(sm, args);
+ }
+ };
+
+ // Convolution node
+ class ConvNode : public MklNode
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> weights;
+ std::shared_ptr<memory> bias;
+ std::shared_ptr<memory> dst;
+
+ public:
+ ConvNode(const convolution_forward::primitive_desc& desc,
+ const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& weights,
+ const std::shared_ptr<memory>& bias,
+ const std::shared_ptr<memory>& dst)
+ : MklNode(convolution_forward(desc),
+ { { MKLDNN_ARG_SRC, *src },
+ { MKLDNN_ARG_WEIGHTS, *weights },
+ { MKLDNN_ARG_BIAS, *bias },
+ { MKLDNN_ARG_DST, *dst } }),
+ src(src), weights(weights), bias(bias), dst(dst)
+ {}
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+ // Pooling node
+ class PoolNode : public MklNode
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> dst;
+
+ public:
+ PoolNode(const pooling_forward::primitive_desc& desc,
+ const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& dst)
+ : MklNode(pooling_forward(desc),
+ { { MKLDNN_ARG_SRC, *src },
+ { MKLDNN_ARG_DST, *dst } }),
+ src(src), dst(dst)
+ {}
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+ // Reorder node
+ class ReorderNode : public MklNode
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> dst;
+
+ public:
+ ReorderNode(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& dst)
+ : MklNode(reorder(reorder::primitive_desc(*src, *dst)),
+ { { MKLDNN_ARG_SRC, *src },
+ { MKLDNN_ARG_DST, *dst } }),
+ src(src), dst(dst)
+ {}
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+} // namespace oidn
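Nodes advertise their temporary-memory needs through getScratchpadSize(), and Network<K>::finalize() allocates one buffer sized for the largest requirement and hands it to every node via setScratchpad(); sharing is safe because the nodes execute sequentially. A minimal sketch of that sizing step (the per-node sizes are invented):

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical per-node scratchpad requirements in bytes.
    const std::vector<size_t> perNode = {0, 4096, size_t(1) << 20, 512};

    size_t scratchpadSize = 0;
    for (size_t s : perNode)
        scratchpadSize = std::max(scratchpadSize, s);

    // One allocation of this size serves all nodes.
    std::printf("shared scratchpad: %zu bytes\n", scratchpadSize);
    return 0;
}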
diff --git a/thirdparty/oidn/core/output_reorder.h b/thirdparty/oidn/core/output_reorder.h
new file mode 100644
index 0000000000..7918d48e15
--- /dev/null
+++ b/thirdparty/oidn/core/output_reorder.h
@@ -0,0 +1,126 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "node.h"
+#include "image.h"
+
+namespace oidn {
+
+ // Output reorder node
+ template<int K, class TransferFunction>
+ class OutputReorderNode : public Node
+ {
+ private:
+ // Source
+ std::shared_ptr<memory> src;
+ const float* srcPtr;
+ int H1;
+ int W1;
+
+ // Destination
+ Image output;
+
+ // Tile
+ int h1Begin;
+ int w1Begin;
+ int h2Begin;
+ int w2Begin;
+ int H;
+ int W;
+
+ std::shared_ptr<TransferFunction> transferFunc;
+
+ public:
+ OutputReorderNode(const std::shared_ptr<memory>& src,
+ const Image& output,
+ const std::shared_ptr<TransferFunction>& transferFunc)
+ : src(src),
+ output(output),
+ h1Begin(0), w1Begin(0),
+ h2Begin(0), w2Begin(0),
+ H(output.height), W(output.width),
+ transferFunc(transferFunc)
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ MAYBE_UNUSED(srcDesc);
+ assert(memory_desc_matches_tag(srcDesc, mkldnn_format_tag_t(BlockedFormat<K>::nChwKc)));
+ assert(srcDesc.ndims == 4);
+ assert(srcDesc.data_type == memory::data_type::f32);
+ assert(srcDesc.dims[0] == 1);
+      // We assume the output has at most K channels, so the padded channel count is exactly K
+ assert(srcDesc.dims[1] == K);
+
+ srcPtr = (float*)src->get_data_handle();
+ H1 = srcDesc.dims[2];
+ W1 = srcDesc.dims[3];
+ }
+
+ void setTile(int h1, int w1, int h2, int w2, int H, int W) override
+ {
+ h1Begin = h1;
+ w1Begin = w1;
+ h2Begin = h2;
+ w2Begin = w2;
+ this->H = H;
+ this->W = W;
+ }
+
+ void execute(stream& sm) override
+ {
+ assert(h1Begin + H <= H1);
+ assert(w1Begin + W <= W1);
+ assert(h2Begin + H <= output.height);
+ assert(w2Begin + W <= output.width);
+
+ const int C1 = K;
+
+ parallel_nd(H, [&](int h)
+ {
+ const int h1 = h + h1Begin;
+ const int h2 = h + h2Begin;
+
+ for (int w = 0; w < W; ++w)
+ {
+ const int w1 = w + w1Begin;
+ const int w2 = w + w2Begin;
+ float* dstPtr_C = (float*)output.get(h2, w2);
+
+          // Source is in nChwKc format; since C1 == K there is a single channel block, so the layout is effectively nhwc
+ const float* srcPtr_C = srcPtr + h1*W1*C1 + w1*C1;
+
+ #pragma unroll
+ for (int i = 0; i < 3; ++i)
+ {
+ // Load the value
+ float x = srcPtr_C[i];
+
+ // The CNN output may contain negative values or even NaNs, so it must be sanitized
+ x = maxSafe(x, 0.f);
+
+ // Apply the inverse transfer function
+ x = transferFunc->inverse(x);
+
+ // Sanitize and store the final value
+ dstPtr_C[i] = max(x, 0.f);
+ }
+ }
+ });
+ }
+ };
+
+} // namespace oidn
diff --git a/thirdparty/oidn/core/transfer_function.cpp b/thirdparty/oidn/core/transfer_function.cpp
new file mode 100644
index 0000000000..a33e3c84bc
--- /dev/null
+++ b/thirdparty/oidn/core/transfer_function.cpp
@@ -0,0 +1,95 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#include "transfer_function.h"
+
+namespace oidn {
+
+ const float LogTransferFunction::xScale = 1.f / log(LogTransferFunction::yMax + 1.f);
+ const float PQXTransferFunction::xScale = 1.f / PQXTransferFunction::pqxForward(PQXTransferFunction::yMax * PQXTransferFunction::yScale);
+
+ float AutoexposureNode::autoexposure(const Image& color)
+ {
+ assert(color.format == Format::Float3);
+ return 1.0f;
+
+ /*constexpr float key = 0.18f;
+ constexpr float eps = 1e-8f;
+ constexpr int K = 16; // downsampling amount
+
+ // Downsample the image to minimize sensitivity to noise
+ const int H = color.height; // original height
+ const int W = color.width; // original width
+ const int HK = (H + K/2) / K; // downsampled height
+ const int WK = (W + K/2) / K; // downsampled width
+
+ // Compute the average log luminance of the downsampled image
+ using Sum = std::pair<float, int>;
+
+ Sum sum =
+ tbb::parallel_reduce(
+ tbb::blocked_range2d<int>(0, HK, 0, WK),
+ Sum(0.f, 0),
+ [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum
+ {
+ // Iterate over blocks
+ for (int i = r.rows().begin(); i != r.rows().end(); ++i)
+ {
+ for (int j = r.cols().begin(); j != r.cols().end(); ++j)
+ {
+ // Compute the average luminance in the current block
+ const int beginH = int(ptrdiff_t(i) * H / HK);
+ const int beginW = int(ptrdiff_t(j) * W / WK);
+ const int endH = int(ptrdiff_t(i+1) * H / HK);
+ const int endW = int(ptrdiff_t(j+1) * W / WK);
+
+ float L = 0.f;
+
+ for (int h = beginH; h < endH; ++h)
+ {
+ for (int w = beginW; w < endW; ++w)
+ {
+ const float* rgb = (const float*)color.get(h, w);
+
+ const float r = maxSafe(rgb[0], 0.f);
+ const float g = maxSafe(rgb[1], 0.f);
+ const float b = maxSafe(rgb[2], 0.f);
+
+ L += luminance(r, g, b);
+ }
+ }
+
+ L /= (endH - beginH) * (endW - beginW);
+
+ // Accumulate the log luminance
+ if (L > eps)
+ {
+ sum.first += log2(L);
+ sum.second++;
+ }
+ }
+ }
+
+ return sum;
+ },
+ [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); },
+ tbb::static_partitioner()
+ );
+
+ return (sum.second > 0) ? (key / exp2(sum.first / float(sum.second))) : 1.f;*/
+ }
+
+} // namespace oidn
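The disabled code above documents the intended auto-exposure: downsample the image, average the log2 luminance of the bright-enough blocks, and scale so that the average maps to a mid-grey key of 0.18. A standalone sketch of the final formula, exposure = key / 2^(mean log2 luminance), on invented block luminances:

#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const float key = 0.18f;
    const float eps = 1e-8f;

    // Made-up average luminances of downsampled blocks.
    const std::vector<float> L = {0.02f, 0.5f, 4.f, 30.f};

    float sum = 0.f;
    int count = 0;
    for (float l : L)
        if (l > eps) { sum += std::log2(l); ++count; }

    const float exposure = (count > 0) ? key / std::exp2(sum / count) : 1.f;
    std::printf("exposure = %f\n", exposure);
    return 0;
}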
diff --git a/thirdparty/oidn/core/transfer_function.h b/thirdparty/oidn/core/transfer_function.h
new file mode 100644
index 0000000000..35f2833092
--- /dev/null
+++ b/thirdparty/oidn/core/transfer_function.h
@@ -0,0 +1,201 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "image.h"
+#include "node.h"
+
+namespace oidn {
+
+ __forceinline float luminance(float r, float g, float b)
+ {
+ return 0.212671f * r + 0.715160f * g + 0.072169f * b;
+ }
+
+ // Color transfer function base class
+ class TransferFunction
+ {
+ public:
+ virtual ~TransferFunction() = default;
+
+ virtual float forward(float y) const = 0;
+ virtual float inverse(float x) const = 0;
+ };
+
+ // HDR transfer function base class
+ class HDRTransferFunction : public TransferFunction
+ {
+ protected:
+ static constexpr float yMax = 65504.f;
+
+ float exposure;
+ float rcpExposure;
+
+ public:
+ HDRTransferFunction(float exposure = 1.f)
+ {
+ setExposure(exposure);
+ }
+
+ void setExposure(float exposure)
+ {
+ this->exposure = exposure;
+ this->rcpExposure = (exposure != 0.f) ? (1.f / exposure) : 0.f;
+ }
+ };
+
+ // Linear transfer function (LDR)
+ class LinearTransferFunction : public TransferFunction
+ {
+ public:
+ __forceinline float forward(float y) const override
+ {
+ return min(y, 1.f);
+ }
+
+ __forceinline float inverse(float x) const override
+ {
+ return min(x, 1.f);
+ }
+ };
+
+ // 2.2 gamma transfer function (LDR)
+ class GammaTransferFunction : public TransferFunction
+ {
+ public:
+ __forceinline float forward(float y) const override
+ {
+ return min(pow(y, 1.f/2.2f), 1.f);
+ }
+
+ __forceinline float inverse(float x) const override
+ {
+ return min(pow(x, 2.2f), 1.f);
+ }
+ };
+
+ // Logarithmic transfer function (HDR)
+ // Compresses [0..65504] to [0..1]
+ class LogTransferFunction : public HDRTransferFunction
+ {
+ private:
+ static const float xScale;
+
+ public:
+ LogTransferFunction(float exposure = 1.f)
+ : HDRTransferFunction(exposure)
+ {
+ }
+
+ __forceinline float forward(float y) const override
+ {
+ return log(y * exposure + 1.f) * xScale;
+ }
+
+ __forceinline float inverse(float x) const override
+ {
+ return (exp(x * (1.f/xScale)) - 1.f) * rcpExposure;
+ }
+ };
+
+ // PQX transfer function (HDR)
+ // Compresses [0..65504] to [0..1]
+ class PQXTransferFunction : public HDRTransferFunction
+ {
+ private:
+ static constexpr float m1 = 2610.f / 4096.f / 4.f;
+ static constexpr float m2 = 2523.f / 4096.f * 128.f;
+ static constexpr float c1 = 3424.f / 4096.f;
+ static constexpr float c2 = 2413.f / 4096.f * 32.f;
+ static constexpr float c3 = 2392.f / 4096.f * 32.f;
+ static constexpr float a = 3711.f / 4096.f / 8.f;
+
+ static constexpr float yScale = 100.f / 10000.f;
+ static const float xScale;
+
+ public:
+ PQXTransferFunction(float exposure = 1.f)
+ : HDRTransferFunction(exposure)
+ {
+ }
+
+ __forceinline float forward(float y) const override
+ {
+ return pqxForward(y * exposure * yScale) * xScale;
+ }
+
+ __forceinline float inverse(float x) const override
+ {
+ return pqxInverse(x * (1.f/xScale)) * (1.f/yScale) * rcpExposure;
+ }
+
+ private:
+ static __forceinline float pqForward(float y)
+ {
+ const float yp = pow(y, m1);
+ return pow((c1 + c2 * yp) * rcp(1.f + c3 * yp), m2);
+ }
+
+ static __forceinline float pqxForward(float y)
+ {
+ if (y <= 1.f)
+ return pqForward(y);
+ else
+ return a * log(y) + 1.f;
+ }
+
+ static __forceinline float pqInverse(float x)
+ {
+ const float xp = pow(x, 1.f/m2);
+ return pow(max((xp - c1) * rcp(c2 - c3 * xp), 0.f), 1.f/m1);
+ }
+
+ static __forceinline float pqxInverse(float x)
+ {
+ if (x <= 1.f)
+ return pqInverse(x);
+ else
+ return exp((x - 1.f) * (1.f/a));
+ }
+ };
+
+ // Autoexposure node
+ class AutoexposureNode : public Node
+ {
+ private:
+ Image color;
+ std::shared_ptr<HDRTransferFunction> transferFunc;
+
+ public:
+ AutoexposureNode(const Image& color,
+ const std::shared_ptr<HDRTransferFunction>& transferFunc)
+ : color(color),
+ transferFunc(transferFunc)
+ {}
+
+ void execute(stream& sm) override
+ {
+ const float exposure = autoexposure(color);
+ //printf("exposure = %f\n", exposure);
+ transferFunc->setExposure(exposure);
+ }
+
+ private:
+ static float autoexposure(const Image& color);
+ };
+
+} // namespace oidn
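LogTransferFunction above compresses [0..65504] into [0..1] with a scaled log and undoes it with the matching exp. A standalone round-trip check of those two formulas with the exposure fixed to 1 (the sample values are arbitrary):

#include <cassert>
#include <cmath>

int main()
{
    const float yMax   = 65504.f;                   // largest half-precision value, as above
    const float xScale = 1.f / std::log(yMax + 1.f);

    auto forward = [&](float y) { return std::log(y + 1.f) * xScale; };
    auto inverse = [&](float x) { return std::exp(x / xScale) - 1.f; };

    for (float y : {0.f, 0.5f, 10.f, 1000.f, yMax})
    {
        const float x = forward(y);
        assert(x >= 0.f && x <= 1.0001f);                        // compressed to (roughly) [0, 1]
        assert(std::fabs(inverse(x) - y) <= 1e-3f * (y + 1.f));  // round-trips within float error
    }
    return 0;
}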
diff --git a/thirdparty/oidn/core/upsample.h b/thirdparty/oidn/core/upsample.h
new file mode 100644
index 0000000000..f6cace44cd
--- /dev/null
+++ b/thirdparty/oidn/core/upsample.h
@@ -0,0 +1,92 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "node.h"
+
+namespace oidn {
+
+ // 2x2 nearest-neighbor upsampling node
+ template<int K>
+ class UpsampleNode : public Node
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> dst;
+
+ public:
+ UpsampleNode(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& dst)
+ : src(src),
+ dst(dst)
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ const mkldnn_memory_desc_t& dstDesc = dst->get_desc().data;
+ MAYBE_UNUSED(srcDesc);
+ MAYBE_UNUSED(dstDesc);
+ assert(memory_desc_matches_tag(srcDesc, mkldnn_format_tag_t(BlockedFormat<K>::nChwKc)));
+ assert(memory_desc_matches_tag(dstDesc, mkldnn_format_tag_t(BlockedFormat<K>::nChwKc)));
+ assert(srcDesc.ndims == 4);
+ assert(dstDesc.ndims == 4);
+ assert(srcDesc.data_type == memory::data_type::f32);
+ assert(dstDesc.data_type == memory::data_type::f32);
+ assert(srcDesc.dims[0] == 1);
+ assert(dstDesc.dims[0] == 1);
+ // 2x2 upsampling
+ assert(dstDesc.dims[2] == srcDesc.dims[2] * 2);
+ assert(dstDesc.dims[3] == srcDesc.dims[3] * 2);
+ }
+
+ void execute(stream& sm) override
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+
+ const float* srcPtr = (float*)src->get_data_handle();
+ float* dstPtr = (float*)dst->get_data_handle();
+
+ const int C = srcDesc.dims[1];
+ const int H = srcDesc.dims[2];
+ const int W = srcDesc.dims[3];
+ const int CK = C / K;
+
+ parallel_nd(CK, H, [&](int ck, int h)
+ {
+ const size_t offset = ck*H*W*K + h*W*K;
+ const float* srcPtr_line = srcPtr + offset;
+ float* dstPtr_line0 = dstPtr + offset * 4;
+ float* dstPtr_line1 = dstPtr_line0 + W*2*K; // next line
+
+ for (int w = 0; w < W; ++w)
+ {
+ #pragma unroll
+ for (int k = 0; k < K; k += 4)
+ {
+ const __m128 m = _mm_load_ps(&srcPtr_line[w*K + k]);
+
+ _mm_stream_ps(&dstPtr_line0[w*2*K + k], m);
+ _mm_stream_ps(&dstPtr_line0[w*2*K+K + k], m);
+ _mm_stream_ps(&dstPtr_line1[w*2*K + k], m);
+ _mm_stream_ps(&dstPtr_line1[w*2*K+K + k], m);
+ }
+ }
+ });
+ }
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+} // namespace oidn
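UpsampleNode above replicates each source pixel into a 2x2 block of the destination, streaming K channels at a time with SSE. A scalar standalone sketch of the same mapping on one K-channel block in the blocked layout (the sizes are toy values):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const int K = 8, H = 2, W = 3;
    std::vector<float> src(size_t(H) * W * K), dst(size_t(H) * 2 * W * 2 * K);

    for (size_t i = 0; i < src.size(); ++i)
        src[i] = float(i); // arbitrary test pattern

    for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
            for (int k = 0; k < K; ++k)
            {
                const float v = src[(size_t(h) * W + w) * K + k];
                // Each source pixel feeds a 2x2 block of destination pixels.
                for (int dy = 0; dy < 2; ++dy)
                    for (int dx = 0; dx < 2; ++dx)
                        dst[((size_t(h) * 2 + dy) * (W * 2) + (w * 2 + dx)) * K + k] = v;
            }

    // Spot-check: destination pixel (3, 5) comes from source pixel (1, 2).
    assert(dst[(size_t(3) * (W * 2) + 5) * K] == src[(size_t(1) * W + 2) * K]);
    return 0;
}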
diff --git a/thirdparty/oidn/core/weights_reorder.h b/thirdparty/oidn/core/weights_reorder.h
new file mode 100644
index 0000000000..6c5dacb8aa
--- /dev/null
+++ b/thirdparty/oidn/core/weights_reorder.h
@@ -0,0 +1,99 @@
+// ======================================================================== //
+// Copyright 2009-2019 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "node.h"
+
+namespace oidn {
+
+ // Reorders weights from oihw to padded oihw format
+ template<int K>
+ class WeightsReorderNode : public Node
+ {
+ private:
+ std::shared_ptr<memory> src;
+ std::shared_ptr<memory> dst;
+
+ public:
+ WeightsReorderNode(const std::shared_ptr<memory>& src,
+ const std::shared_ptr<memory>& dst)
+ : src(src),
+ dst(dst)
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ const mkldnn_memory_desc_t& dstDesc = dst->get_desc().data;
+ MAYBE_UNUSED(srcDesc);
+ MAYBE_UNUSED(dstDesc);
+ assert(memory_desc_matches_tag(srcDesc, mkldnn_format_tag_t(memory::format_tag::oihw)));
+ assert(memory_desc_matches_tag(dstDesc, mkldnn_format_tag_t(memory::format_tag::oihw)));
+ assert(srcDesc.ndims == 4);
+ assert(dstDesc.ndims == 4);
+ assert(srcDesc.data_type == memory::data_type::f32);
+ assert(dstDesc.data_type == memory::data_type::f32);
+ assert(getPadded<K>(srcDesc.dims[0]) == dstDesc.dims[0]); // OC
+ assert(getPadded<K>(srcDesc.dims[1]) == dstDesc.dims[1]); // IC
+ assert(srcDesc.dims[2] == dstDesc.dims[2]);
+ assert(srcDesc.dims[3] == dstDesc.dims[3]);
+ }
+
+ void execute(stream& sm) override
+ {
+ const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
+ const mkldnn_memory_desc_t& dstDesc = dst->get_desc().data;
+
+ const float* srcPtr = (float*)src->get_data_handle();
+ float* dstPtr = (float*)dst->get_data_handle();
+
+ const int OC1 = srcDesc.dims[0];
+ const int OC2 = dstDesc.dims[0];
+ const int IC1 = srcDesc.dims[1];
+ const int IC2 = dstDesc.dims[1];
+ const int H = dstDesc.dims[2];
+ const int W = dstDesc.dims[3];
+
+ for (int oc = 0; oc < OC2; ++oc)
+ {
+ for (int ic = 0; ic < IC2; ++ic)
+ {
+ for (int h = 0; h < H; ++h)
+ {
+ for (int w = 0; w < W; ++w)
+ {
+ // Output is in oihw format
+ float* dstPtr_c = dstPtr + oc*IC2*H*W + ic*H*W + h*W + w;
+
+ if (oc < OC1 && ic < IC1)
+ {
+ // Input is in oihw format
+ const float* srcPtr_c = srcPtr + oc*IC1*H*W + ic*H*W + h*W + w;
+ *dstPtr_c = *srcPtr_c;
+ }
+ else
+ {
+ // padding
+ *dstPtr_c = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ std::shared_ptr<memory> getDst() const override { return dst; }
+ };
+
+} // namespace oidn