summaryrefslogtreecommitdiff
path: root/thirdparty/cvtt/ConvectionKernels_IndexSelector.h
diff options
context:
space:
mode:
authorK. S. Ernest (iFire) Lee <ernest.lee@chibifire.com>2022-01-18 04:39:55 -0800
committerK. S. Ernest (iFire) Lee <ernest.lee@chibifire.com>2022-02-04 15:15:26 -0800
commit419b342a9a716426159f7a51ae17390386ecc884 (patch)
treec42a9c3a41e94fa501ca09d0605ee44b88c3c9ef /thirdparty/cvtt/ConvectionKernels_IndexSelector.h
parent992794e44a47a7adddce589bd68ad71402d5ba66 (diff)
Faster CVTT by reducing quality.
Make BC6 and BC7 CVTT faster while still having better quality than DXT5.
Diffstat (limited to 'thirdparty/cvtt/ConvectionKernels_IndexSelector.h')
-rw-r--r--thirdparty/cvtt/ConvectionKernels_IndexSelector.h147
1 files changed, 147 insertions, 0 deletions
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelector.h b/thirdparty/cvtt/ConvectionKernels_IndexSelector.h
new file mode 100644
index 0000000000..0f9d209183
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelector.h
@@ -0,0 +1,147 @@
+#pragma once
+#ifndef __CVTT_INDEXSELECTOR_H__
+#define __CVTT_INDEXSELECTOR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+ namespace Internal
+ {
+ extern const ParallelMath::UInt16 g_weightReciprocals[17];
+
+ template<int TVectorSize>
+ class IndexSelector
+ {
+ public:
+ typedef ParallelMath::Float MFloat;
+ typedef ParallelMath::UInt16 MUInt16;
+ typedef ParallelMath::UInt15 MUInt15;
+ typedef ParallelMath::SInt16 MSInt16;
+ typedef ParallelMath::AInt16 MAInt16;
+ typedef ParallelMath::SInt32 MSInt32;
+ typedef ParallelMath::UInt31 MUInt31;
+
+
+ template<class TInterpolationEPType, class TColorEPType>
+ void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)
+ {
+ // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.
+ // We need to select indexes using the color-space endpoints.
+
+ m_isUniform = true;
+ for (int ch = 1; ch < TVectorSize; ch++)
+ {
+ if (channelWeights[ch] != channelWeights[0])
+ m_isUniform = false;
+ }
+
+ // To work with channel weights, we need something where:
+ // pxDiff = px - ep[0]
+ // epDiff = ep[1] - ep[0]
+ //
+ // weightedEPDiff = epDiff * channelWeights
+ // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
+ // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
+ // index = normalizedIndex * maxValue
+ //
+ // Equivalent to:
+ // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
+ // index = dot(axis, pxDiff)
+
+ for (int ep = 0; ep < 2; ep++)
+ for (int ch = 0; ch < TVectorSize; ch++)
+ m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);
+
+ m_range = range;
+ m_maxValue = static_cast<float>(range - 1);
+
+ MFloat epDiffWeighted[TVectorSize];
+ for (int ch = 0; ch < TVectorSize; ch++)
+ {
+ m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);
+ MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
+ epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
+ }
+
+ MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
+ for (int ch = 1; ch < TVectorSize; ch++)
+ lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];
+
+ ParallelMath::MakeSafeDenominator(lenSquared);
+
+ MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;
+
+ for (int ch = 0; ch < TVectorSize; ch++)
+ m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;
+ }
+
+ template<bool TSigned>
+ void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
+ {
+ MAInt16 converted[2][TVectorSize];
+ for (int epi = 0; epi < 2; epi++)
+ for (int ch = 0; ch < TVectorSize; ch++)
+ converted[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(endPoints[epi][ch]);
+
+ Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
+ }
+
+ void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
+ {
+ MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
+
+ for (int ch = 0; ch < numRealChannels; ch++)
+ {
+ MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
+ MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
+ pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));
+ }
+ }
+
+ void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
+ {
+ MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));
+
+ for (int ch = 0; ch < numRealChannels; ch++)
+ {
+ MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
+ MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
+ pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));
+ }
+ }
+
+ void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)
+ {
+ ReconstructLDR_BC7(index, pixel, TVectorSize);
+ }
+
+ void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)
+ {
+ ReconstructLDRPrecise(index, pixel, TVectorSize);
+ }
+
+ MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
+ {
+ MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];
+ for (int ch = 1; ch < TVectorSize; ch++)
+ dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];
+
+ return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);
+ }
+
+ protected:
+ MAInt16 m_endPoint[2][TVectorSize];
+
+ private:
+ MFloat m_origin[TVectorSize];
+ MFloat m_axis[TVectorSize];
+ int m_range;
+ float m_maxValue;
+ bool m_isUniform;
+ };
+ }
+}
+
+#endif
+