13 files changed, 250 insertions, 167 deletions
diff --git a/thirdparty/libwebp/src/enc/alpha_enc.c b/thirdparty/libwebp/src/enc/alpha_enc.c
index 7e8d87f22e..dce9ca957d 100644
--- a/thirdparty/libwebp/src/enc/alpha_enc.c
+++ b/thirdparty/libwebp/src/enc/alpha_enc.c
@@ -361,7 +361,8 @@ static int EncodeAlpha(VP8Encoder* const enc,
 //------------------------------------------------------------------------------
 // Main calls
 
-static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
+static int CompressAlphaJob(void* arg1, void* dummy) {
+  VP8Encoder* const enc = (VP8Encoder*)arg1;
   const WebPConfig* config = enc->config_;
   uint8_t* alpha_data = NULL;
   size_t alpha_size = 0;
@@ -394,7 +395,7 @@ void VP8EncInitAlpha(VP8Encoder* const enc) {
     WebPGetWorkerInterface()->Init(worker);
     worker->data1 = enc;
     worker->data2 = NULL;
-    worker->hook = (WebPWorkerHook)CompressAlphaJob;
+    worker->hook = CompressAlphaJob;
   }
 }
 
diff --git a/thirdparty/libwebp/src/enc/analysis_enc.c b/thirdparty/libwebp/src/enc/analysis_enc.c
index 08f471f5f8..a47ff7d4e8 100644
--- a/thirdparty/libwebp/src/enc/analysis_enc.c
+++ b/thirdparty/libwebp/src/enc/analysis_enc.c
@@ -434,7 +434,9 @@ typedef struct {
 } SegmentJob;
 
 // main work call
-static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
+static int DoSegmentsJob(void* arg1, void* arg2) {
+  SegmentJob* const job = (SegmentJob*)arg1;
+  VP8EncIterator* const it = (VP8EncIterator*)arg2;
   int ok = 1;
   if (!VP8IteratorIsDone(it)) {
     uint8_t tmp[32 + WEBP_ALIGN_CST];
@@ -462,7 +464,7 @@ static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
   WebPGetWorkerInterface()->Init(&job->worker);
   job->worker.data1 = job;
   job->worker.data2 = &job->it;
-  job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
+  job->worker.hook = DoSegmentsJob;
   VP8IteratorInit(enc, &job->it);
   VP8IteratorSetRow(&job->it, start_row);
   VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
diff --git a/thirdparty/libwebp/src/enc/frame_enc.c b/thirdparty/libwebp/src/enc/frame_enc.c
index 2b0dc66410..1aec376e44 100644
--- a/thirdparty/libwebp/src/enc/frame_enc.c
+++ b/thirdparty/libwebp/src/enc/frame_enc.c
@@ -198,7 +198,7 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
 
   for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
     const VP8MBInfo* const mb = &enc->mb_info_[n];
-    p[mb->segment_]++;
+    ++p[mb->segment_];
   }
 #if !defined(WEBP_DISABLE_STATS)
   if (enc->pic_->stats != NULL) {
@@ -520,6 +520,14 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }
 
+static void ResetSideInfo(const VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  WebPPicture* const pic = enc->pic_;
+  if (pic->stats != NULL) {
+    memset(enc->block_count_, 0, sizeof(enc->block_count_));
+  }
+  ResetSSE(enc);
+}
 #else  // defined(WEBP_DISABLE_STATS)
 static void ResetSSE(VP8Encoder* const enc) {
   (void)enc;
@@ -528,10 +536,16 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
   WebPPicture* const pic = enc->pic_;
   if (pic->extra_info != NULL) {
-    memset(pic->extra_info, 0,
-           enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+    if (it->x_ == 0 && it->y_ == 0) {   // only do it once, at start
+      memset(pic->extra_info, 0,
+             enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+    }
   }
 }
+
+static void ResetSideInfo(const VP8EncIterator* const it) {
+  (void)it;
+}
 #endif  // !defined(WEBP_DISABLE_STATS)
 
 static double GetPSNR(uint64_t mse, uint64_t size) {
@@ -570,7 +584,7 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
     VP8IteratorImport(&it, NULL);
     if (VP8Decimate(&it, &info, rd_opt)) {
       // Just record the number of skips and act like skip_proba is not used.
-      enc->proba_.nb_skip_++;
+      ++enc->proba_.nb_skip_;
     }
     RecordResiduals(&it, &info);
     size += info.R + info.H;
@@ -841,6 +855,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
       ++num_pass_left;
       enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
+      if (is_last_pass) {
+        ResetSideInfo(&it);
+      }
       continue;                        // ...and start over
     }
     if (is_last_pass) {
@@ -871,4 +888,3 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
 #endif    // DISABLE_TOKEN_BUFFER
 
 //------------------------------------------------------------------------------
-
diff --git a/thirdparty/libwebp/src/enc/histogram_enc.c b/thirdparty/libwebp/src/enc/histogram_enc.c
index 056a972dda..9fdbc627a1 100644
--- a/thirdparty/libwebp/src/enc/histogram_enc.c
+++ b/thirdparty/libwebp/src/enc/histogram_enc.c
@@ -200,14 +200,9 @@ static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
   }
 }
 
-double VP8LBitsEntropy(const uint32_t* const array, int n,
-                       uint32_t* const trivial_symbol) {
+double VP8LBitsEntropy(const uint32_t* const array, int n) {
   VP8LBitEntropy entropy;
   VP8LBitsEntropyUnrefined(array, n, &entropy);
-  if (trivial_symbol != NULL) {
-    *trivial_symbol =
-        (entropy.nonzeros == 1) ? entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM;
-  }
 
   return BitsEntropyRefine(&entropy);
 }
@@ -605,7 +600,7 @@ static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
 }
 
 // Implement a Lehmer random number generator with a multiplicative constant of
-// 48271 and a modulo constant of 2^31 − 1.
+// 48271 and a modulo constant of 2^31 - 1.
 static uint32_t MyRand(uint32_t* const seed) {
   *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
   assert(*seed > 0);
@@ -1031,7 +1026,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
     }
   }
 
-  // TODO(vikasa): Optimize HistogramRemap for low-effort compression mode also.
+  // TODO(vrabaud): Optimize HistogramRemap for low-effort compression mode.
   // Find the optimal map from original histograms to the final ones.
   HistogramRemap(orig_histo, image_histo, histogram_symbols);
 
diff --git a/thirdparty/libwebp/src/enc/histogram_enc.h b/thirdparty/libwebp/src/enc/histogram_enc.h
index 15b1fbda34..e8c4c83f6f 100644
--- a/thirdparty/libwebp/src/enc/histogram_enc.h
+++ b/thirdparty/libwebp/src/enc/histogram_enc.h
@@ -109,10 +109,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              uint16_t* const histogram_symbols);
 
 // Returns the entropy for the symbols in the input array.
-// Also sets trivial_symbol to the code value, if the array has only one code
-// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM.
-double VP8LBitsEntropy(const uint32_t* const array, int n,
-                       uint32_t* const trivial_symbol);
+double VP8LBitsEntropy(const uint32_t* const array, int n);
 
 // Estimate how many bits the combined entropy of literals and distance
 // approximately maps to.
diff --git a/thirdparty/libwebp/src/enc/iterator_enc.c b/thirdparty/libwebp/src/enc/iterator_enc.c
index cfacfd2401..7c47d51272 100644
--- a/thirdparty/libwebp/src/enc/iterator_enc.c
+++ b/thirdparty/libwebp/src/enc/iterator_enc.c
@@ -26,6 +26,9 @@ static void InitLeft(VP8EncIterator* const it) {
   memset(it->u_left_, 129, 8);
   memset(it->v_left_, 129, 8);
   it->left_nz_[8] = 0;
+  if (it->top_derr_ != NULL) {
+    memset(&it->left_derr_, 0, sizeof(it->left_derr_));
+  }
 }
 
 static void InitTop(VP8EncIterator* const it) {
@@ -33,6 +36,9 @@ static void InitTop(VP8EncIterator* const it) {
   const size_t top_size = enc->mb_w_ * 16;
   memset(enc->y_top_, 127, 2 * top_size);
   memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
+  if (enc->top_derr_ != NULL) {
+    memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
+  }
 }
 
 void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
@@ -76,6 +82,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
   it->u_left_ = it->y_left_ + 16 + 16;
   it->v_left_ = it->u_left_ + 16;
+  it->top_derr_ = enc->top_derr_;
   VP8IteratorReset(it);
 }
 
@@ -450,4 +457,3 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
 }
 
 //------------------------------------------------------------------------------
-
diff --git a/thirdparty/libwebp/src/enc/near_lossless_enc.c b/thirdparty/libwebp/src/enc/near_lossless_enc.c
index cadd14c664..5517a7e271 100644
--- a/thirdparty/libwebp/src/enc/near_lossless_enc.c
+++ b/thirdparty/libwebp/src/enc/near_lossless_enc.c
@@ -146,6 +146,6 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
 
 // Define a stub to suppress compiler warnings.
 extern void VP8LNearLosslessStub(void);
-WEBP_TSAN_IGNORE_FUNCTION void VP8LNearLosslessStub(void) {}
+void VP8LNearLosslessStub(void) {}
 
 #endif  // (WEBP_NEAR_LOSSLESS == 1)
diff --git a/thirdparty/libwebp/src/enc/picture_csp_enc.c b/thirdparty/libwebp/src/enc/picture_csp_enc.c
index d531dd0282..02d9df76d5 100644
--- a/thirdparty/libwebp/src/enc/picture_csp_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_csp_enc.c
@@ -28,11 +28,11 @@
 // If defined, use table to compute x / alpha.
 #define USE_INVERSE_ALPHA_TABLE
 
-static const union {
-  uint32_t argb;
-  uint8_t  bytes[4];
-} test_endian = { 0xff000000u };
-#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
+#ifdef WORDS_BIGENDIAN
+#define ALPHA_OFFSET 0   // uint32_t 0xff000000 is 0xff,00,00,00 in memory
+#else
+#define ALPHA_OFFSET 3   // uint32_t 0xff000000 is 0x00,00,00,ff in memory
+#endif
 
 //------------------------------------------------------------------------------
 // Detection of non-trivial transparency
@@ -61,7 +61,7 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
     return CheckNonOpaque(picture->a, picture->width, picture->height,
                           1, picture->a_stride);
   } else {
-    const int alpha_offset = ALPHA_IS_LAST ? 3 : 0;
+    const int alpha_offset = ALPHA_OFFSET;
     return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
                           picture->width, picture->height,
                           4, picture->argb_stride * sizeof(*picture->argb));
@@ -126,7 +126,7 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
 
 #else
 
-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {}
+static void InitGammaTables(void) {}
 static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
 static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
   return (int)(base_value << shift);
@@ -170,29 +170,33 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 
 #if defined(USE_GAMMA_COMPRESSION)
 
-// float variant of gamma-correction
 // We use tables of different size and precision for the Rec709 / BT2020
 // transfer function.
 #define kGammaF (1./0.45)
-static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
-static float kLinearToGammaTabF[kGammaTabSize + 2];
-static volatile int kGammaTablesFOk = 0;
-
-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
-  if (!kGammaTablesFOk) {
+static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
+#define GAMMA_TO_LINEAR_BITS 14
+static uint32_t kGammaToLinearTabS[MAX_Y_T + 1];   // size scales with Y_FIX
+static volatile int kGammaTablesSOk = 0;
+
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) {
+  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
+  if (!kGammaTablesSOk) {
     int v;
     const double norm = 1. / MAX_Y_T;
     const double scale = 1. / kGammaTabSize;
     const double a = 0.09929682680944;
     const double thresh = 0.018053968510807;
+    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
     for (v = 0; v <= MAX_Y_T; ++v) {
       const double g = norm * v;
+      double value;
       if (g <= thresh * 4.5) {
-        kGammaToLinearTabF[v] = (float)(g / 4.5);
+        value = g / 4.5;
       } else {
         const double a_rec = 1. / (1. + a);
-        kGammaToLinearTabF[v] = (float)pow(a_rec * (g + a), kGammaF);
+        value = pow(a_rec * (g + a), kGammaF);
       }
+      kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
     }
     for (v = 0; v <= kGammaTabSize; ++v) {
       const double g = scale * v;
@@ -202,37 +206,44 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
       } else {
         value = (1. + a) * pow(g, 1. / kGammaF) - a;
       }
-      kLinearToGammaTabF[v] = (float)(MAX_Y_T * value);
+      // we already incorporate the 1/2 rounding constant here
+      kLinearToGammaTabS[v] =
+          (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
     }
     // to prevent small rounding errors to cause read-overflow:
-    kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
-    kGammaTablesFOk = 1;
+    kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
+    kGammaTablesSOk = 1;
   }
 }
 
-static WEBP_INLINE float GammaToLinearF(int v) {
-  return kGammaToLinearTabF[v];
+// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+  return kGammaToLinearTabS[v];
 }
 
-static WEBP_INLINE int LinearToGammaF(float value) {
-  const float v = value * kGammaTabSize;
-  const int tab_pos = (int)v;
-  const float x = v - (float)tab_pos;      // fractional part
-  const float v0 = kLinearToGammaTabF[tab_pos + 0];
-  const float v1 = kLinearToGammaTabF[tab_pos + 1];
-  const float y = v1 * x + v0 * (1.f - x);  // interpolate
-  return (int)(y + .5);
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+  // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
+  const uint32_t v = value * kGammaTabSize;
+  const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
+  // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
+  const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
+  // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
+  const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
+  const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
+  // Final interpolation. Note that rounding is already included.
+  const uint32_t v2 = (v1 - v0) * x;    // note: v1 >= v0.
+  const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
+  return result;
 }
 
 #else
 
-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {}
-static WEBP_INLINE float GammaToLinearF(int v) {
-  const float norm = 1.f / MAX_Y_T;
-  return norm * v;
+static void InitGammaTablesS(void) {}
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+  return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
 }
-static WEBP_INLINE int LinearToGammaF(float value) {
-  return (int)(MAX_Y_T * value + .5);
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+  return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
 }
 
 #endif    // USE_GAMMA_COMPRESSION
@@ -254,26 +265,22 @@ static int RGBToGray(int r, int g, int b) {
   return (luma >> YUV_FIX);
 }
 
-static float RGBToGrayF(float r, float g, float b) {
-  return (float)(0.2126 * r + 0.7152 * g + 0.0722 * b);
-}
-
-static int ScaleDown(int a, int b, int c, int d) {
-  const float A = GammaToLinearF(a);
-  const float B = GammaToLinearF(b);
-  const float C = GammaToLinearF(c);
-  const float D = GammaToLinearF(d);
-  return LinearToGammaF(0.25f * (A + B + C + D));
+static uint32_t ScaleDown(int a, int b, int c, int d) {
+  const uint32_t A = GammaToLinearS(a);
+  const uint32_t B = GammaToLinearS(b);
+  const uint32_t C = GammaToLinearS(c);
+  const uint32_t D = GammaToLinearS(d);
+  return LinearToGammaS((A + B + C + D + 2) >> 2);
 }
 
 static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
   int i;
   for (i = 0; i < w; ++i) {
-    const float R = GammaToLinearF(src[0 * w + i]);
-    const float G = GammaToLinearF(src[1 * w + i]);
-    const float B = GammaToLinearF(src[2 * w + i]);
-    const float Y = RGBToGrayF(R, G, B);
-    dst[i] = (fixed_y_t)LinearToGammaF(Y);
+    const uint32_t R = GammaToLinearS(src[0 * w + i]);
+    const uint32_t G = GammaToLinearS(src[1 * w + i]);
+    const uint32_t B = GammaToLinearS(src[2 * w + i]);
+    const uint32_t Y = RGBToGray(R, G, B);
+    dst[i] = (fixed_y_t)LinearToGammaS(Y);
   }
 }
 
@@ -863,7 +870,7 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
   }
 
   if (use_iterative_conversion) {
-    InitGammaTablesF();
+    InitGammaTablesS();
     if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
       return 0;
     }
@@ -990,10 +997,10 @@ static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   } else {
     const uint8_t* const argb = (const uint8_t*)picture->argb;
-    const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;
-    const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2;
-    const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
-    const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
+    const uint8_t* const a = argb + (0 ^ ALPHA_OFFSET);
+    const uint8_t* const r = argb + (1 ^ ALPHA_OFFSET);
+    const uint8_t* const g = argb + (2 ^ ALPHA_OFFSET);
+    const uint8_t* const b = argb + (3 ^ ALPHA_OFFSET);
 
     picture->colorspace = WEBP_YUV420;
     return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
@@ -1044,7 +1051,8 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     const int argb_stride = 4 * picture->argb_stride;
     uint8_t* dst = (uint8_t*)picture->argb;
     const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
-    WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
+    WebPUpsampleLinePairFunc upsample =
+        WebPGetLinePairConverter(ALPHA_OFFSET > 0);
 
     // First row, with replicated top samples.
     upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
@@ -1087,6 +1095,7 @@ static int Import(WebPPicture* const picture,
                   const uint8_t* rgb, int rgb_stride,
                   int step, int swap_rb, int import_alpha) {
   int y;
+  // swap_rb -> b,g,r,a , !swap_rb -> r,g,b,a
   const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
   const uint8_t* g_ptr = rgb + 1;
   const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
@@ -1104,19 +1113,32 @@ static int Import(WebPPicture* const picture,
   WebPInitAlphaProcessing();
 
   if (import_alpha) {
+    // dst[] byte order is {a,r,g,b} for big-endian, {b,g,r,a} for little endian
     uint32_t* dst = picture->argb;
-    const int do_copy =
-        (!swap_rb && !ALPHA_IS_LAST) || (swap_rb && ALPHA_IS_LAST);
+    const int do_copy = (ALPHA_OFFSET == 3) && swap_rb;
     assert(step == 4);
-    for (y = 0; y < height; ++y) {
-      if (do_copy) {
+    if (do_copy) {
+      for (y = 0; y < height; ++y) {
         memcpy(dst, rgb, width * 4);
-      } else {
+        rgb += rgb_stride;
+        dst += picture->argb_stride;
+      }
+    } else {
+      for (y = 0; y < height; ++y) {
+#ifdef WORDS_BIGENDIAN
+        // BGRA or RGBA input order.
+        const uint8_t* a_ptr = rgb + 3;
+        WebPPackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+        r_ptr += rgb_stride;
+        g_ptr += rgb_stride;
+        b_ptr += rgb_stride;
+#else
         // RGBA input order. Need to swap R and B.
         VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
+#endif
+        rgb += rgb_stride;
+        dst += picture->argb_stride;
       }
-      rgb += rgb_stride;
-      dst += picture->argb_stride;
     }
   } else {
     uint32_t* dst = picture->argb;
diff --git a/thirdparty/libwebp/src/enc/picture_psnr_enc.c b/thirdparty/libwebp/src/enc/picture_psnr_enc.c
index 362a7c79be..1a2f0bef3e 100644
--- a/thirdparty/libwebp/src/enc/picture_psnr_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_psnr_enc.c
@@ -18,6 +18,7 @@
 #include <math.h>
 #include <stdlib.h>
 
+#include "src/dsp/dsp.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/utils/utils.h"
 
@@ -169,6 +170,12 @@ int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
   return 1;
 }
 
+#ifdef WORDS_BIGENDIAN
+#define BLUE_OFFSET 3   // uint32_t 0x000000ff is 0x00,00,00,ff in memory
+#else
+#define BLUE_OFFSET 0   // uint32_t 0x000000ff is 0xff,00,00,00 in memory
+#endif
+
 int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
                           int type, float results[5]) {
   int w, h, c;
@@ -195,8 +202,10 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
     float distortion;
     const size_t stride0 = 4 * (size_t)p0.argb_stride;
     const size_t stride1 = 4 * (size_t)p1.argb_stride;
-    if (!WebPPlaneDistortion((const uint8_t*)p0.argb + c, stride0,
-                             (const uint8_t*)p1.argb + c, stride1,
+    // results are reported as BGRA
+    const int offset = c ^ BLUE_OFFSET;
+    if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0,
+                             (const uint8_t*)p1.argb + offset, stride1,
                              w, h, 4, type, &distortion, results + c)) {
       goto Error;
     }
@@ -214,6 +223,8 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
   return ok;
 }
 
+#undef BLUE_OFFSET
+
 #else  // defined(WEBP_DISABLE_STATS)
 int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
                         const uint8_t* ref, size_t ref_stride,
diff --git a/thirdparty/libwebp/src/enc/quant_enc.c b/thirdparty/libwebp/src/enc/quant_enc.c
index 3b1a3129b5..35bfaf21ef 100644
--- a/thirdparty/libwebp/src/enc/quant_enc.c
+++ b/thirdparty/libwebp/src/enc/quant_enc.c
@@ -826,6 +826,85 @@ static int ReconstructIntra4(VP8EncIterator* const it,
   return nz;
 }
 
+//------------------------------------------------------------------------------
+// DC-error diffusion
+
+// Diffusion weights. We under-correct a bit (15/16th of the error is actually
+// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
+#define C1 7    // fraction of error sent to the 4x4 block below
+#define C2 8    // fraction of error sent to the 4x4 block on the right
+#define DSHIFT 4
+#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
+
+// Quantize as usual, but also compute and return the quantization error.
+// Error is already divided by DSHIFT.
+static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+  int V = *v;
+  const int sign = (V < 0);
+  if (sign) V = -V;
+  if (V > (int)mtx->zthresh_[0]) {
+    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
+    const int err = (V - qV);
+    *v = sign ? -qV : qV;
+    return (sign ? -err : err) >> DSCALE;
+  }
+  *v = 0;
+  return (sign ? -V : V) >> DSCALE;
+}
+
+static void CorrectDCValues(const VP8EncIterator* const it,
+                            const VP8Matrix* const mtx,
+                            int16_t tmp[][16], VP8ModeScore* const rd) {
+  //         | top[0] | top[1]
+  // --------+--------+---------
+  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
+  // left[1] | tmp[2]   tmp[3]        err2 err3
+  //
+  // Final errors {err1,err2,err3} are preserved and later restored
+  // as top[]/left[] on the next block.
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {
+    const int8_t* const top = it->top_derr_[it->x_][ch];
+    const int8_t* const left = it->left_derr_[ch];
+    int16_t (* const c)[16] = &tmp[ch * 4];
+    int err0, err1, err2, err3;
+    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
+    err0 = QuantizeSingle(&c[0][0], mtx);
+    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
+    err1 = QuantizeSingle(&c[1][0], mtx);
+    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
+    err2 = QuantizeSingle(&c[2][0], mtx);
+    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
+    err3 = QuantizeSingle(&c[3][0], mtx);
+    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
+    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
+    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
+    rd->derr[ch][0] = (int8_t)err1;
+    rd->derr[ch][1] = (int8_t)err2;
+    rd->derr[ch][2] = (int8_t)err3;
+  }
+}
+
+static void StoreDiffusionErrors(VP8EncIterator* const it,
+                                 const VP8ModeScore* const rd) {
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {
+    int8_t* const top = it->top_derr_[it->x_][ch];
+    int8_t* const left = it->left_derr_[ch];
+    left[0] = rd->derr[ch][0];            // restore err1
+    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
+    top[0]  = rd->derr[ch][1];            //     ... err2
+    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
+  }
+}
+
+#undef C1
+#undef C2
+#undef DSHIFT
+#undef DSCALE
+
+//------------------------------------------------------------------------------
+
 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                          uint8_t* const yuv_out, int mode) {
   const VP8Encoder* const enc = it->enc_;
@@ -839,6 +918,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
   for (n = 0; n < 8; n += 2) {
     VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
   }
+  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
+
   if (DO_TRELLIS_UV && it->do_trellis_) {
     int ch, x, y;
     for (ch = 0, n = 0; ch <= 2; ch += 2) {
@@ -1101,6 +1182,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
       CopyScore(&rd_best, &rd_uv);
       rd->mode_uv = mode;
       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
+      if (it->top_derr_ != NULL) {
+        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
+      }
       SwapPtr(&dst, &tmp_dst);
     }
   }
@@ -1109,6 +1193,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
   if (dst != dst0) {   // copy 16x8 block if needed
     VP8Copy16x8(dst, dst0);
   }
+  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
+    StoreDiffusionErrors(it, rd);
+  }
 }
 
 //------------------------------------------------------------------------------
diff --git a/thirdparty/libwebp/src/enc/vp8i_enc.h b/thirdparty/libwebp/src/enc/vp8i_enc.h
index 3463491e9d..624e8f8e66 100644
--- a/thirdparty/libwebp/src/enc/vp8i_enc.h
+++ b/thirdparty/libwebp/src/enc/vp8i_enc.h
@@ -30,9 +30,9 @@ extern "C" {
 // Various defines and enums
 
 // version numbers
-#define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 6
-#define ENC_REV_VERSION 1
+#define ENC_MAJ_VERSION 1
+#define ENC_MIN_VERSION 0
+#define ENC_REV_VERSION 0
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
@@ -120,6 +120,9 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
 // Uncomment the following to remove token-buffer code:
 // #define DISABLE_TOKEN_BUFFER
 
+// quality below which error-diffusion is enabled
+#define ERROR_DIFFUSION_QUALITY 98
+
 //------------------------------------------------------------------------------
 // Headers
 
@@ -201,6 +204,8 @@ typedef struct {
   score_t i4_penalty_;   // penalty for using Intra4
 } VP8SegmentInfo;
 
+typedef int8_t DError[2 /* u/v */][2 /* top or left */];
+
 // Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
 typedef struct {
@@ -213,6 +218,7 @@ typedef struct {
   uint8_t modes_i4[16];       // mode numbers for intra4 predictions
   int mode_uv;                // mode number of chroma prediction
   uint32_t nz;                // non-zero blocks
+  int8_t derr[2][3];          // DC diffusion errors for U/V for blocks #1/2/3
 } VP8ModeScore;
 
 // Iterator structure to iterate through macroblocks, pointing to the
@@ -242,6 +248,9 @@ typedef struct {
   int           count_down0_;      // starting counter value (for progress)
   int           percent0_;         // saved initial progress percent
 
+  DError        left_derr_;        // left error diffusion (u/v)
+  DError       *top_derr_;         // top diffusion error - NULL if disabled
+
   uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
   uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
   uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
@@ -401,6 +410,7 @@ struct VP8Encoder {
   uint8_t*   uv_top_;    // top u/v samples.
                          // U and V are packed into 16 bytes (8 U + 8 V)
   LFStats*   lf_stats_;  // autofilter stats (if NULL, autofilter is off)
+  DError*    top_derr_;  // diffusion error (NULL if disabled)
 };
 
 //------------------------------------------------------------------------------
diff --git a/thirdparty/libwebp/src/enc/vp8l_enc.c b/thirdparty/libwebp/src/enc/vp8l_enc.c
index 312e521906..a89184eb08 100644
--- a/thirdparty/libwebp/src/enc/vp8l_enc.c
+++ b/thirdparty/libwebp/src/enc/vp8l_enc.c
@@ -26,8 +26,6 @@
 #include "src/utils/utils.h"
 #include "src/webp/format_constants.h"
 
-#include "src/enc/delta_palettization_enc.h"
-
 // Maximum number of histogram images (sub-blocks).
 #define MAX_HUFF_IMAGE_SIZE       2600
 
@@ -259,7 +257,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
       ++histo[kHistoAlphaPred * 256];
 
       for (j = 0; j < kHistoTotal; ++j) {
-        entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256, NULL);
+        entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256);
       }
       entropy[kDirect] = entropy_comp[kHistoAlpha] +
           entropy_comp[kHistoRed] +
@@ -384,8 +382,7 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
       AnalyzeAndCreatePalette(pic, low_effort,
                               enc->palette_, &enc->palette_size_);
 
-  // TODO(jyrki): replace the decision to be based on an actual estimate
-  // of entropy, or even spatial variance of entropy.
+  // Empirical bit sizes.
   enc->histo_bits_ = GetHistoBits(method, use_palette,
                                   pic->width, pic->height);
   enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
@@ -756,7 +753,6 @@ static WebPEncodingError StoreImageToBitMask(
       // Don't write the distance with the extra bits code since
       // the distance can be up to 18 bits of extra bits, and the prefix
       // 15 bits, totaling to 33, and our PutBits only supports up to 32 bits.
-      // TODO(jyrki): optimize this further.
       VP8LPrefixEncode(distance, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes + 4, code);
       VP8LPutBits(bw, bits, n_bits);
@@ -1464,49 +1460,6 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
                               20 /* quality */, low_effort);
 }
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-
-static WebPEncodingError EncodeDeltaPalettePredictorImage(
-    VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality,
-    int low_effort) {
-  const WebPPicture* const pic = enc->pic_;
-  const int width = pic->width;
-  const int height = pic->height;
-
-  const int pred_bits = 5;
-  const int transform_width = VP8LSubSampleSize(width, pred_bits);
-  const int transform_height = VP8LSubSampleSize(height, pred_bits);
-  const int pred = 7;   // default is Predictor7 (Top/Left Average)
-  const int tiles_per_row = VP8LSubSampleSize(width, pred_bits);
-  const int tiles_per_col = VP8LSubSampleSize(height, pred_bits);
-  uint32_t* predictors;
-  int tile_x, tile_y;
-  WebPEncodingError err = VP8_ENC_OK;
-
-  predictors = (uint32_t*)WebPSafeMalloc(tiles_per_col * tiles_per_row,
-                                         sizeof(*predictors));
-  if (predictors == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
-
-  for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
-    for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
-      predictors[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
-    }
-  }
-
-  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
-  VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
-  VP8LPutBits(bw, pred_bits - 2, 3);
-  err = EncodeImageNoHuffman(
-      bw, predictors, &enc->hash_chain_,
-      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
-      (VP8LBackwardRefs*)&enc->refs_[1],
-      transform_width, transform_height, quality, low_effort);
-  WebPSafeFree(predictors);
-  return err;
-}
-
-#endif // WEBP_EXPERIMENTAL_FEATURES
-
 // -----------------------------------------------------------------------------
 // VP8LEncoder
 
@@ -1568,7 +1521,7 @@ static int EncodeStreamHook(void* input, void* data2) {
   WebPEncodingError err = VP8_ENC_OK;
   const int quality = (int)config->quality;
   const int low_effort = (config->method == 0);
-#if (WEBP_NEAR_LOSSLESS == 1) || defined(WEBP_EXPERIMENTAL_FEATURES)
+#if (WEBP_NEAR_LOSSLESS == 1)
   const int width = picture->width;
 #endif
   const int height = picture->height;
@@ -1627,29 +1580,6 @@ static int EncodeStreamHook(void* input, void* data2) {
     enc->argb_content_ = kEncoderNone;
 #endif
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (config->use_delta_palette) {
-      enc->use_predict_ = 1;
-      enc->use_cross_color_ = 0;
-      enc->use_subtract_green_ = 0;
-      enc->use_palette_ = 1;
-      if (enc->argb_content_ != kEncoderNearLossless &&
-          enc->argb_content_ != kEncoderPalette) {
-        err = MakeInputImageCopy(enc);
-        if (err != VP8_ENC_OK) goto Error;
-      }
-      err = WebPSearchOptimalDeltaPalette(enc);
-      if (err != VP8_ENC_OK) goto Error;
-      if (enc->use_palette_) {
-        err = AllocateTransformBuffer(enc, width, height);
-        if (err != VP8_ENC_OK) goto Error;
-        err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
-        if (err != VP8_ENC_OK) goto Error;
-        use_delta_palette = 1;
-      }
-    }
-#endif  // WEBP_EXPERIMENTAL_FEATURES
-
     // Encode palette
     if (enc->use_palette_) {
       err = EncodePalette(bw, low_effort, enc);
@@ -1822,7 +1752,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
       worker_interface->Init(worker);
       worker->data1 = param;
       worker->data2 = NULL;
-      worker->hook = (WebPWorkerHook)EncodeStreamHook;
+      worker->hook = EncodeStreamHook;
     }
   }
 
@@ -1944,7 +1874,6 @@ int VP8LEncodeImage(const WebPConfig* const config,
   err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
   if (err != VP8_ENC_OK) goto Error;
 
-  // TODO(skal): have a fine-grained progress report in VP8LEncodeStream().
   if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
 
   // Finish the RIFF chunk.
diff --git a/thirdparty/libwebp/src/enc/webp_enc.c b/thirdparty/libwebp/src/enc/webp_enc.c
index 283cda8e7b..9f4b10c26c 100644
--- a/thirdparty/libwebp/src/enc/webp_enc.c
+++ b/thirdparty/libwebp/src/enc/webp_enc.c
@@ -159,12 +159,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
       + WEBP_ALIGN_CST;                      // align all
   const size_t lf_stats_size =
       config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
+  const size_t top_derr_size =
+      (config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ?
+          mb_w * sizeof(*enc->top_derr_) : 0;
   uint8_t* mem;
   const uint64_t size = (uint64_t)sizeof(*enc)   // main struct
                       + WEBP_ALIGN_CST           // cache alignment
                       + info_size                // modes info
                       + preds_size               // prediction modes
                       + samples_size             // top/left samples
+                      + top_derr_size            // top diffusion error
                       + nz_size                  // coeff context bits
                       + lf_stats_size;           // autofilter stats
 
@@ -175,11 +179,12 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
          "                info: %ld\n"
          "               preds: %ld\n"
          "         top samples: %ld\n"
+         "       top diffusion: %ld\n"
          "            non-zero: %ld\n"
          "            lf-stats: %ld\n"
          "               total: %ld\n",
          sizeof(*enc) + WEBP_ALIGN_CST, info_size,
-         preds_size, samples_size, nz_size, lf_stats_size, size);
+         preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
   printf("Transient object sizes:\n"
          "      VP8EncIterator: %ld\n"
          "        VP8ModeScore: %ld\n"
@@ -219,6 +224,8 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->y_top_ = mem;
   enc->uv_top_ = enc->y_top_ + top_stride;
   mem += 2 * top_stride;
+  enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
+  mem += top_derr_size;
   assert(mem <= (uint8_t*)enc + size);
 
   enc->config_ = config;