diff options
Diffstat (limited to 'drivers/webp/enc')
-rw-r--r-- | drivers/webp/enc/alpha.c | 12 | ||||
-rw-r--r-- | drivers/webp/enc/backward_references.c | 1136 | ||||
-rw-r--r-- | drivers/webp/enc/backward_references.h | 18 | ||||
-rw-r--r-- | drivers/webp/enc/filter.c | 91 | ||||
-rw-r--r-- | drivers/webp/enc/histogram.c | 453 | ||||
-rw-r--r-- | drivers/webp/enc/histogram.h | 13 | ||||
-rw-r--r-- | drivers/webp/enc/near_lossless.c | 56 | ||||
-rw-r--r-- | drivers/webp/enc/picture.c | 2 | ||||
-rw-r--r-- | drivers/webp/enc/picture_csp.c | 24 | ||||
-rw-r--r-- | drivers/webp/enc/picture_psnr.c | 6 | ||||
-rw-r--r-- | drivers/webp/enc/picture_tools.c | 20 | ||||
-rw-r--r-- | drivers/webp/enc/quant.c | 202 | ||||
-rw-r--r-- | drivers/webp/enc/vp8enci.h | 29 | ||||
-rw-r--r-- | drivers/webp/enc/vp8l.c | 210 | ||||
-rw-r--r-- | drivers/webp/enc/vp8li.h | 20 | ||||
-rw-r--r-- | drivers/webp/enc/webpenc.c | 54 |
16 files changed, 1568 insertions, 778 deletions
diff --git a/drivers/webp/enc/alpha.c b/drivers/webp/enc/alpha.c index 1842b36401..464df4db09 100644 --- a/drivers/webp/enc/alpha.c +++ b/drivers/webp/enc/alpha.c @@ -67,6 +67,11 @@ static int EncodeLossless(const uint8_t* const data, int width, int height, WebPConfigInit(&config); config.lossless = 1; + // Enable exact, or it would alter RGB values of transparent alpha, which is + // normally OK but not here since we are not encoding the input image but an + // internal encoding-related image containing necessary exact information in + // RGB channels. + config.exact = 1; config.method = effort_level; // impact is very small // Set a low default quality for encoding alpha. Ensure that Alpha quality at // lower methods (3 and below) is less than the threshold for triggering @@ -74,7 +79,11 @@ static int EncodeLossless(const uint8_t* const data, int width, int height, config.quality = 8.f * effort_level; assert(config.quality >= 0 && config.quality <= 100.f); - ok = (VP8LEncodeStream(&config, &picture, bw) == VP8_ENC_OK); + // TODO(urvang): Temporary fix to avoid generating images that trigger + // a decoder bug related to alpha with color cache. + // See: https://code.google.com/p/webp/issues/detail?id=239 + // Need to re-enable this later. + ok = (VP8LEncodeStream(&config, &picture, bw, 0 /*use_cache*/) == VP8_ENC_OK); WebPPictureFree(&picture); ok = ok && !bw->error_; if (!ok) { @@ -113,7 +122,6 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height, assert(method >= ALPHA_NO_COMPRESSION); assert(method <= ALPHA_LOSSLESS_COMPRESSION); assert(sizeof(header) == ALPHA_HEADER_LEN); - // TODO(skal): have a common function and #define's to validate alpha params. 
filter_func = WebPFilters[filter]; if (filter_func != NULL) { diff --git a/drivers/webp/enc/backward_references.c b/drivers/webp/enc/backward_references.c index 049125e521..136a24a8c3 100644 --- a/drivers/webp/enc/backward_references.c +++ b/drivers/webp/enc/backward_references.c @@ -16,6 +16,7 @@ #include "./backward_references.h" #include "./histogram.h" #include "../dsp/lossless.h" +#include "../dsp/dsp.h" #include "../utils/color_cache.h" #include "../utils/utils.h" @@ -26,11 +27,19 @@ #define MAX_ENTROPY (1e30f) // 1M window (4M bytes) minus 120 special codes for short distances. -#define WINDOW_SIZE ((1 << 20) - 120) +#define WINDOW_SIZE_BITS 20 +#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120) // Bounds for the match length. #define MIN_LENGTH 2 -#define MAX_LENGTH 4096 +// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it +// is used in VP8LHashChain. +#define MAX_LENGTH_BITS 12 +// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits. +#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1) +#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32 +#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32" +#endif // ----------------------------------------------------------------------------- @@ -56,46 +65,19 @@ static int DistanceToPlaneCode(int xsize, int dist) { return dist + 120; } +// Returns the exact index where array1 and array2 are different. For an index +// inferior or equal to best_len_match, the return value just has to be strictly +// inferior to best_len_match. The current behavior is to return 0 if this index +// is best_len_match, and the index itself otherwise. +// If no two elements are the same, it returns max_limit. static WEBP_INLINE int FindMatchLength(const uint32_t* const array1, const uint32_t* const array2, - int best_len_match, - int max_limit) { -#if !defined(__x86_64__) - // TODO(vrabaud): Compare on other architectures. 
- int match_len = 0; + int best_len_match, int max_limit) { // Before 'expensive' linear match, check if the two arrays match at the // current best length index. if (array1[best_len_match] != array2[best_len_match]) return 0; - while (match_len < max_limit && array1[match_len] == array2[match_len]) { - ++match_len; - } - return match_len; -#else - const uint32_t* array1_32 = array1; - const uint32_t* array2_32 = array2; - // max value is aligned to (uint64_t*) array1 - const uint32_t* const array1_32_max = array1 + (max_limit & ~1); - // Before 'expensive' linear match, check if the two arrays match at the - // current best length index. - if (array1[best_len_match] != array2[best_len_match]) return 0; - - // TODO(vrabaud): add __predict_true on bound checking? - while (array1_32 < array1_32_max) { - if (*(uint64_t*)array1_32 == *(uint64_t*)array2_32) { - array1_32 += 2; - array2_32 += 2; - } else { - // if the uint32_t pointed to are the same, then the following ones have - // to be different - return (array1_32 - array1) + (*array1_32 == *array2_32); - } - } - - // Deal with the potential last uint32_t. 
- if ((max_limit & 1) && (*array1_32 != *array2_32)) return max_limit - 1; - return max_limit; -#endif + return VP8LVectorMismatch(array1, array2, max_limit); } // ----------------------------------------------------------------------------- @@ -207,34 +189,24 @@ int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src, // ----------------------------------------------------------------------------- // Hash chains -// initialize as empty -static void HashChainReset(VP8LHashChain* const p) { - int i; - assert(p != NULL); - for (i = 0; i < p->size_; ++i) { - p->chain_[i] = -1; - } - for (i = 0; i < HASH_SIZE; ++i) { - p->hash_to_first_index_[i] = -1; - } -} - int VP8LHashChainInit(VP8LHashChain* const p, int size) { assert(p->size_ == 0); - assert(p->chain_ == NULL); + assert(p->offset_length_ == NULL); assert(size > 0); - p->chain_ = (int*)WebPSafeMalloc(size, sizeof(*p->chain_)); - if (p->chain_ == NULL) return 0; + p->offset_length_ = + (uint32_t*)WebPSafeMalloc(size, sizeof(*p->offset_length_)); + if (p->offset_length_ == NULL) return 0; p->size_ = size; - HashChainReset(p); + return 1; } void VP8LHashChainClear(VP8LHashChain* const p) { assert(p != NULL); - WebPSafeFree(p->chain_); + WebPSafeFree(p->offset_length_); + p->size_ = 0; - p->chain_ = NULL; + p->offset_length_ = NULL; } // ----------------------------------------------------------------------------- @@ -250,18 +222,10 @@ static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) { return key; } -// Insertion of two pixels at a time. -static void HashChainInsert(VP8LHashChain* const p, - const uint32_t* const argb, int pos) { - const uint32_t hash_code = GetPixPairHash64(argb); - p->chain_[pos] = p->hash_to_first_index_[hash_code]; - p->hash_to_first_index_[hash_code] = pos; -} - // Returns the maximum number of hash chain lookups to do for a -// given compression quality. Return value in range [6, 86]. 
-static int GetMaxItersForQuality(int quality, int low_effort) { - return (low_effort ? 6 : 8) + (quality * quality) / 128; +// given compression quality. Return value in range [8, 86]. +static int GetMaxItersForQuality(int quality) { + return 8 + (quality * quality) / 128; } static int GetWindowSizeForHashChain(int quality, int xsize) { @@ -277,63 +241,120 @@ static WEBP_INLINE int MaxFindCopyLength(int len) { return (len < MAX_LENGTH) ? len : MAX_LENGTH; } -static void HashChainFindOffset(const VP8LHashChain* const p, int base_position, - const uint32_t* const argb, int len, - int window_size, int* const distance_ptr) { - const uint32_t* const argb_start = argb + base_position; - const int min_pos = - (base_position > window_size) ? base_position - window_size : 0; - int pos; - assert(len <= MAX_LENGTH); - for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)]; - pos >= min_pos; - pos = p->chain_[pos]) { - const int curr_length = - FindMatchLength(argb + pos, argb_start, len - 1, len); - if (curr_length == len) break; - } - *distance_ptr = base_position - pos; -} - -static int HashChainFindCopy(const VP8LHashChain* const p, - int base_position, - const uint32_t* const argb, int max_len, - int window_size, int iter_max, - int* const distance_ptr, - int* const length_ptr) { - const uint32_t* const argb_start = argb + base_position; - int iter = iter_max; - int best_length = 0; - int best_distance = 0; - const int min_pos = - (base_position > window_size) ? 
base_position - window_size : 0; +int VP8LHashChainFill(VP8LHashChain* const p, int quality, + const uint32_t* const argb, int xsize, int ysize) { + const int size = xsize * ysize; + const int iter_max = GetMaxItersForQuality(quality); + const int iter_min = iter_max - quality / 10; + const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize); int pos; - int length_max = 256; - if (max_len < length_max) { - length_max = max_len; + uint32_t base_position; + int32_t* hash_to_first_index; + // Temporarily use the p->offset_length_ as a hash chain. + int32_t* chain = (int32_t*)p->offset_length_; + assert(p->size_ != 0); + assert(p->offset_length_ != NULL); + + hash_to_first_index = + (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index)); + if (hash_to_first_index == NULL) return 0; + + // Set the int32_t array to -1. + memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index)); + // Fill the chain linking pixels with the same hash. + for (pos = 0; pos < size - 1; ++pos) { + const uint32_t hash_code = GetPixPairHash64(argb + pos); + chain[pos] = hash_to_first_index[hash_code]; + hash_to_first_index[hash_code] = pos; } - for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)]; - pos >= min_pos; - pos = p->chain_[pos]) { - int curr_length; - int distance; - if (--iter < 0) { - break; + WebPSafeFree(hash_to_first_index); + + // Find the best match interval at each pixel, defined by an offset to the + // pixel and a length. The right-most pixel cannot match anything to the right + // (hence a best length of 0) and the left-most pixel nothing to the left + // (hence an offset of 0). + p->offset_length_[0] = p->offset_length_[size - 1] = 0; + for (base_position = size - 2 < 0 ? 
0 : size - 2; base_position > 0;) { + const int max_len = MaxFindCopyLength(size - 1 - base_position); + const uint32_t* const argb_start = argb + base_position; + int iter = iter_max; + int best_length = 0; + uint32_t best_distance = 0; + const int min_pos = + (base_position > window_size) ? base_position - window_size : 0; + const int length_max = (max_len < 256) ? max_len : 256; + uint32_t max_base_position; + + for (pos = chain[base_position]; pos >= min_pos; pos = chain[pos]) { + int curr_length; + if (--iter < 0) { + break; + } + assert(base_position > (uint32_t)pos); + + curr_length = + FindMatchLength(argb + pos, argb_start, best_length, max_len); + if (best_length < curr_length) { + best_length = curr_length; + best_distance = base_position - pos; + // Stop if we have reached the maximum length. Otherwise, make sure + // we have executed a minimum number of iterations depending on the + // quality. + if ((best_length == MAX_LENGTH) || + (curr_length >= length_max && iter < iter_min)) { + break; + } + } } - - curr_length = FindMatchLength(argb + pos, argb_start, best_length, max_len); - if (best_length < curr_length) { - distance = base_position - pos; - best_length = curr_length; - best_distance = distance; - if (curr_length >= length_max) { + // We have the best match but in case the two intervals continue matching + // to the left, we have the best matches for the left-extended pixels. + max_base_position = base_position; + while (1) { + assert(best_length <= MAX_LENGTH); + assert(best_distance <= WINDOW_SIZE); + p->offset_length_[base_position] = + (best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length; + --base_position; + // Stop if we don't have a match or if we are out of bounds. + if (best_distance == 0 || base_position == 0) break; + // Stop if we cannot extend the matching intervals to the left. 
+ if (base_position < best_distance || + argb[base_position - best_distance] != argb[base_position]) { + break; + } + // Stop if we are matching at its limit because there could be a closer + // matching interval with the same maximum length. Then again, if the + // matching interval is as close as possible (best_distance == 1), we will + // never find anything better so let's continue. + if (best_length == MAX_LENGTH && best_distance != 1 && + base_position + MAX_LENGTH < max_base_position) { break; } + if (best_length < MAX_LENGTH) { + ++best_length; + max_base_position = base_position; + } } } - *distance_ptr = best_distance; - *length_ptr = best_length; - return (best_length >= MIN_LENGTH); + return 1; +} + +static WEBP_INLINE int HashChainFindOffset(const VP8LHashChain* const p, + const int base_position) { + return p->offset_length_[base_position] >> MAX_LENGTH_BITS; +} + +static WEBP_INLINE int HashChainFindLength(const VP8LHashChain* const p, + const int base_position) { + return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1); +} + +static WEBP_INLINE void HashChainFindCopy(const VP8LHashChain* const p, + int base_position, + int* const offset_ptr, + int* const length_ptr) { + *offset_ptr = HashChainFindOffset(p, base_position); + *length_ptr = HashChainFindLength(p, base_position); } static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache, @@ -400,84 +421,62 @@ static int BackwardReferencesRle(int xsize, int ysize, static int BackwardReferencesLz77(int xsize, int ysize, const uint32_t* const argb, int cache_bits, - int quality, int low_effort, - VP8LHashChain* const hash_chain, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) { int i; + int i_last_check = -1; int ok = 0; int cc_init = 0; const int use_color_cache = (cache_bits > 0); const int pix_count = xsize * ysize; VP8LColorCache hashers; - int iter_max = GetMaxItersForQuality(quality, low_effort); - const int window_size = 
GetWindowSizeForHashChain(quality, xsize); - int min_matches = 32; if (use_color_cache) { cc_init = VP8LColorCacheInit(&hashers, cache_bits); if (!cc_init) goto Error; } ClearBackwardRefs(refs); - HashChainReset(hash_chain); - for (i = 0; i < pix_count - 2; ) { + for (i = 0; i < pix_count;) { // Alternative#1: Code the pixels starting at 'i' using backward reference. int offset = 0; int len = 0; - const int max_len = MaxFindCopyLength(pix_count - i); - HashChainFindCopy(hash_chain, i, argb, max_len, window_size, - iter_max, &offset, &len); - if (len > MIN_LENGTH || (len == MIN_LENGTH && offset <= 512)) { - int offset2 = 0; - int len2 = 0; - int k; - min_matches = 8; - HashChainInsert(hash_chain, &argb[i], i); - if ((len < (max_len >> 2)) && !low_effort) { - // Evaluate Alternative#2: Insert the pixel at 'i' as literal, and code - // the pixels starting at 'i + 1' using backward reference. - HashChainFindCopy(hash_chain, i + 1, argb, max_len - 1, - window_size, iter_max, &offset2, - &len2); - if (len2 > len + 1) { - AddSingleLiteral(argb[i], use_color_cache, &hashers, refs); - i++; // Backward reference to be done for next pixel. - len = len2; - offset = offset2; - } - } - BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len)); - if (use_color_cache) { - for (k = 0; k < len; ++k) { - VP8LColorCacheInsert(&hashers, argb[i + k]); - } - } - // Add to the hash_chain (but cannot add the last pixel). - if (offset >= 3 && offset != xsize) { - const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i; - for (k = 2; k < last - 8; k += 2) { - HashChainInsert(hash_chain, &argb[i + k], i + k); - } - for (; k < last; ++k) { - HashChainInsert(hash_chain, &argb[i + k], i + k); + int j; + HashChainFindCopy(hash_chain, i, &offset, &len); + if (len > MIN_LENGTH + 1) { + const int len_ini = len; + int max_reach = 0; + assert(i + len < pix_count); + // Only start from what we have not checked already. + i_last_check = (i > i_last_check) ? 
i : i_last_check; + // We know the best match for the current pixel but we try to find the + // best matches for the current pixel AND the next one combined. + // The naive method would use the intervals: + // [i,i+len) + [i+len, length of best match at i+len) + // while we check if we can use: + // [i,j) (where j<=i+len) + [j, length of best match at j) + for (j = i_last_check + 1; j <= i + len_ini; ++j) { + const int len_j = HashChainFindLength(hash_chain, j); + const int reach = + j + (len_j > MIN_LENGTH + 1 ? len_j : 1); // 1 for single literal. + if (reach > max_reach) { + len = j - i; + max_reach = reach; } } - i += len; } else { + len = 1; + } + // Go with literal or backward reference. + assert(len > 0); + if (len == 1) { AddSingleLiteral(argb[i], use_color_cache, &hashers, refs); - HashChainInsert(hash_chain, &argb[i], i); - ++i; - --min_matches; - if (min_matches <= 0) { - AddSingleLiteral(argb[i], use_color_cache, &hashers, refs); - HashChainInsert(hash_chain, &argb[i], i); - ++i; + } else { + BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len)); + if (use_color_cache) { + for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]); } } - } - while (i < pix_count) { - // Handle the last pixel(s). 
- AddSingleLiteral(argb[i], use_color_cache, &hashers, refs); - ++i; + i += len; } ok = !refs->error_; @@ -498,7 +497,7 @@ typedef struct { static int BackwardReferencesTraceBackwards( int xsize, int ysize, const uint32_t* const argb, int quality, - int cache_bits, VP8LHashChain* const hash_chain, + int cache_bits, const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs); static void ConvertPopulationCountTableToBitEstimates( @@ -574,16 +573,14 @@ static WEBP_INLINE double GetDistanceCost(const CostModel* const m, return m->distance_[code] + extra_bits; } -static void AddSingleLiteralWithCostModel( - const uint32_t* const argb, VP8LHashChain* const hash_chain, - VP8LColorCache* const hashers, const CostModel* const cost_model, int idx, - int is_last, int use_color_cache, double prev_cost, float* const cost, - uint16_t* const dist_array) { +static void AddSingleLiteralWithCostModel(const uint32_t* const argb, + VP8LColorCache* const hashers, + const CostModel* const cost_model, + int idx, int use_color_cache, + double prev_cost, float* const cost, + uint16_t* const dist_array) { double cost_val = prev_cost; const uint32_t color = argb[0]; - if (!is_last) { - HashChainInsert(hash_chain, argb, idx); - } if (use_color_cache && VP8LColorCacheContains(hashers, color)) { const double mul0 = 0.68; const int ix = VP8LColorCacheGetIndex(hashers, color); @@ -599,30 +596,598 @@ static void AddSingleLiteralWithCostModel( } } +// ----------------------------------------------------------------------------- +// CostManager and interval handling + +// Empirical value to avoid high memory consumption but good for performance. +#define COST_CACHE_INTERVAL_SIZE_MAX 100 + +// To perform backward reference every pixel at index index_ is considered and +// the cost for the MAX_LENGTH following pixels computed. 
Those following pixels +// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of: +// distance_cost_ at index_ + GetLengthCost(cost_model, k) +// (named cost) (named cached cost) +// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an +// array of size MAX_LENGTH. +// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the +// minimal values using intervals, for which lower_ and upper_ bounds are kept. +// An interval is defined by the index_ of the pixel that generated it and +// is only useful in a range of indices from start_ to end_ (exclusive), i.e. +// it contains the minimum value for pixels between start_ and end_. +// Intervals are stored in a linked list and ordered by start_. When a new +// interval has a better minimum, old intervals are split or removed. +typedef struct CostInterval CostInterval; +struct CostInterval { + double lower_; + double upper_; + int start_; + int end_; + double distance_cost_; + int index_; + CostInterval* previous_; + CostInterval* next_; +}; + +// The GetLengthCost(cost_model, k) part of the costs is also bounded for +// efficiency in a set of intervals of a different type. +// If those intervals are small enough, they are not used for comparison and +// written into the costs right away. +typedef struct { + double lower_; // Lower bound of the interval. + double upper_; // Upper bound of the interval. + int start_; + int end_; // Exclusive. + int do_write_; // If !=0, the interval is saved to cost instead of being kept + // for comparison. +} CostCacheInterval; + +// This structure is in charge of managing intervals and costs. +// It caches the different CostCacheInterval, caches the different +// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose +// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX). +#define COST_MANAGER_MAX_FREE_LIST 10 +typedef struct { + CostInterval* head_; + int count_; // The number of stored intervals. 
+ CostCacheInterval* cache_intervals_; + size_t cache_intervals_size_; + double cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k). + double min_cost_cache_; // The minimum value in cost_cache_[1:]. + double max_cost_cache_; // The maximum value in cost_cache_[1:]. + float* costs_; + uint16_t* dist_array_; + // Most of the time, we only need few intervals -> use a free-list, to avoid + // fragmentation with small allocs in most common cases. + CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST]; + CostInterval* free_intervals_; + // These are regularly malloc'd remains. This list can't grow larger than than + // size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note. + CostInterval* recycled_intervals_; + // Buffer used in BackwardReferencesHashChainDistanceOnly to store the ends + // of the intervals that can have impacted the cost at a pixel. + int* interval_ends_; + int interval_ends_size_; +} CostManager; + +static int IsCostCacheIntervalWritable(int start, int end) { + // 100 is the length for which we consider an interval for comparison, and not + // for writing. + // The first intervals are very small and go in increasing size. This constant + // helps merging them into one big interval (up to index 150/200 usually from + // which intervals start getting much bigger). + // This value is empirical. 
+ return (end - start + 1 < 100); +} + +static void CostIntervalAddToFreeList(CostManager* const manager, + CostInterval* const interval) { + interval->next_ = manager->free_intervals_; + manager->free_intervals_ = interval; +} + +static int CostIntervalIsInFreeList(const CostManager* const manager, + const CostInterval* const interval) { + return (interval >= &manager->intervals_[0] && + interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]); +} + +static void CostManagerInitFreeList(CostManager* const manager) { + int i; + manager->free_intervals_ = NULL; + for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) { + CostIntervalAddToFreeList(manager, &manager->intervals_[i]); + } +} + +static void DeleteIntervalList(CostManager* const manager, + const CostInterval* interval) { + while (interval != NULL) { + const CostInterval* const next = interval->next_; + if (!CostIntervalIsInFreeList(manager, interval)) { + WebPSafeFree((void*)interval); + } // else: do nothing + interval = next; + } +} + +static void CostManagerClear(CostManager* const manager) { + if (manager == NULL) return; + + WebPSafeFree(manager->costs_); + WebPSafeFree(manager->cache_intervals_); + WebPSafeFree(manager->interval_ends_); + + // Clear the interval lists. + DeleteIntervalList(manager, manager->head_); + manager->head_ = NULL; + DeleteIntervalList(manager, manager->recycled_intervals_); + manager->recycled_intervals_ = NULL; + + // Reset pointers, count_ and cache_intervals_size_. + memset(manager, 0, sizeof(*manager)); + CostManagerInitFreeList(manager); +} + +static int CostManagerInit(CostManager* const manager, + uint16_t* const dist_array, int pix_count, + const CostModel* const cost_model) { + int i; + const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count; + // This constant is tied to the cost_model we use. + // Empirically, differences between intervals is usually of more than 1. 
+ const double min_cost_diff = 0.1; + + manager->costs_ = NULL; + manager->cache_intervals_ = NULL; + manager->interval_ends_ = NULL; + manager->head_ = NULL; + manager->recycled_intervals_ = NULL; + manager->count_ = 0; + manager->dist_array_ = dist_array; + CostManagerInitFreeList(manager); + + // Fill in the cost_cache_. + manager->cache_intervals_size_ = 1; + manager->cost_cache_[0] = 0; + for (i = 1; i < cost_cache_size; ++i) { + manager->cost_cache_[i] = GetLengthCost(cost_model, i); + // Get an approximation of the number of bound intervals. + if (fabs(manager->cost_cache_[i] - manager->cost_cache_[i - 1]) > + min_cost_diff) { + ++manager->cache_intervals_size_; + } + // Compute the minimum of cost_cache_. + if (i == 1) { + manager->min_cost_cache_ = manager->cost_cache_[1]; + manager->max_cost_cache_ = manager->cost_cache_[1]; + } else if (manager->cost_cache_[i] < manager->min_cost_cache_) { + manager->min_cost_cache_ = manager->cost_cache_[i]; + } else if (manager->cost_cache_[i] > manager->max_cost_cache_) { + manager->max_cost_cache_ = manager->cost_cache_[i]; + } + } + + // With the current cost models, we have 15 intervals, so we are safe by + // setting a maximum of COST_CACHE_INTERVAL_SIZE_MAX. + if (manager->cache_intervals_size_ > COST_CACHE_INTERVAL_SIZE_MAX) { + manager->cache_intervals_size_ = COST_CACHE_INTERVAL_SIZE_MAX; + } + manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc( + manager->cache_intervals_size_, sizeof(*manager->cache_intervals_)); + if (manager->cache_intervals_ == NULL) { + CostManagerClear(manager); + return 0; + } + + // Fill in the cache_intervals_. 
+ { + double cost_prev = -1e38f; // unprobably low initial value + CostCacheInterval* prev = NULL; + CostCacheInterval* cur = manager->cache_intervals_; + const CostCacheInterval* const end = + manager->cache_intervals_ + manager->cache_intervals_size_; + + // Consecutive values in cost_cache_ are compared and if a big enough + // difference is found, a new interval is created and bounded. + for (i = 0; i < cost_cache_size; ++i) { + const double cost_val = manager->cost_cache_[i]; + if (i == 0 || + (fabs(cost_val - cost_prev) > min_cost_diff && cur + 1 < end)) { + if (i > 1) { + const int is_writable = + IsCostCacheIntervalWritable(cur->start_, cur->end_); + // Merge with the previous interval if both are writable. + if (is_writable && cur != manager->cache_intervals_ && + prev->do_write_) { + // Update the previous interval. + prev->end_ = cur->end_; + if (cur->lower_ < prev->lower_) { + prev->lower_ = cur->lower_; + } else if (cur->upper_ > prev->upper_) { + prev->upper_ = cur->upper_; + } + } else { + cur->do_write_ = is_writable; + prev = cur; + ++cur; + } + } + // Initialize an interval. + cur->start_ = i; + cur->do_write_ = 0; + cur->lower_ = cost_val; + cur->upper_ = cost_val; + } else { + // Update the current interval bounds. + if (cost_val < cur->lower_) { + cur->lower_ = cost_val; + } else if (cost_val > cur->upper_) { + cur->upper_ = cost_val; + } + } + cur->end_ = i + 1; + cost_prev = cost_val; + } + manager->cache_intervals_size_ = cur + 1 - manager->cache_intervals_; + } + + manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_)); + if (manager->costs_ == NULL) { + CostManagerClear(manager); + return 0; + } + // Set the initial costs_ high for every pixel as we will keep the minimum. + for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f; + + // The cost at pixel is influenced by the cost intervals from previous pixels. 
+ // Let us take the specific case where the offset is the same (which actually + // happens a lot in case of uniform regions). + // pixel i contributes to j>i a cost of: offset cost + cost_cache_[j-i] + // pixel i+1 contributes to j>i a cost of: 2*offset cost + cost_cache_[j-i-1] + // pixel i+2 contributes to j>i a cost of: 3*offset cost + cost_cache_[j-i-2] + // and so on. + // A pixel i influences the following length(j) < MAX_LENGTH pixels. What is + // the value of j such that pixel i + j cannot influence any of those pixels? + // This value is such that: + // max of cost_cache_ < j*offset cost + min of cost_cache_ + // (pixel i + j 's cost cannot beat the worst cost given by pixel i). + // This value will be used to optimize the cost computation in + // BackwardReferencesHashChainDistanceOnly. + { + // The offset cost is computed in GetDistanceCost and has a minimum value of + // the minimum in cost_model->distance_. The case where the offset cost is 0 + // will be dealt with differently later so we are only interested in the + // minimum non-zero offset cost. + double offset_cost_min = 0.; + int size; + for (i = 0; i < NUM_DISTANCE_CODES; ++i) { + if (cost_model->distance_[i] != 0) { + if (offset_cost_min == 0.) { + offset_cost_min = cost_model->distance_[i]; + } else if (cost_model->distance_[i] < offset_cost_min) { + offset_cost_min = cost_model->distance_[i]; + } + } + } + // In case all the cost_model->distance_ is 0, the next non-zero cost we + // can have is from the extra bit in GetDistanceCost, hence 1. + if (offset_cost_min < 1.) offset_cost_min = 1.; + + size = 1 + (int)ceil((manager->max_cost_cache_ - manager->min_cost_cache_) / + offset_cost_min); + // Empirically, we usually end up with a value below 100. 
+ if (size > MAX_LENGTH) size = MAX_LENGTH; + + manager->interval_ends_ = + (int*)WebPSafeMalloc(size, sizeof(*manager->interval_ends_)); + if (manager->interval_ends_ == NULL) { + CostManagerClear(manager); + return 0; + } + manager->interval_ends_size_ = size; + } + + return 1; +} + +// Given the distance_cost for pixel 'index', update the cost at pixel 'i' if it +// is smaller than the previously computed value. +static WEBP_INLINE void UpdateCost(CostManager* const manager, int i, int index, + double distance_cost) { + int k = i - index; + double cost_tmp; + assert(k >= 0 && k < MAX_LENGTH); + cost_tmp = distance_cost + manager->cost_cache_[k]; + + if (manager->costs_[i] > cost_tmp) { + manager->costs_[i] = (float)cost_tmp; + manager->dist_array_[i] = k + 1; + } +} + +// Given the distance_cost for pixel 'index', update the cost for all the pixels +// between 'start' and 'end' excluded. +static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager, + int start, int end, int index, + double distance_cost) { + int i; + for (i = start; i < end; ++i) UpdateCost(manager, i, index, distance_cost); +} + +// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'. +static WEBP_INLINE void ConnectIntervals(CostManager* const manager, + CostInterval* const prev, + CostInterval* const next) { + if (prev != NULL) { + prev->next_ = next; + } else { + manager->head_ = next; + } + + if (next != NULL) next->previous_ = prev; +} + +// Pop an interval in the manager. 
+static WEBP_INLINE void PopInterval(CostManager* const manager, + CostInterval* const interval) { + CostInterval* const next = interval->next_; + + if (interval == NULL) return; + + ConnectIntervals(manager, interval->previous_, next); + if (CostIntervalIsInFreeList(manager, interval)) { + CostIntervalAddToFreeList(manager, interval); + } else { // recycle regularly malloc'd intervals too + interval->next_ = manager->recycled_intervals_; + manager->recycled_intervals_ = interval; + } + --manager->count_; + assert(manager->count_ >= 0); +} + +// Update the cost at index i by going over all the stored intervals that +// overlap with i. +static WEBP_INLINE void UpdateCostPerIndex(CostManager* const manager, int i) { + CostInterval* current = manager->head_; + + while (current != NULL && current->start_ <= i) { + if (current->end_ <= i) { + // We have an outdated interval, remove it. + CostInterval* next = current->next_; + PopInterval(manager, current); + current = next; + } else { + UpdateCost(manager, i, current->index_, current->distance_cost_); + current = current->next_; + } + } +} + +// Given a current orphan interval and its previous interval, before +// it was orphaned (which can be NULL), set it at the right place in the list +// of intervals using the start_ ordering and the previous interval as a hint. 
+static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager, + CostInterval* const current, + CostInterval* previous) { + assert(current != NULL); + + if (previous == NULL) previous = manager->head_; + while (previous != NULL && current->start_ < previous->start_) { + previous = previous->previous_; + } + while (previous != NULL && previous->next_ != NULL && + previous->next_->start_ < current->start_) { + previous = previous->next_; + } + + if (previous != NULL) { + ConnectIntervals(manager, current, previous->next_); + } else { + ConnectIntervals(manager, current, manager->head_); + } + ConnectIntervals(manager, previous, current); +} + +// Insert an interval in the list contained in the manager by starting at +// interval_in as a hint. The intervals are sorted by start_ value. +static WEBP_INLINE void InsertInterval(CostManager* const manager, + CostInterval* const interval_in, + double distance_cost, double lower, + double upper, int index, int start, + int end) { + CostInterval* interval_new; + + if (IsCostCacheIntervalWritable(start, end) || + manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) { + // Write down the interval if it is too small. + UpdateCostPerInterval(manager, start, end, index, distance_cost); + return; + } + if (manager->free_intervals_ != NULL) { + interval_new = manager->free_intervals_; + manager->free_intervals_ = interval_new->next_; + } else if (manager->recycled_intervals_ != NULL) { + interval_new = manager->recycled_intervals_; + manager->recycled_intervals_ = interval_new->next_; + } else { // malloc for good + interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new)); + if (interval_new == NULL) { + // Write down the interval if we cannot create it. 
+      UpdateCostPerInterval(manager, start, end, index, distance_cost);
+      return;
+    }
+  }
+
+  interval_new->distance_cost_ = distance_cost;
+  interval_new->lower_ = lower;
+  interval_new->upper_ = upper;
+  interval_new->index_ = index;
+  interval_new->start_ = start;
+  interval_new->end_ = end;
+  PositionOrphanInterval(manager, interval_new, interval_in);
+
+  ++manager->count_;
+}
+
+// When an interval has its start_ or end_ modified, it needs to be
+// repositioned in the linked list.
+static WEBP_INLINE void RepositionInterval(CostManager* const manager,
+                                           CostInterval* const interval) {
+  if (IsCostCacheIntervalWritable(interval->start_, interval->end_)) {
+    // Maybe interval has been resized and is small enough to be removed.
+    UpdateCostPerInterval(manager, interval->start_, interval->end_,
+                          interval->index_, interval->distance_cost_);
+    PopInterval(manager, interval);
+    return;
+  }
+
+  // Early exit if interval is at the right spot.
+  if ((interval->previous_ == NULL ||
+       interval->previous_->start_ <= interval->start_) &&
+      (interval->next_ == NULL ||
+       interval->start_ <= interval->next_->start_)) {
+    return;
+  }
+
+  ConnectIntervals(manager, interval->previous_, interval->next_);
+  PositionOrphanInterval(manager, interval, interval->previous_);
+}
+
+// Given a new cost interval defined by its start at index, its last value and
+// distance_cost, add its contributions to the previous intervals and costs.
+// If handling the interval or one of its subintervals becomes too heavy, its
+// contribution is added to the costs right away.
+static WEBP_INLINE void PushInterval(CostManager* const manager, + double distance_cost, int index, + int last) { + size_t i; + CostInterval* interval = manager->head_; + CostInterval* interval_next; + const CostCacheInterval* const cost_cache_intervals = + manager->cache_intervals_; + + for (i = 0; i < manager->cache_intervals_size_ && + cost_cache_intervals[i].start_ < last; + ++i) { + // Define the intersection of the ith interval with the new one. + int start = index + cost_cache_intervals[i].start_; + const int end = index + (cost_cache_intervals[i].end_ > last + ? last + : cost_cache_intervals[i].end_); + const double lower_in = cost_cache_intervals[i].lower_; + const double upper_in = cost_cache_intervals[i].upper_; + const double lower_full_in = distance_cost + lower_in; + const double upper_full_in = distance_cost + upper_in; + + if (cost_cache_intervals[i].do_write_) { + UpdateCostPerInterval(manager, start, end, index, distance_cost); + continue; + } + + for (; interval != NULL && interval->start_ < end && start < end; + interval = interval_next) { + const double lower_full_interval = + interval->distance_cost_ + interval->lower_; + const double upper_full_interval = + interval->distance_cost_ + interval->upper_; + + interval_next = interval->next_; + + // Make sure we have some overlap + if (start >= interval->end_) continue; + + if (lower_full_in >= upper_full_interval) { + // When intervals are represented, the lower, the better. + // [**********************************************************] + // start end + // [----------------------------------] + // interval->start_ interval->end_ + // If we are worse than what we already have, add whatever we have so + // far up to interval. + const int start_new = interval->end_; + InsertInterval(manager, interval, distance_cost, lower_in, upper_in, + index, start, interval->start_); + start = start_new; + continue; + } + + // We know the two intervals intersect. 
+ if (upper_full_in >= lower_full_interval) { + // There is no clear cut on which is best, so let's keep both. + // [*********[*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*]***********] + // start interval->start_ interval->end_ end + // OR + // [*********[*-*-*-*-*-*-*-*-*-*-*-]----------------------] + // start interval->start_ end interval->end_ + const int end_new = (interval->end_ <= end) ? interval->end_ : end; + InsertInterval(manager, interval, distance_cost, lower_in, upper_in, + index, start, end_new); + start = end_new; + } else if (start <= interval->start_ && interval->end_ <= end) { + // [----------------------------------] + // interval->start_ interval->end_ + // [**************************************************************] + // start end + // We can safely remove the old interval as it is fully included. + PopInterval(manager, interval); + } else { + if (interval->start_ <= start && end <= interval->end_) { + // [--------------------------------------------------------------] + // interval->start_ interval->end_ + // [*****************************] + // start end + // We have to split the old interval as it fully contains the new one. + const int end_original = interval->end_; + interval->end_ = start; + InsertInterval(manager, interval, interval->distance_cost_, + interval->lower_, interval->upper_, interval->index_, + end, end_original); + } else if (interval->start_ < start) { + // [------------------------------------] + // interval->start_ interval->end_ + // [*****************************] + // start end + interval->end_ = start; + } else { + // [------------------------------------] + // interval->start_ interval->end_ + // [*****************************] + // start end + interval->start_ = end; + } + + // The interval has been modified, we need to reposition it or write it. + RepositionInterval(manager, interval); + } + } + // Insert the remaining interval from start to end. 
+ InsertInterval(manager, interval, distance_cost, lower_in, upper_in, index, + start, end); + } +} + static int BackwardReferencesHashChainDistanceOnly( - int xsize, int ysize, const uint32_t* const argb, - int quality, int cache_bits, VP8LHashChain* const hash_chain, + int xsize, int ysize, const uint32_t* const argb, int quality, + int cache_bits, const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs, uint16_t* const dist_array) { int i; int ok = 0; int cc_init = 0; const int pix_count = xsize * ysize; const int use_color_cache = (cache_bits > 0); - float* const cost = - (float*)WebPSafeMalloc(pix_count, sizeof(*cost)); const size_t literal_array_size = sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES + ((cache_bits > 0) ? (1 << cache_bits) : 0)); const size_t cost_model_size = sizeof(CostModel) + literal_array_size; CostModel* const cost_model = - (CostModel*)WebPSafeMalloc(1ULL, cost_model_size); + (CostModel*)WebPSafeCalloc(1ULL, cost_model_size); VP8LColorCache hashers; const int skip_length = 32 + quality; const int skip_min_distance_code = 2; - int iter_max = GetMaxItersForQuality(quality, 0); - const int window_size = GetWindowSizeForHashChain(quality, xsize); + CostManager* cost_manager = + (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager)); - if (cost == NULL || cost_model == NULL) goto Error; + if (cost_model == NULL || cost_manager == NULL) goto Error; cost_model->literal_ = (double*)(cost_model + 1); if (use_color_cache) { @@ -634,34 +1199,91 @@ static int BackwardReferencesHashChainDistanceOnly( goto Error; } - for (i = 0; i < pix_count; ++i) cost[i] = 1e38f; + if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) { + goto Error; + } // We loop one pixel at a time, but store all currently best points to // non-processed locations from this point. dist_array[0] = 0; - HashChainReset(hash_chain); // Add first pixel as literal. 
- AddSingleLiteralWithCostModel(argb + 0, hash_chain, &hashers, cost_model, 0, - 0, use_color_cache, 0.0, cost, dist_array); + AddSingleLiteralWithCostModel(argb + 0, &hashers, cost_model, 0, + use_color_cache, 0.0, cost_manager->costs_, + dist_array); + for (i = 1; i < pix_count - 1; ++i) { - int offset = 0; - int len = 0; - double prev_cost = cost[i - 1]; - const int max_len = MaxFindCopyLength(pix_count - i); - HashChainFindCopy(hash_chain, i, argb, max_len, window_size, - iter_max, &offset, &len); + int offset = 0, len = 0; + double prev_cost = cost_manager->costs_[i - 1]; + HashChainFindCopy(hash_chain, i, &offset, &len); if (len >= MIN_LENGTH) { const int code = DistanceToPlaneCode(xsize, offset); - const double distance_cost = - prev_cost + GetDistanceCost(cost_model, code); - int k; - for (k = 1; k < len; ++k) { - const double cost_val = distance_cost + GetLengthCost(cost_model, k); - if (cost[i + k] > cost_val) { - cost[i + k] = (float)cost_val; - dist_array[i + k] = k + 1; + const double offset_cost = GetDistanceCost(cost_model, code); + const int first_i = i; + int j_max = 0, interval_ends_index = 0; + const int is_offset_zero = (offset_cost == 0.); + + if (!is_offset_zero) { + j_max = (int)ceil( + (cost_manager->max_cost_cache_ - cost_manager->min_cost_cache_) / + offset_cost); + if (j_max < 1) { + j_max = 1; + } else if (j_max > cost_manager->interval_ends_size_ - 1) { + // This could only happen in the case of MAX_LENGTH. + j_max = cost_manager->interval_ends_size_ - 1; + } + } // else j_max is unused anyway. + + // Instead of considering all contributions from a pixel i by calling: + // PushInterval(cost_manager, prev_cost + offset_cost, i, len); + // we optimize these contributions in case offset_cost stays the same for + // consecutive pixels. This describes a set of pixels similar to a + // previous set (e.g. constant color regions). 
+ for (; i < pix_count - 1; ++i) { + int offset_next, len_next; + prev_cost = cost_manager->costs_[i - 1]; + + if (is_offset_zero) { + // No optimization can be made so we just push all of the + // contributions from i. + PushInterval(cost_manager, prev_cost, i, len); + } else { + // j_max is chosen as the smallest j such that: + // max of cost_cache_ < j*offset cost + min of cost_cache_ + // Therefore, the pixel influenced by i-j_max, cannot be influenced + // by i. Only the costs after the end of what i contributed need to be + // updated. cost_manager->interval_ends_ is a circular buffer that + // stores those ends. + const double distance_cost = prev_cost + offset_cost; + int j = cost_manager->interval_ends_[interval_ends_index]; + if (i - first_i <= j_max || + !IsCostCacheIntervalWritable(j, i + len)) { + PushInterval(cost_manager, distance_cost, i, len); + } else { + for (; j < i + len; ++j) { + UpdateCost(cost_manager, j, i, distance_cost); + } + } + // Store the new end in the circular buffer. + assert(interval_ends_index < cost_manager->interval_ends_size_); + cost_manager->interval_ends_[interval_ends_index] = i + len; + if (++interval_ends_index > j_max) interval_ends_index = 0; } + + // Check whether i is the last pixel to consider, as it is handled + // differently. + if (i + 1 >= pix_count - 1) break; + HashChainFindCopy(hash_chain, i + 1, &offset_next, &len_next); + if (offset_next != offset) break; + len = len_next; + UpdateCostPerIndex(cost_manager, i); + AddSingleLiteralWithCostModel(argb + i, &hashers, cost_model, i, + use_color_cache, prev_cost, + cost_manager->costs_, dist_array); } + // Submit the last pixel. + UpdateCostPerIndex(cost_manager, i + 1); + // This if is for speedup only. It roughly doubles the speed, and // makes compression worse by .1 %. if (len >= skip_length && code <= skip_min_distance_code) { @@ -669,53 +1291,55 @@ static int BackwardReferencesHashChainDistanceOnly( // lookups for better copies. // 1) insert the hashes. 
if (use_color_cache) { + int k; for (k = 0; k < len; ++k) { VP8LColorCacheInsert(&hashers, argb[i + k]); } } - // 2) Add to the hash_chain (but cannot add the last pixel) + // 2) jump. { - const int last = (len + i < pix_count - 1) ? len + i - : pix_count - 1; - for (k = i; k < last; ++k) { - HashChainInsert(hash_chain, &argb[k], k); - } + const int i_next = i + len - 1; // for loop does ++i, thus -1 here. + for (; i <= i_next; ++i) UpdateCostPerIndex(cost_manager, i + 1); + i = i_next; } - // 3) jump. - i += len - 1; // for loop does ++i, thus -1 here. goto next_symbol; } - if (len != MIN_LENGTH) { + if (len > MIN_LENGTH) { int code_min_length; double cost_total; - HashChainFindOffset(hash_chain, i, argb, MIN_LENGTH, window_size, - &offset); + offset = HashChainFindOffset(hash_chain, i); code_min_length = DistanceToPlaneCode(xsize, offset); cost_total = prev_cost + GetDistanceCost(cost_model, code_min_length) + GetLengthCost(cost_model, 1); - if (cost[i + 1] > cost_total) { - cost[i + 1] = (float)cost_total; + if (cost_manager->costs_[i + 1] > cost_total) { + cost_manager->costs_[i + 1] = (float)cost_total; dist_array[i + 1] = 2; } } + } else { // len < MIN_LENGTH + UpdateCostPerIndex(cost_manager, i + 1); } - AddSingleLiteralWithCostModel(argb + i, hash_chain, &hashers, cost_model, i, - 0, use_color_cache, prev_cost, cost, - dist_array); + + AddSingleLiteralWithCostModel(argb + i, &hashers, cost_model, i, + use_color_cache, prev_cost, + cost_manager->costs_, dist_array); + next_symbol: ; } // Handle the last pixel. 
if (i == (pix_count - 1)) { - AddSingleLiteralWithCostModel(argb + i, hash_chain, &hashers, cost_model, i, - 1, use_color_cache, cost[pix_count - 2], cost, - dist_array); + AddSingleLiteralWithCostModel( + argb + i, &hashers, cost_model, i, use_color_cache, + cost_manager->costs_[pix_count - 2], cost_manager->costs_, dist_array); } + ok = !refs->error_; Error: if (cc_init) VP8LColorCacheClear(&hashers); + CostManagerClear(cost_manager); WebPSafeFree(cost_model); - WebPSafeFree(cost); + WebPSafeFree(cost_manager); return ok; } @@ -739,18 +1363,14 @@ static void TraceBackwards(uint16_t* const dist_array, } static int BackwardReferencesHashChainFollowChosenPath( - int xsize, int ysize, const uint32_t* const argb, - int quality, int cache_bits, + const uint32_t* const argb, int cache_bits, const uint16_t* const chosen_path, int chosen_path_size, - VP8LHashChain* const hash_chain, - VP8LBackwardRefs* const refs) { - const int pix_count = xsize * ysize; + const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) { const int use_color_cache = (cache_bits > 0); int ix; int i = 0; int ok = 0; int cc_init = 0; - const int window_size = GetWindowSizeForHashChain(quality, xsize); VP8LColorCache hashers; if (use_color_cache) { @@ -759,25 +1379,17 @@ static int BackwardReferencesHashChainFollowChosenPath( } ClearBackwardRefs(refs); - HashChainReset(hash_chain); for (ix = 0; ix < chosen_path_size; ++ix) { - int offset = 0; const int len = chosen_path[ix]; if (len != 1) { int k; - HashChainFindOffset(hash_chain, i, argb, len, window_size, &offset); + const int offset = HashChainFindOffset(hash_chain, i); BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len)); if (use_color_cache) { for (k = 0; k < len; ++k) { VP8LColorCacheInsert(&hashers, argb[i + k]); } } - { - const int last = (len < pix_count - 1 - i) ? 
len : pix_count - 1 - i; - for (k = 0; k < last; ++k) { - HashChainInsert(hash_chain, &argb[i + k], i + k); - } - } i += len; } else { PixOrCopy v; @@ -790,9 +1402,6 @@ static int BackwardReferencesHashChainFollowChosenPath( v = PixOrCopyCreateLiteral(argb[i]); } BackwardRefsCursorAdd(refs, v); - if (i + 1 < pix_count) { - HashChainInsert(hash_chain, &argb[i], i); - } ++i; } } @@ -803,11 +1412,10 @@ static int BackwardReferencesHashChainFollowChosenPath( } // Returns 1 on success. -static int BackwardReferencesTraceBackwards(int xsize, int ysize, - const uint32_t* const argb, - int quality, int cache_bits, - VP8LHashChain* const hash_chain, - VP8LBackwardRefs* const refs) { +static int BackwardReferencesTraceBackwards( + int xsize, int ysize, const uint32_t* const argb, int quality, + int cache_bits, const VP8LHashChain* const hash_chain, + VP8LBackwardRefs* const refs) { int ok = 0; const int dist_array_size = xsize * ysize; uint16_t* chosen_path = NULL; @@ -824,8 +1432,7 @@ static int BackwardReferencesTraceBackwards(int xsize, int ysize, } TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size); if (!BackwardReferencesHashChainFollowChosenPath( - xsize, ysize, argb, quality, cache_bits, chosen_path, chosen_path_size, - hash_chain, refs)) { + argb, cache_bits, chosen_path, chosen_path_size, hash_chain, refs)) { goto Error; } ok = 1; @@ -913,7 +1520,7 @@ static double ComputeCacheEntropy(const uint32_t* argb, // Returns 0 in case of memory error. static int CalculateBestCacheSize(const uint32_t* const argb, int xsize, int ysize, int quality, - VP8LHashChain* const hash_chain, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs, int* const lz77_computed, int* const best_cache_bits) { @@ -933,8 +1540,8 @@ static int CalculateBestCacheSize(const uint32_t* const argb, // Local color cache is disabled. 
return 1; } - if (!BackwardReferencesLz77(xsize, ysize, argb, cache_bits_low, quality, 0, - hash_chain, refs)) { + if (!BackwardReferencesLz77(xsize, ysize, argb, cache_bits_low, hash_chain, + refs)) { return 0; } // Do a binary search to find the optimal entropy for cache_bits. @@ -999,13 +1606,12 @@ static int BackwardRefsWithLocalCache(const uint32_t* const argb, } static VP8LBackwardRefs* GetBackwardReferencesLowEffort( - int width, int height, const uint32_t* const argb, int quality, - int* const cache_bits, VP8LHashChain* const hash_chain, + int width, int height, const uint32_t* const argb, + int* const cache_bits, const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[2]) { VP8LBackwardRefs* refs_lz77 = &refs_array[0]; *cache_bits = 0; - if (!BackwardReferencesLz77(width, height, argb, 0, quality, - 1 /* Low effort. */, hash_chain, refs_lz77)) { + if (!BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) { return NULL; } BackwardReferences2DLocality(width, refs_lz77); @@ -1014,7 +1620,7 @@ static VP8LBackwardRefs* GetBackwardReferencesLowEffort( static VP8LBackwardRefs* GetBackwardReferences( int width, int height, const uint32_t* const argb, int quality, - int* const cache_bits, VP8LHashChain* const hash_chain, + int* const cache_bits, const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[2]) { int lz77_is_useful; int lz77_computed; @@ -1037,8 +1643,8 @@ static VP8LBackwardRefs* GetBackwardReferences( } } } else { - if (!BackwardReferencesLz77(width, height, argb, *cache_bits, quality, - 0 /* Low effort. 
*/, hash_chain, refs_lz77)) { + if (!BackwardReferencesLz77(width, height, argb, *cache_bits, hash_chain, + refs_lz77)) { goto Error; } } @@ -1097,11 +1703,11 @@ static VP8LBackwardRefs* GetBackwardReferences( VP8LBackwardRefs* VP8LGetBackwardReferences( int width, int height, const uint32_t* const argb, int quality, - int low_effort, int* const cache_bits, VP8LHashChain* const hash_chain, - VP8LBackwardRefs refs_array[2]) { + int low_effort, int* const cache_bits, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[2]) { if (low_effort) { - return GetBackwardReferencesLowEffort(width, height, argb, quality, - cache_bits, hash_chain, refs_array); + return GetBackwardReferencesLowEffort(width, height, argb, cache_bits, + hash_chain, refs_array); } else { return GetBackwardReferences(width, height, argb, quality, cache_bits, hash_chain, refs_array); diff --git a/drivers/webp/enc/backward_references.h b/drivers/webp/enc/backward_references.h index e410b06f7d..b72a01fb0e 100644 --- a/drivers/webp/enc/backward_references.h +++ b/drivers/webp/enc/backward_references.h @@ -115,11 +115,12 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) { typedef struct VP8LHashChain VP8LHashChain; struct VP8LHashChain { - // Stores the most recently added position with the given hash value. - int32_t hash_to_first_index_[HASH_SIZE]; - // chain_[pos] stores the previous position with the same hash value - // for every pixel in the image. - int32_t* chain_; + // The 20 most significant bits contain the offset at which the best match + // is found. These 20 bits are the limit defined by GetWindowSizeForHashChain + // (through WINDOW_SIZE = 1<<20). + // The lower 12 bits contain the length of the match. The 12 bit limit is + // defined in MaxFindCopyLength with MAX_LENGTH=4096. + uint32_t* offset_length_; // This is the maximum size of the hash_chain that can be constructed. // Typically this is the pixel count (width x height) for a given image. 
int size_; @@ -127,6 +128,9 @@ struct VP8LHashChain { // Must be called first, to set size. int VP8LHashChainInit(VP8LHashChain* const p, int size); +// Pre-compute the best matches for argb. +int VP8LHashChainFill(VP8LHashChain* const p, int quality, + const uint32_t* const argb, int xsize, int ysize); void VP8LHashChainClear(VP8LHashChain* const p); // release memory // ----------------------------------------------------------------------------- @@ -192,8 +196,8 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) { // refs[0] or refs[1]. VP8LBackwardRefs* VP8LGetBackwardReferences( int width, int height, const uint32_t* const argb, int quality, - int low_effort, int* const cache_bits, VP8LHashChain* const hash_chain, - VP8LBackwardRefs refs[2]); + int low_effort, int* const cache_bits, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs[2]); #ifdef __cplusplus } diff --git a/drivers/webp/enc/filter.c b/drivers/webp/enc/filter.c index 1a4dd947fb..e8ea8b4ff2 100644 --- a/drivers/webp/enc/filter.c +++ b/drivers/webp/enc/filter.c @@ -107,10 +107,9 @@ static void DoFilter(const VP8EncIterator* const it, int level) { //------------------------------------------------------------------------------ // SSIM metric -enum { KERNEL = 3 }; static const double kMinValue = 1.e-10; // minimal threshold -void VP8SSIMAddStats(const DistoStats* const src, DistoStats* const dst) { +void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst) { dst->w += src->w; dst->xm += src->xm; dst->ym += src->ym; @@ -119,32 +118,7 @@ void VP8SSIMAddStats(const DistoStats* const src, DistoStats* const dst) { dst->yym += src->yym; } -static void VP8SSIMAccumulate(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - int xo, int yo, int W, int H, - DistoStats* const stats) { - const int ymin = (yo - KERNEL < 0) ? 0 : yo - KERNEL; - const int ymax = (yo + KERNEL > H - 1) ? H - 1 : yo + KERNEL; - const int xmin = (xo - KERNEL < 0) ? 
0 : xo - KERNEL; - const int xmax = (xo + KERNEL > W - 1) ? W - 1 : xo + KERNEL; - int x, y; - src1 += ymin * stride1; - src2 += ymin * stride2; - for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) { - for (x = xmin; x <= xmax; ++x) { - const int s1 = src1[x]; - const int s2 = src2[x]; - stats->w += 1; - stats->xm += s1; - stats->ym += s2; - stats->xxm += s1 * s1; - stats->xym += s1 * s2; - stats->yym += s2 * s2; - } - } -} - -double VP8SSIMGet(const DistoStats* const stats) { +double VP8SSIMGet(const VP8DistoStats* const stats) { const double xmxm = stats->xm * stats->xm; const double ymym = stats->ym * stats->ym; const double xmym = stats->xm * stats->ym; @@ -165,7 +139,7 @@ double VP8SSIMGet(const DistoStats* const stats) { return (fden != 0.) ? fnum / fden : kMinValue; } -double VP8SSIMGetSquaredError(const DistoStats* const s) { +double VP8SSIMGetSquaredError(const VP8DistoStats* const s) { if (s->w > 0.) { const double iw2 = 1. / (s->w * s->w); const double sxx = s->xxm * s->w - s->xm * s->xm; @@ -177,34 +151,66 @@ double VP8SSIMGetSquaredError(const DistoStats* const s) { return kMinValue; } +#define LIMIT(A, M) ((A) > (M) ? 
(M) : (A)) +static void VP8SSIMAccumulateRow(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int y, int W, int H, + VP8DistoStats* const stats) { + int x = 0; + const int w0 = LIMIT(VP8_SSIM_KERNEL, W); + for (x = 0; x < w0; ++x) { + VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats); + } + for (; x <= W - 8 + VP8_SSIM_KERNEL; ++x) { + VP8SSIMAccumulate( + src1 + (y - VP8_SSIM_KERNEL) * stride1 + (x - VP8_SSIM_KERNEL), stride1, + src2 + (y - VP8_SSIM_KERNEL) * stride2 + (x - VP8_SSIM_KERNEL), stride2, + stats); + } + for (; x < W; ++x) { + VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats); + } +} + void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1, const uint8_t* src2, int stride2, - int W, int H, DistoStats* const stats) { + int W, int H, VP8DistoStats* const stats) { int x, y; - for (y = 0; y < H; ++y) { + const int h0 = LIMIT(VP8_SSIM_KERNEL, H); + const int h1 = LIMIT(VP8_SSIM_KERNEL, H - VP8_SSIM_KERNEL); + for (y = 0; y < h0; ++y) { + for (x = 0; x < W; ++x) { + VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats); + } + } + for (; y < h1; ++y) { + VP8SSIMAccumulateRow(src1, stride1, src2, stride2, y, W, H, stats); + } + for (; y < H; ++y) { for (x = 0; x < W; ++x) { - VP8SSIMAccumulate(src1, stride1, src2, stride2, x, y, W, H, stats); + VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats); } } } +#undef LIMIT static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) { int x, y; - DistoStats s = { .0, .0, .0, .0, .0, .0 }; + VP8DistoStats s = { .0, .0, .0, .0, .0, .0 }; // compute SSIM in a 10 x 10 window - for (x = 3; x < 13; x++) { - for (y = 3; y < 13; y++) { - VP8SSIMAccumulate(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS, - x, y, 16, 16, &s); + for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) { + for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) { + VP8SSIMAccumulateClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + 
Y_OFF_ENC, BPS, + x, y, 16, 16, &s); } } for (x = 1; x < 7; x++) { for (y = 1; y < 7; y++) { - VP8SSIMAccumulate(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS, - x, y, 8, 8, &s); - VP8SSIMAccumulate(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS, - x, y, 8, 8, &s); + VP8SSIMAccumulateClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS, + x, y, 8, 8, &s); + VP8SSIMAccumulateClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS, + x, y, 8, 8, &s); } } return VP8SSIMGet(&s); @@ -222,6 +228,7 @@ void VP8InitFilter(VP8EncIterator* const it) { (*it->lf_stats_)[s][i] = 0; } } + VP8SSIMDspInit(); } } @@ -229,7 +236,7 @@ void VP8StoreFilterStats(VP8EncIterator* const it) { int d; VP8Encoder* const enc = it->enc_; const int s = it->mb_->segment_; - const int level0 = enc->dqm_[s].fstrength_; // TODO: ref_lf_delta[] + const int level0 = enc->dqm_[s].fstrength_; // explore +/-quant range of values around level0 const int delta_min = -enc->dqm_[s].quant_; diff --git a/drivers/webp/enc/histogram.c b/drivers/webp/enc/histogram.c index 68c27fb1db..61544f4ccd 100644 --- a/drivers/webp/enc/histogram.c +++ b/drivers/webp/enc/histogram.c @@ -157,6 +157,109 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo, } // ----------------------------------------------------------------------------- +// Entropy-related functions. + +static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) { + double mix; + if (entropy->nonzeros < 5) { + if (entropy->nonzeros <= 1) { + return 0; + } + // Two symbols, they will be 0 and 1 in a Huffman code. + // Let's mix in a bit of entropy to favor good clustering when + // distributions of these are combined. + if (entropy->nonzeros == 2) { + return 0.99 * entropy->sum + 0.01 * entropy->entropy; + } + // No matter what the entropy says, we cannot be better than min_limit + // with Huffman coding. 
I am mixing a bit of entropy into the + // min_limit since it produces much better (~0.5 %) compression results + // perhaps because of better entropy clustering. + if (entropy->nonzeros == 3) { + mix = 0.95; + } else { + mix = 0.7; // nonzeros == 4. + } + } else { + mix = 0.627; + } + + { + double min_limit = 2 * entropy->sum - entropy->max_val; + min_limit = mix * min_limit + (1.0 - mix) * entropy->entropy; + return (entropy->entropy < min_limit) ? min_limit : entropy->entropy; + } +} + +double VP8LBitsEntropy(const uint32_t* const array, int n, + uint32_t* const trivial_symbol) { + VP8LBitEntropy entropy; + VP8LBitsEntropyUnrefined(array, n, &entropy); + if (trivial_symbol != NULL) { + *trivial_symbol = + (entropy.nonzeros == 1) ? entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM; + } + + return BitsEntropyRefine(&entropy); +} + +static double InitialHuffmanCost(void) { + // Small bias because Huffman code length is typically not stored in + // full length. + static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3; + static const double kSmallBias = 9.1; + return kHuffmanCodeOfHuffmanCodeSize - kSmallBias; +} + +// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3) +static double FinalHuffmanCost(const VP8LStreaks* const stats) { + double retval = InitialHuffmanCost(); + retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1]; + retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1]; + retval += 1.796875 * stats->streaks[0][0]; + retval += 3.28125 * stats->streaks[1][0]; + return retval; +} + +// Get the symbol entropy for the distribution 'population'. +// Set 'trivial_sym', if there's only one symbol present in the distribution. 
+static double PopulationCost(const uint32_t* const population, int length, + uint32_t* const trivial_sym) { + VP8LBitEntropy bit_entropy; + VP8LStreaks stats; + VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats); + if (trivial_sym != NULL) { + *trivial_sym = (bit_entropy.nonzeros == 1) ? bit_entropy.nonzero_code + : VP8L_NON_TRIVIAL_SYM; + } + + return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats); +} + +static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X, + const uint32_t* const Y, + int length) { + VP8LBitEntropy bit_entropy; + VP8LStreaks stats; + VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats); + + return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats); +} + +// Estimates the Entropy + Huffman + other block overhead size cost. +double VP8LHistogramEstimateBits(const VP8LHistogram* const p) { + return + PopulationCost( + p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), NULL) + + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL) + + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL) + + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL) + + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL) + + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES) + + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES); +} + +// ----------------------------------------------------------------------------- // Various histogram combine/cost-eval functions static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, @@ -165,26 +268,25 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, double* cost) { const int palette_code_bits = a->palette_code_bits_; assert(a->palette_code_bits_ == b->palette_code_bits_); - *cost += VP8LGetCombinedEntropy(a->literal_, b->literal_, - VP8LHistogramNumCodes(palette_code_bits)); + *cost += GetCombinedEntropy(a->literal_, b->literal_, + VP8LHistogramNumCodes(palette_code_bits)); *cost += VP8LExtraCostCombined(a->literal_ + 
NUM_LITERAL_CODES, b->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES); if (*cost > cost_threshold) return 0; - *cost += VP8LGetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES); + *cost += GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES); if (*cost > cost_threshold) return 0; - *cost += VP8LGetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES); + *cost += GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES); if (*cost > cost_threshold) return 0; - *cost += VP8LGetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES); + *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES); if (*cost > cost_threshold) return 0; - *cost += VP8LGetCombinedEntropy(a->distance_, b->distance_, - NUM_DISTANCE_CODES); - *cost += VP8LExtraCostCombined(a->distance_, b->distance_, - NUM_DISTANCE_CODES); + *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES); + *cost += + VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES); if (*cost > cost_threshold) return 0; return 1; @@ -262,17 +364,17 @@ static void UpdateDominantCostRange( static void UpdateHistogramCost(VP8LHistogram* const h) { uint32_t alpha_sym, red_sym, blue_sym; - const double alpha_cost = VP8LPopulationCost(h->alpha_, NUM_LITERAL_CODES, - &alpha_sym); + const double alpha_cost = + PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym); const double distance_cost = - VP8LPopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL) + + PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL) + VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES); const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_); - h->literal_cost_ = VP8LPopulationCost(h->literal_, num_codes, NULL) + + h->literal_cost_ = PopulationCost(h->literal_, num_codes, NULL) + VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES); - h->red_cost_ = VP8LPopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym); - h->blue_cost_ = VP8LPopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym); + 
h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym); + h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym); h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ + alpha_cost + distance_cost; if ((alpha_sym | red_sym | blue_sym) == VP8L_NON_TRIVIAL_SYM) { @@ -284,29 +386,27 @@ static void UpdateHistogramCost(VP8LHistogram* const h) { } static int GetBinIdForEntropy(double min, double max, double val) { - const double range = max - min + 1e-6; - const double delta = val - min; - return (int)(NUM_PARTITIONS * delta / range); + const double range = max - min; + if (range > 0.) { + const double delta = val - min; + return (int)((NUM_PARTITIONS - 1e-6) * delta / range); + } else { + return 0; + } } -static int GetHistoBinIndexLowEffort( - const VP8LHistogram* const h, const DominantCostRange* const c) { - const int bin_id = GetBinIdForEntropy(c->literal_min_, c->literal_max_, - h->literal_cost_); +static int GetHistoBinIndex(const VP8LHistogram* const h, + const DominantCostRange* const c, int low_effort) { + int bin_id = GetBinIdForEntropy(c->literal_min_, c->literal_max_, + h->literal_cost_); assert(bin_id < NUM_PARTITIONS); - return bin_id; -} - -static int GetHistoBinIndex( - const VP8LHistogram* const h, const DominantCostRange* const c) { - const int bin_id = - GetBinIdForEntropy(c->blue_min_, c->blue_max_, h->blue_cost_) + - NUM_PARTITIONS * GetBinIdForEntropy(c->red_min_, c->red_max_, - h->red_cost_) + - NUM_PARTITIONS * NUM_PARTITIONS * GetBinIdForEntropy(c->literal_min_, - c->literal_max_, - h->literal_cost_); - assert(bin_id < BIN_SIZE); + if (!low_effort) { + bin_id = bin_id * NUM_PARTITIONS + + GetBinIdForEntropy(c->red_min_, c->red_max_, h->red_cost_); + bin_id = bin_id * NUM_PARTITIONS + + GetBinIdForEntropy(c->blue_min_, c->blue_max_, h->blue_cost_); + assert(bin_id < BIN_SIZE); + } return bin_id; } @@ -367,16 +467,13 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo, // bin-hash 
histograms on three of the dominant (literal, red and blue) // symbol costs. for (i = 0; i < histo_size; ++i) { - int num_histos; - VP8LHistogram* const histo = histograms[i]; - const int16_t bin_id = low_effort ? - (int16_t)GetHistoBinIndexLowEffort(histo, &cost_range) : - (int16_t)GetHistoBinIndex(histo, &cost_range); + const VP8LHistogram* const histo = histograms[i]; + const int bin_id = GetHistoBinIndex(histo, &cost_range, low_effort); const int bin_offset = bin_id * bin_depth; // bin_map[n][0] for every bin 'n' maintains the counter for the number of // histograms in that bin. // Get and increment the num_histos in that bin. - num_histos = ++bin_map[bin_offset]; + const int num_histos = ++bin_map[bin_offset]; assert(bin_offset + num_histos < bin_depth * BIN_SIZE); // Add histogram i'th index at num_histos (last) position in the bin_map. bin_map[bin_offset + num_histos] = i; @@ -478,26 +575,31 @@ typedef struct { } HistogramPair; typedef struct { - HistogramPair* heap; - int* positions; + HistogramPair* queue; int size; - int max_index; -} HistoHeap; - -static int HistoHeapInit(HistoHeap* const histo_heap, const int max_index) { - histo_heap->size = 0; - histo_heap->max_index = max_index; - histo_heap->heap = WebPSafeMalloc(max_index * max_index, - sizeof(*histo_heap->heap)); - histo_heap->positions = WebPSafeMalloc(max_index * max_index, - sizeof(*histo_heap->positions)); - return histo_heap->heap != NULL && histo_heap->positions != NULL; -} - -static void HistoHeapClear(HistoHeap* const histo_heap) { - assert(histo_heap != NULL); - WebPSafeFree(histo_heap->heap); - WebPSafeFree(histo_heap->positions); + int max_size; +} HistoQueue; + +static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) { + histo_queue->size = 0; + // max_index^2 for the queue size is safe. 
If you look at + // HistogramCombineGreedy, and imagine that UpdateQueueFront always pushes + // data to the queue, you insert at most: + // - max_index*(max_index-1)/2 (the first two for loops) + // - max_index - 1 in the last for loop at the first iteration of the while + // loop, max_index - 2 at the second iteration ... therefore + // max_index*(max_index-1)/2 overall too + histo_queue->max_size = max_index * max_index; + // We allocate max_size + 1 because the last element at index "size" is + // used as temporary data (and it could be up to max_size). + histo_queue->queue = WebPSafeMalloc(histo_queue->max_size + 1, + sizeof(*histo_queue->queue)); + return histo_queue->queue != NULL; +} + +static void HistoQueueClear(HistoQueue* const histo_queue) { + assert(histo_queue != NULL); + WebPSafeFree(histo_queue->queue); } static void SwapHistogramPairs(HistogramPair *p1, @@ -507,66 +609,33 @@ static void SwapHistogramPairs(HistogramPair *p1, *p2 = tmp; } -// Given a valid min-heap in range [0, heap_size-1) this function places value -// heap[heap_size-1] into right location within heap and sets its position in -// positions array. -static void HeapPush(HistoHeap* const histo_heap) { - HistogramPair* const heap = histo_heap->heap - 1; - int* const positions = histo_heap->positions; - const int max_index = histo_heap->max_index; - int v; - ++histo_heap->size; - v = histo_heap->size; - while (v > 1 && heap[v].cost_diff < heap[v >> 1].cost_diff) { - SwapHistogramPairs(&heap[v], &heap[v >> 1]); - // Change position of moved pair in heap. - if (heap[v].idx1 >= 0) { - const int pos = heap[v].idx1 * max_index + heap[v].idx2; - assert(pos >= 0 && pos < max_index * max_index); - positions[pos] = v; - } - v >>= 1; - } - positions[heap[v].idx1 * max_index + heap[v].idx2] = v; -} - -// Given a valid min-heap in range [0, heap_size) this function shortens heap -// range by one and places element with the lowest value to (heap_size-1). 
-static void HeapPop(HistoHeap* const histo_heap) { - HistogramPair* const heap = histo_heap->heap - 1; - int* const positions = histo_heap->positions; - const int heap_size = histo_heap->size; - const int max_index = histo_heap->max_index; - int v = 1; - if (heap[v].idx1 >= 0) { - positions[heap[v].idx1 * max_index + heap[v].idx2] = -1; - } - SwapHistogramPairs(&heap[v], &heap[heap_size]); - while ((v << 1) < heap_size) { - int son = (heap[v << 1].cost_diff < heap[v].cost_diff) ? (v << 1) : v; - if (((v << 1) + 1) < heap_size && - heap[(v << 1) + 1].cost_diff < heap[son].cost_diff) { - son = (v << 1) + 1; - } - if (son == v) break; - SwapHistogramPairs(&heap[v], &heap[son]); - // Change position of moved pair in heap. - if (heap[v].idx1 >= 0) { - positions[heap[v].idx1 * max_index + heap[v].idx2] = v; - } - v = son; - } - if (heap[v].idx1 >= 0) { - positions[heap[v].idx1 * max_index + heap[v].idx2] = v; +// Given a valid priority queue in range [0, queue_size) this function checks +// whether histo_queue[queue_size] should be accepted and swaps it with the +// front if it is smaller. Otherwise, it leaves it as is. +static void UpdateQueueFront(HistoQueue* const histo_queue) { + if (histo_queue->queue[histo_queue->size].cost_diff >= 0) return; + + if (histo_queue->queue[histo_queue->size].cost_diff < + histo_queue->queue[0].cost_diff) { + SwapHistogramPairs(histo_queue->queue, + histo_queue->queue + histo_queue->size); } - --histo_heap->size; + ++histo_queue->size; + + // We cannot add more elements than the capacity. + // The allocation adds an extra element to the official capacity so that + // histo_queue->queue[histo_queue->max_size] is read/written within bound. 
+ assert(histo_queue->size <= histo_queue->max_size); } // ----------------------------------------------------------------------------- static void PreparePair(VP8LHistogram** histograms, int idx1, int idx2, - HistogramPair* const pair, - VP8LHistogram* const histos) { + HistogramPair* const pair) { + VP8LHistogram* h1; + VP8LHistogram* h2; + double sum_cost; + if (idx1 > idx2) { const int tmp = idx2; idx2 = idx1; @@ -574,60 +643,27 @@ static void PreparePair(VP8LHistogram** histograms, int idx1, int idx2, } pair->idx1 = idx1; pair->idx2 = idx2; - pair->cost_diff = - HistogramAddEval(histograms[idx1], histograms[idx2], histos, 0); - pair->cost_combo = histos->bit_cost_; -} - -#define POSITION_INVALID (-1) - -// Invalidates pairs intersecting (idx1, idx2) in heap. -static void InvalidatePairs(int idx1, int idx2, - const HistoHeap* const histo_heap) { - HistogramPair* const heap = histo_heap->heap - 1; - int* const positions = histo_heap->positions; - const int max_index = histo_heap->max_index; - int i; - for (i = 0; i < idx1; ++i) { - const int pos = positions[i * max_index + idx1]; - if (pos >= 0) { - heap[pos].idx1 = POSITION_INVALID; - } - } - for (i = idx1 + 1; i < max_index; ++i) { - const int pos = positions[idx1 * max_index + i]; - if (pos >= 0) { - heap[pos].idx1 = POSITION_INVALID; - } - } - for (i = 0; i < idx2; ++i) { - const int pos = positions[i * max_index + idx2]; - if (pos >= 0) { - heap[pos].idx1 = POSITION_INVALID; - } - } - for (i = idx2 + 1; i < max_index; ++i) { - const int pos = positions[idx2 * max_index + i]; - if (pos >= 0) { - heap[pos].idx1 = POSITION_INVALID; - } - } + h1 = histograms[idx1]; + h2 = histograms[idx2]; + sum_cost = h1->bit_cost_ + h2->bit_cost_; + pair->cost_combo = 0.; + GetCombinedHistogramEntropy(h1, h2, sum_cost, &pair->cost_combo); + pair->cost_diff = pair->cost_combo - sum_cost; } // Combines histograms by continuously choosing the one with the highest cost // reduction. 
-static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo, - VP8LHistogram* const histos) { +static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) { int ok = 0; int image_histo_size = image_histo->size; int i, j; VP8LHistogram** const histograms = image_histo->histograms; // Indexes of remaining histograms. int* const clusters = WebPSafeMalloc(image_histo_size, sizeof(*clusters)); - // Heap of histogram pairs. - HistoHeap histo_heap; + // Priority queue of histogram pairs. + HistoQueue histo_queue; - if (!HistoHeapInit(&histo_heap, image_histo_size) || clusters == NULL) { + if (!HistoQueueInit(&histo_queue, image_histo_size) || clusters == NULL) { goto End; } @@ -636,19 +672,17 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo, clusters[i] = i; for (j = i + 1; j < image_histo_size; ++j) { // Initialize positions array. - histo_heap.positions[i * histo_heap.max_index + j] = POSITION_INVALID; - PreparePair(histograms, i, j, &histo_heap.heap[histo_heap.size], histos); - if (histo_heap.heap[histo_heap.size].cost_diff < 0) { - HeapPush(&histo_heap); - } + PreparePair(histograms, i, j, &histo_queue.queue[histo_queue.size]); + UpdateQueueFront(&histo_queue); } } - while (image_histo_size > 1 && histo_heap.size > 0) { - const int idx1 = histo_heap.heap[0].idx1; - const int idx2 = histo_heap.heap[0].idx2; + while (image_histo_size > 1 && histo_queue.size > 0) { + HistogramPair* copy_to; + const int idx1 = histo_queue.queue[0].idx1; + const int idx2 = histo_queue.queue[0].idx2; VP8LHistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]); - histograms[idx1]->bit_cost_ = histo_heap.heap[0].cost_combo; + histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo; // Remove merged histogram. 
for (i = 0; i + 1 < image_histo_size; ++i) { if (clusters[i] >= idx2) { @@ -657,22 +691,31 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo, } --image_histo_size; - // Invalidate pairs intersecting the just combined best pair. - InvalidatePairs(idx1, idx2, &histo_heap); - - // Pop invalid pairs from the top of the heap. - while (histo_heap.size > 0 && histo_heap.heap[0].idx1 < 0) { - HeapPop(&histo_heap); + // Remove pairs intersecting the just combined best pair. This will + // therefore pop the head of the queue. + copy_to = histo_queue.queue; + for (i = 0; i < histo_queue.size; ++i) { + HistogramPair* const p = histo_queue.queue + i; + if (p->idx1 == idx1 || p->idx2 == idx1 || + p->idx1 == idx2 || p->idx2 == idx2) { + // Do not copy the invalid pair. + continue; + } + if (p->cost_diff < histo_queue.queue[0].cost_diff) { + // Replace the top of the queue if we found better. + SwapHistogramPairs(histo_queue.queue, p); + } + SwapHistogramPairs(copy_to, p); + ++copy_to; } + histo_queue.size = (int)(copy_to - histo_queue.queue); - // Push new pairs formed with combined histogram to the heap. + // Push new pairs formed with combined histogram to the queue. 
for (i = 0; i < image_histo_size; ++i) { if (clusters[i] != idx1) { PreparePair(histograms, idx1, clusters[i], - &histo_heap.heap[histo_heap.size], histos); - if (histo_heap.heap[histo_heap.size].cost_diff < 0) { - HeapPush(&histo_heap); - } + &histo_queue.queue[histo_queue.size]); + UpdateQueueFront(&histo_queue); } } } @@ -688,15 +731,14 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo, End: WebPSafeFree(clusters); - HistoHeapClear(&histo_heap); + HistoQueueClear(&histo_queue); return ok; } -static VP8LHistogram* HistogramCombineStochastic( - VP8LHistogramSet* const image_histo, - VP8LHistogram* tmp_histo, - VP8LHistogram* best_combo, - int quality, int min_cluster_size) { +static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo, + VP8LHistogram* tmp_histo, + VP8LHistogram* best_combo, + int quality, int min_cluster_size) { int iter; uint32_t seed = 0; int tries_with_no_success = 0; @@ -756,7 +798,6 @@ static VP8LHistogram* HistogramCombineStochastic( } } image_histo->size = image_histo_size; - return best_combo; } // ----------------------------------------------------------------------------- @@ -764,24 +805,23 @@ static VP8LHistogram* HistogramCombineStochastic( // Find the best 'out' histogram for each of the 'in' histograms. // Note: we assume that out[]->bit_cost_ is already up-to-date. 
-static void HistogramRemap(const VP8LHistogramSet* const orig_histo, - const VP8LHistogramSet* const image_histo, +static void HistogramRemap(const VP8LHistogramSet* const in, + const VP8LHistogramSet* const out, uint16_t* const symbols) { int i; - VP8LHistogram** const orig_histograms = orig_histo->histograms; - VP8LHistogram** const histograms = image_histo->histograms; - const int orig_histo_size = orig_histo->size; - const int image_histo_size = image_histo->size; - if (image_histo_size > 1) { - for (i = 0; i < orig_histo_size; ++i) { + VP8LHistogram** const in_histo = in->histograms; + VP8LHistogram** const out_histo = out->histograms; + const int in_size = in->size; + const int out_size = out->size; + if (out_size > 1) { + for (i = 0; i < in_size; ++i) { int best_out = 0; - double best_bits = - HistogramAddThresh(histograms[0], orig_histograms[i], MAX_COST); + double best_bits = MAX_COST; int k; - for (k = 1; k < image_histo_size; ++k) { + for (k = 0; k < out_size; ++k) { const double cur_bits = - HistogramAddThresh(histograms[k], orig_histograms[i], best_bits); - if (cur_bits < best_bits) { + HistogramAddThresh(out_histo[k], in_histo[i], best_bits); + if (k == 0 || cur_bits < best_bits) { best_bits = cur_bits; best_out = k; } @@ -789,20 +829,20 @@ static void HistogramRemap(const VP8LHistogramSet* const orig_histo, symbols[i] = best_out; } } else { - assert(image_histo_size == 1); - for (i = 0; i < orig_histo_size; ++i) { + assert(out_size == 1); + for (i = 0; i < in_size; ++i) { symbols[i] = 0; } } // Recompute each out based on raw and symbols. 
- for (i = 0; i < image_histo_size; ++i) { - HistogramClear(histograms[i]); + for (i = 0; i < out_size; ++i) { + HistogramClear(out_histo[i]); } - for (i = 0; i < orig_histo_size; ++i) { + for (i = 0; i < in_size; ++i) { const int idx = symbols[i]; - VP8LHistogramAdd(orig_histograms[i], histograms[idx], histograms[idx]); + VP8LHistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]); } } @@ -876,11 +916,10 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize, const float x = quality / 100.f; // cubic ramp between 1 and MAX_HISTO_GREEDY: const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1)); - cur_combo = HistogramCombineStochastic(image_histo, - tmp_histos->histograms[0], - cur_combo, quality, threshold_size); + HistogramCombineStochastic(image_histo, tmp_histos->histograms[0], + cur_combo, quality, threshold_size); if ((image_histo->size <= threshold_size) && - !HistogramCombineGreedy(image_histo, cur_combo)) { + !HistogramCombineGreedy(image_histo)) { goto Error; } } diff --git a/drivers/webp/enc/histogram.h b/drivers/webp/enc/histogram.h index 72f045793a..59de42b33e 100644 --- a/drivers/webp/enc/histogram.h +++ b/drivers/webp/enc/histogram.h @@ -24,6 +24,9 @@ extern "C" { #endif +// Not a trivial literal symbol. +#define VP8L_NON_TRIVIAL_SYM (0xffffffff) + // A simple container for histograms of data. typedef struct { // literal_ contains green literal, palette-code and @@ -103,6 +106,16 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize, VP8LHistogramSet* const tmp_histos, uint16_t* const histogram_symbols); +// Returns the entropy for the symbols in the input array. +// Also sets trivial_symbol to the code value, if the array has only one code +// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM. +double VP8LBitsEntropy(const uint32_t* const array, int n, + uint32_t* const trivial_symbol); + +// Estimate how many bits the combined entropy of literals and distance +// approximately maps to. 
+double VP8LHistogramEstimateBits(const VP8LHistogram* const p); + #ifdef __cplusplus } #endif diff --git a/drivers/webp/enc/near_lossless.c b/drivers/webp/enc/near_lossless.c index 9bc0f0e786..f4ab91f571 100644 --- a/drivers/webp/enc/near_lossless.c +++ b/drivers/webp/enc/near_lossless.c @@ -14,6 +14,7 @@ // Author: Jyrki Alakuijala (jyrki@google.com) // Converted to C by Aleksander Kramarz (akramarz@google.com) +#include <assert.h> #include <stdlib.h> #include "../dsp/lossless.h" @@ -23,42 +24,14 @@ #define MIN_DIM_FOR_NEAR_LOSSLESS 64 #define MAX_LIMIT_BITS 5 -// Computes quantized pixel value and distance from original value. -static void GetValAndDistance(int a, int initial, int bits, - int* const val, int* const distance) { - const int mask = ~((1 << bits) - 1); - *val = (initial & mask) | (initial >> (8 - bits)); - *distance = 2 * abs(a - *val); -} - -// Clamps the value to range [0, 255]. -static int Clamp8b(int val) { - const int min_val = 0; - const int max_val = 0xff; - return (val < min_val) ? min_val : (val > max_val) ? max_val : val; -} - -// Quantizes values {a, a+(1<<bits), a-(1<<bits)} and returns the nearest one. +// Quantizes the value up or down to a multiple of 1<<bits (or to 255), +// choosing the closer one, resolving ties using bankers' rounding. static int FindClosestDiscretized(int a, int bits) { - int best_val = a, i; - int min_distance = 256; - - for (i = -1; i <= 1; ++i) { - int candidate, distance; - const int val = Clamp8b(a + i * (1 << bits)); - GetValAndDistance(a, val, bits, &candidate, &distance); - if (i != 0) { - ++distance; - } - // Smallest distance but favor i == 0 over i == -1 and i == 1 - // since that keeps the overall intensity more constant in the - // images. 
- if (distance < min_distance) { - min_distance = distance; - best_val = candidate; - } - } - return best_val; + const int mask = (1 << bits) - 1; + const int biased = a + (mask >> 1) + ((a >> bits) & 1); + assert(bits > 0); + if (biased > 0xff) return 0xff; + return biased & ~mask; } // Applies FindClosestDiscretized to all channels of pixel. @@ -124,22 +97,11 @@ static void NearLossless(int xsize, int ysize, uint32_t* argb, } } -static int QualityToLimitBits(int quality) { - // quality mapping: - // 0..19 -> 5 - // 0..39 -> 4 - // 0..59 -> 3 - // 0..79 -> 2 - // 0..99 -> 1 - // 100 -> 0 - return MAX_LIMIT_BITS - quality / 20; -} - int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality) { int i; uint32_t* const copy_buffer = (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer)); - const int limit_bits = QualityToLimitBits(quality); + const int limit_bits = VP8LNearLosslessBits(quality); assert(argb != NULL); assert(limit_bits >= 0); assert(limit_bits <= MAX_LIMIT_BITS); diff --git a/drivers/webp/enc/picture.c b/drivers/webp/enc/picture.c index 26679a72e4..d9befbc47d 100644 --- a/drivers/webp/enc/picture.c +++ b/drivers/webp/enc/picture.c @@ -237,6 +237,8 @@ static size_t Encode(const uint8_t* rgba, int width, int height, int stride, WebPMemoryWriter wrt; int ok; + if (output == NULL) return 0; + if (!WebPConfigPreset(&config, WEBP_PRESET_DEFAULT, quality_factor) || !WebPPictureInit(&pic)) { return 0; // shouldn't happen, except if system installation is broken diff --git a/drivers/webp/enc/picture_csp.c b/drivers/webp/enc/picture_csp.c index 0ef5f9eee2..607a6240b0 100644 --- a/drivers/webp/enc/picture_csp.c +++ b/drivers/webp/enc/picture_csp.c @@ -1125,32 +1125,44 @@ static int Import(WebPPicture* const picture, int WebPPictureImportRGB(WebPPicture* picture, const uint8_t* rgb, int rgb_stride) { - return (picture != NULL) ? Import(picture, rgb, rgb_stride, 3, 0, 0) : 0; + return (picture != NULL && rgb != NULL) + ? 
Import(picture, rgb, rgb_stride, 3, 0, 0) + : 0; } int WebPPictureImportBGR(WebPPicture* picture, const uint8_t* rgb, int rgb_stride) { - return (picture != NULL) ? Import(picture, rgb, rgb_stride, 3, 1, 0) : 0; + return (picture != NULL && rgb != NULL) + ? Import(picture, rgb, rgb_stride, 3, 1, 0) + : 0; } int WebPPictureImportRGBA(WebPPicture* picture, const uint8_t* rgba, int rgba_stride) { - return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 0, 1) : 0; + return (picture != NULL && rgba != NULL) + ? Import(picture, rgba, rgba_stride, 4, 0, 1) + : 0; } int WebPPictureImportBGRA(WebPPicture* picture, const uint8_t* rgba, int rgba_stride) { - return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 1, 1) : 0; + return (picture != NULL && rgba != NULL) + ? Import(picture, rgba, rgba_stride, 4, 1, 1) + : 0; } int WebPPictureImportRGBX(WebPPicture* picture, const uint8_t* rgba, int rgba_stride) { - return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 0, 0) : 0; + return (picture != NULL && rgba != NULL) + ? Import(picture, rgba, rgba_stride, 4, 0, 0) + : 0; } int WebPPictureImportBGRX(WebPPicture* picture, const uint8_t* rgba, int rgba_stride) { - return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 1, 0) : 0; + return (picture != NULL && rgba != NULL) + ? 
Import(picture, rgba, rgba_stride, 4, 1, 0) + : 0; } //------------------------------------------------------------------------------ diff --git a/drivers/webp/enc/picture_psnr.c b/drivers/webp/enc/picture_psnr.c index 40214efc95..81ab1b5ca1 100644 --- a/drivers/webp/enc/picture_psnr.c +++ b/drivers/webp/enc/picture_psnr.c @@ -27,7 +27,7 @@ static void AccumulateLSIM(const uint8_t* src, int src_stride, const uint8_t* ref, int ref_stride, - int w, int h, DistoStats* stats) { + int w, int h, VP8DistoStats* stats) { int x, y; double total_sse = 0.; for (y = 0; y < h; ++y) { @@ -71,11 +71,13 @@ static float GetPSNR(const double v) { int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, int type, float result[5]) { - DistoStats stats[5]; + VP8DistoStats stats[5]; int w, h; memset(stats, 0, sizeof(stats)); + VP8SSIMDspInit(); + if (src == NULL || ref == NULL || src->width != ref->width || src->height != ref->height || src->use_argb != ref->use_argb || result == NULL) { diff --git a/drivers/webp/enc/picture_tools.c b/drivers/webp/enc/picture_tools.c index 7c73646397..bf97af8408 100644 --- a/drivers/webp/enc/picture_tools.c +++ b/drivers/webp/enc/picture_tools.c @@ -11,6 +11,8 @@ // // Author: Skal (pascal.massimino@gmail.com) +#include <assert.h> + #include "./vp8enci.h" #include "../dsp/yuv.h" @@ -120,6 +122,24 @@ void WebPCleanupTransparentArea(WebPPicture* pic) { #undef SIZE #undef SIZE2 +void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) { + int x, y, w, h; + uint32_t* argb; + assert(pic != NULL && pic->use_argb); + w = pic->width; + h = pic->height; + argb = pic->argb; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + if ((argb[x] & 0xff000000) == 0) { + argb[x] = 0x00000000; + } + } + argb += pic->argb_stride; + } +} + //------------------------------------------------------------------------------ // Blend color and remove transparency info diff --git a/drivers/webp/enc/quant.c b/drivers/webp/enc/quant.c index 
002c326b82..549ad26f93 100644 --- a/drivers/webp/enc/quant.c +++ b/drivers/webp/enc/quant.c @@ -30,8 +30,6 @@ #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP // power-law modulation. Must be strictly less than 1. -#define I4_PENALTY 4000 // Rate-penalty for quick i4/i16 decision - // number of non-zero coeffs below which we consider the block very flat // (and apply a penalty to complex predictions) #define FLATNESS_LIMIT_I16 10 // I16 mode @@ -41,6 +39,8 @@ #define MULT_8B(a, b) (((a) * (b) + 128) >> 8) +#define RD_DISTO_MULT 256 // distortion multiplier (equivalent of lambda) + // #define DEBUG_BLOCK //------------------------------------------------------------------------------ @@ -54,15 +54,37 @@ static void PrintBlockInfo(const VP8EncIterator* const it, const VP8ModeScore* const rd) { int i, j; const int is_i16 = (it->mb_->type_ == 1); + const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC; + const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC; + const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC; + const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC; printf("SOURCE / OUTPUT / ABS DELTA\n"); - for (j = 0; j < 24; ++j) { - if (j == 16) printf("\n"); // newline before the U/V block - for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]); printf(" "); - for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]); + for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]); printf(" "); for (i = 0; i < 16; ++i) { - printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS])); + printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS])); + } + printf("\n"); + } + printf("\n"); // newline before the U/V block + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]); + printf(" "); + for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]); + printf(" "); + for (i = 0; i < 8; 
++i) printf("%3d ", uv_out[i + j * BPS]); + printf(" "); + for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]); + printf(" "); + for (i = 0; i < 8; ++i) { + printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS])); + } + printf(" "); + for (i = 8; i < 16; ++i) { + printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS])); } printf("\n"); } @@ -212,6 +234,8 @@ static int ExpandMatrix(VP8Matrix* const m, int type) { return (sum + 8) >> 4; } +static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; } + static void SetupMatrices(VP8Encoder* enc) { int i; const int tlambda_scale = @@ -221,7 +245,7 @@ static void SetupMatrices(VP8Encoder* enc) { for (i = 0; i < num_segments; ++i) { VP8SegmentInfo* const m = &enc->dqm_[i]; const int q = m->quant_; - int q4, q16, quv; + int q_i4, q_i16, q_uv; m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)]; m->y1_.q_[1] = kAcTable[clip(q, 0, 127)]; @@ -231,21 +255,33 @@ static void SetupMatrices(VP8Encoder* enc) { m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)]; m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)]; - q4 = ExpandMatrix(&m->y1_, 0); - q16 = ExpandMatrix(&m->y2_, 1); - quv = ExpandMatrix(&m->uv_, 2); - - m->lambda_i4_ = (3 * q4 * q4) >> 7; - m->lambda_i16_ = (3 * q16 * q16); - m->lambda_uv_ = (3 * quv * quv) >> 6; - m->lambda_mode_ = (1 * q4 * q4) >> 7; - m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3; - m->lambda_trellis_i16_ = (q16 * q16) >> 2; - m->lambda_trellis_uv_ = (quv *quv) << 1; - m->tlambda_ = (tlambda_scale * q4) >> 5; + q_i4 = ExpandMatrix(&m->y1_, 0); + q_i16 = ExpandMatrix(&m->y2_, 1); + q_uv = ExpandMatrix(&m->uv_, 2); + + m->lambda_i4_ = (3 * q_i4 * q_i4) >> 7; + m->lambda_i16_ = (3 * q_i16 * q_i16); + m->lambda_uv_ = (3 * q_uv * q_uv) >> 6; + m->lambda_mode_ = (1 * q_i4 * q_i4) >> 7; + m->lambda_trellis_i4_ = (7 * q_i4 * q_i4) >> 3; + m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2; + m->lambda_trellis_uv_ = (q_uv * q_uv) << 1; + m->tlambda_ = (tlambda_scale * 
q_i4) >> 5; + + // none of these constants should be < 1 + CheckLambdaValue(&m->lambda_i4_); + CheckLambdaValue(&m->lambda_i16_); + CheckLambdaValue(&m->lambda_uv_); + CheckLambdaValue(&m->lambda_mode_); + CheckLambdaValue(&m->lambda_trellis_i4_); + CheckLambdaValue(&m->lambda_trellis_i16_); + CheckLambdaValue(&m->lambda_trellis_uv_); + CheckLambdaValue(&m->tlambda_); m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto m->max_edge_ = 0; + + m->i4_penalty_ = 1000 * q_i4 * q_i4; } } @@ -324,7 +360,12 @@ static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1, static void SimplifySegments(VP8Encoder* const enc) { int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 }; - const int num_segments = enc->segment_hdr_.num_segments_; + // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an + // explicit check is needed to avoid a spurious warning about 'i' exceeding + // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9). + const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) + ? enc->segment_hdr_.num_segments_ + : NUM_MB_SEGMENTS; int num_final_segments = 1; int s1, s2; for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments @@ -535,13 +576,12 @@ typedef struct { #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA]) static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) { - // TODO: incorporate the "* 256" in the tables? 
- rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD); + rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD); } static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate, score_t distortion) { - return rate * lambda + 256 * distortion; + return rate * lambda + RD_DISTO_MULT * distortion; } static int TrellisQuantizeBlock(const VP8Encoder* const enc, @@ -1050,7 +1090,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { // Compute RD-score rd_uv.D = VP8SSE16x8(src, tmp_dst); - rd_uv.SD = 0; // TODO: should we call TDisto? it tends to flatten areas. + rd_uv.SD = 0; // not calling TDisto here: it tends to flatten areas. rd_uv.H = VP8FixedCostsUV[mode]; rd_uv.R = VP8GetCostUV(it, &rd_uv); if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) { @@ -1100,56 +1140,108 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) { } // Refine intra16/intra4 sub-modes based on distortion only (not rate). -static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) { - const int is_i16 = (it->mb_->type_ == 1); +static void RefineUsingDistortion(VP8EncIterator* const it, + int try_both_modes, int refine_uv_mode, + VP8ModeScore* const rd) { score_t best_score = MAX_COST; + int nz = 0; + int mode; + int is_i16 = try_both_modes || (it->mb_->type_ == 1); - if (try_both_i4_i16 || is_i16) { - int mode; + const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_]; + // Some empiric constants, of approximate order of magnitude. 
+ const int lambda_d_i16 = 106; + const int lambda_d_i4 = 11; + const int lambda_d_uv = 120; + score_t score_i4 = dqm->i4_penalty_; + score_t i4_bit_sum = 0; + const score_t bit_limit = it->enc_->mb_header_limit_; + + if (is_i16) { // First, evaluate Intra16 distortion int best_mode = -1; + const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC; for (mode = 0; mode < NUM_PRED_MODES; ++mode) { const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode]; - const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC; - const score_t score = VP8SSE16x16(src, ref); + const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT + + VP8FixedCostsI16[mode] * lambda_d_i16; + if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) { + continue; + } if (score < best_score) { best_mode = mode; best_score = score; } } VP8SetIntra16Mode(it, best_mode); + // we'll reconstruct later, if i16 mode actually gets selected } - if (try_both_i4_i16 || !is_i16) { - uint8_t modes_i4[16]; + + // Next, evaluate Intra4 + if (try_both_modes || !is_i16) { // We don't evaluate the rate here, but just account for it through a // constant penalty (i4 mode usually needs more bits compared to i16). - score_t score_i4 = (score_t)I4_PENALTY; - + is_i16 = 0; VP8IteratorStartI4(it); do { - int mode; - int best_sub_mode = -1; - score_t best_sub_score = MAX_COST; + int best_i4_mode = -1; + score_t best_i4_score = MAX_COST; const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_]; + const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4); - // TODO(skal): we don't really need the prediction pixels here, - // but just the distortion against 'src'. 
VP8MakeIntra4Preds(it); for (mode = 0; mode < NUM_BMODES; ++mode) { const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode]; - const score_t score = VP8SSE4x4(src, ref); - if (score < best_sub_score) { - best_sub_mode = mode; - best_sub_score = score; + const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT + + mode_costs[mode] * lambda_d_i4; + if (score < best_i4_score) { + best_i4_mode = mode; + best_i4_score = score; } } - modes_i4[it->i4_] = best_sub_mode; - score_i4 += best_sub_score; - if (score_i4 >= best_score) break; - } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF_ENC)); - if (score_i4 < best_score) { - VP8SetIntra4Mode(it, modes_i4); + i4_bit_sum += mode_costs[best_i4_mode]; + rd->modes_i4[it->i4_] = best_i4_mode; + score_i4 += best_i4_score; + if (score_i4 >= best_score || i4_bit_sum > bit_limit) { + // Intra4 won't be better than Intra16. Bail out and pick Intra16. + is_i16 = 1; + break; + } else { // reconstruct partial block inside yuv_out2_ buffer + uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_]; + nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_], + src, tmp_dst, best_i4_mode) << it->i4_; + } + } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC)); + } + + // Final reconstruction, depending on which mode is selected. + if (!is_i16) { + VP8SetIntra4Mode(it, rd->modes_i4); + SwapOut(it); + best_score = score_i4; + } else { + nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]); + } + + // ... and UV! 
+ if (refine_uv_mode) { + int best_mode = -1; + score_t best_uv_score = MAX_COST; + const uint8_t* const src = it->yuv_in_ + U_OFF_ENC; + for (mode = 0; mode < NUM_PRED_MODES; ++mode) { + const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode]; + const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT + + VP8FixedCostsUV[mode] * lambda_d_uv; + if (score < best_uv_score) { + best_mode = mode; + best_uv_score = score; + } } + VP8SetIntraUVMode(it, best_mode); } + nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_); + + rd->nz = nz; + rd->score = best_score; } //------------------------------------------------------------------------------ @@ -1179,13 +1271,13 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, SimpleQuantize(it, rd); } } else { - // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower). - // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode). - DistoRefine(it, (method >= 2)); - SimpleQuantize(it, rd); + // At this point we have heuristically decided intra16 / intra4. + // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower). + // For method <= 1, we don't re-examine the decision but just go ahead with + // quantization/reconstruction. 
+ RefineUsingDistortion(it, (method >= 2), (method >= 1), rd); } is_skipped = (rd->nz == 0); VP8SetSkip(it, is_skipped); return is_skipped; } - diff --git a/drivers/webp/enc/vp8enci.h b/drivers/webp/enc/vp8enci.h index 0cb2ccc353..efd2f19a9b 100644 --- a/drivers/webp/enc/vp8enci.h +++ b/drivers/webp/enc/vp8enci.h @@ -22,10 +22,6 @@ #include "../utils/utils.h" #include "webp/encode.h" -#ifdef WEBP_EXPERIMENTAL_FEATURES -#include "./vp8li.h" -#endif // WEBP_EXPERIMENTAL_FEATURES - #ifdef __cplusplus extern "C" { #endif @@ -35,8 +31,8 @@ extern "C" { // version numbers #define ENC_MAJ_VERSION 0 -#define ENC_MIN_VERSION 4 -#define ENC_REV_VERSION 4 +#define ENC_MIN_VERSION 5 +#define ENC_REV_VERSION 1 enum { MAX_LF_LEVELS = 64, // Maximum loop filter level MAX_VARIABLE_LEVEL = 67, // last (inclusive) level with variable cost @@ -200,6 +196,9 @@ typedef struct { int lambda_i16_, lambda_i4_, lambda_uv_; int lambda_mode_, lambda_trellis_, tlambda_; int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_; + + // lambda values for distortion-based evaluation + score_t i4_penalty_; // penalty for using Intra4 } VP8SegmentInfo; // Handy transient struct to accumulate score and info during RD-optimization @@ -395,6 +394,7 @@ struct VP8Encoder { int method_; // 0=fastest, 6=best/slowest. VP8RDLevel rd_opt_level_; // Deduced from method_. 
int max_i4_header_bits_; // partition #0 safeness factor + int mb_header_limit_; // rough limit for header bits per MB int thread_level_; // derived from config->thread_level int do_search_; // derived from config->target_XXX int use_tokens_; // if true, use token buffer @@ -477,17 +477,12 @@ int VP8EncFinishAlpha(VP8Encoder* const enc); // finalize compressed data int VP8EncDeleteAlpha(VP8Encoder* const enc); // delete compressed data // in filter.c - -// SSIM utils -typedef struct { - double w, xm, ym, xxm, xym, yym; -} DistoStats; -void VP8SSIMAddStats(const DistoStats* const src, DistoStats* const dst); +void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst); void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1, const uint8_t* src2, int stride2, - int W, int H, DistoStats* const stats); -double VP8SSIMGet(const DistoStats* const stats); -double VP8SSIMGetSquaredError(const DistoStats* const stats); + int W, int H, VP8DistoStats* const stats); +double VP8SSIMGet(const VP8DistoStats* const stats); +double VP8SSIMGetSquaredError(const VP8DistoStats* const stats); // autofilter void VP8InitFilter(VP8EncIterator* const it); @@ -514,6 +509,10 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height); // Returns false in case of error (invalid param, out-of-memory). int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height); +// Clean-up the RGB samples under fully transparent area, to help lossless +// compressibility (no guarantee, though). Assumes that pic->use_argb is true. +void WebPCleanupTransparentAreaLossless(WebPPicture* const pic); + // in near_lossless.c // Near lossless preprocessing in RGB color-space. 
int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality); diff --git a/drivers/webp/enc/vp8l.c b/drivers/webp/enc/vp8l.c index 284995e830..1f2d41ea91 100644 --- a/drivers/webp/enc/vp8l.c +++ b/drivers/webp/enc/vp8l.c @@ -16,6 +16,7 @@ #include <stdlib.h> #include "./backward_references.h" +#include "./histogram.h" #include "./vp8enci.h" #include "./vp8li.h" #include "../dsp/lossless.h" @@ -33,8 +34,8 @@ // Palette reordering for smaller sum of deltas (and for smaller storage). static int PaletteCompareColorsForQsort(const void* p1, const void* p2) { - const uint32_t a = *(const uint32_t*)p1; - const uint32_t b = *(const uint32_t*)p2; + const uint32_t a = WebPMemToUint32(p1); + const uint32_t b = WebPMemToUint32(p2); assert(a != b); return (a < b) ? -1 : 1; } @@ -125,54 +126,8 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic, int low_effort, uint32_t palette[MAX_PALETTE_SIZE], int* const palette_size) { - int i, x, y, key; - int num_colors = 0; - uint8_t in_use[MAX_PALETTE_SIZE * 4] = { 0 }; - uint32_t colors[MAX_PALETTE_SIZE * 4]; - static const uint32_t kHashMul = 0x1e35a7bd; - const uint32_t* argb = pic->argb; - const int width = pic->width; - const int height = pic->height; - uint32_t last_pix = ~argb[0]; // so we're sure that last_pix != argb[0] - - for (y = 0; y < height; ++y) { - for (x = 0; x < width; ++x) { - if (argb[x] == last_pix) { - continue; - } - last_pix = argb[x]; - key = (kHashMul * last_pix) >> PALETTE_KEY_RIGHT_SHIFT; - while (1) { - if (!in_use[key]) { - colors[key] = last_pix; - in_use[key] = 1; - ++num_colors; - if (num_colors > MAX_PALETTE_SIZE) { - return 0; - } - break; - } else if (colors[key] == last_pix) { - // The color is already there. - break; - } else { - // Some other color sits there. - // Do linear conflict resolution. - ++key; - key &= (MAX_PALETTE_SIZE * 4 - 1); // key mask for 1K buffer. 
- } - } - } - argb += pic->argb_stride; - } - - // TODO(skal): could we reuse in_use[] to speed up EncodePalette()? - num_colors = 0; - for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); ++i) { - if (in_use[i]) { - palette[num_colors] = colors[i]; - ++num_colors; - } - } + const int num_colors = WebPGetColorPalette(pic, palette); + if (num_colors > MAX_PALETTE_SIZE) return 0; *palette_size = num_colors; qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort); if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) { @@ -335,7 +290,7 @@ static int AnalyzeEntropy(const uint32_t* argb, } } } - free(histo); + WebPSafeFree(histo); return 1; } else { return 0; @@ -760,6 +715,10 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw, } // Calculate backward references from ARGB image. + if (VP8LHashChainFill(hash_chain, quality, argb, width, height) == 0) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, &cache_bits, hash_chain, refs_array); if (refs == NULL) { @@ -823,7 +782,8 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw, VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[2], int width, int height, int quality, - int low_effort, int* cache_bits, + int low_effort, + int use_cache, int* cache_bits, int histogram_bits, size_t init_byte_position, int* const hdr_size, @@ -855,10 +815,14 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw, goto Error; } - *cache_bits = MAX_COLOR_CACHE_BITS; + *cache_bits = use_cache ? MAX_COLOR_CACHE_BITS : 0; // 'best_refs' is the reference to the best backward refs and points to one // of refs_array[0] or refs_array[1]. // Calculate backward references from ARGB image. 
+ if (VP8LHashChainFill(hash_chain, quality, argb, width, height) == 0) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } best_refs = VP8LGetBackwardReferences(width, height, argb, quality, low_effort, cache_bits, hash_chain, refs_array); @@ -1006,13 +970,19 @@ static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height, static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc, int width, int height, int quality, int low_effort, + int used_subtract_green, VP8LBitWriter* const bw) { const int pred_bits = enc->transform_bits_; const int transform_width = VP8LSubSampleSize(width, pred_bits); const int transform_height = VP8LSubSampleSize(height, pred_bits); + // we disable near-lossless quantization if palette is used. + const int near_lossless_strength = enc->use_palette_ ? 100 + : enc->config_->near_lossless; VP8LResidualImage(width, height, pred_bits, low_effort, enc->argb_, - enc->argb_scratch_, enc->transform_data_); + enc->argb_scratch_, enc->transform_data_, + near_lossless_strength, enc->config_->exact, + used_subtract_green); VP8LPutBits(bw, TRANSFORM_PRESENT, 1); VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2); assert(pred_bits >= 2); @@ -1112,6 +1082,12 @@ static WebPEncodingError WriteImage(const WebPPicture* const pic, // ----------------------------------------------------------------------------- +static void ClearTransformBuffer(VP8LEncoder* const enc) { + WebPSafeFree(enc->transform_mem_); + enc->transform_mem_ = NULL; + enc->transform_mem_size_ = 0; +} + // Allocates the memory for argb (W x H) buffer, 2 rows of context for // prediction and transform data. 
// Flags influencing the memory allocated: @@ -1120,43 +1096,48 @@ static WebPEncodingError WriteImage(const WebPPicture* const pic, static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc, int width, int height) { WebPEncodingError err = VP8_ENC_OK; - if (enc->argb_ == NULL) { - const int tile_size = 1 << enc->transform_bits_; - const uint64_t image_size = width * height; - // Ensure enough size for tiles, as well as for two scanlines and two - // extra pixels for CopyImageWithPrediction. - const uint64_t argb_scratch_size = - enc->use_predict_ ? tile_size * width + width + 2 : 0; - const int transform_data_size = - (enc->use_predict_ || enc->use_cross_color_) - ? VP8LSubSampleSize(width, enc->transform_bits_) * - VP8LSubSampleSize(height, enc->transform_bits_) - : 0; - const uint64_t total_size = - image_size + WEBP_ALIGN_CST + - argb_scratch_size + WEBP_ALIGN_CST + - (uint64_t)transform_data_size; - uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem)); + const uint64_t image_size = width * height; + // VP8LResidualImage needs room for 2 scanlines of uint32 pixels with an extra + // pixel in each, plus 2 regular scanlines of bytes. + // TODO(skal): Clean up by using arithmetic in bytes instead of words. + const uint64_t argb_scratch_size = + enc->use_predict_ + ? (width + 1) * 2 + + (width * 2 + sizeof(uint32_t) - 1) / sizeof(uint32_t) + : 0; + const uint64_t transform_data_size = + (enc->use_predict_ || enc->use_cross_color_) + ? 
VP8LSubSampleSize(width, enc->transform_bits_) * + VP8LSubSampleSize(height, enc->transform_bits_) + : 0; + const uint64_t max_alignment_in_words = + (WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t); + const uint64_t mem_size = + image_size + max_alignment_in_words + + argb_scratch_size + max_alignment_in_words + + transform_data_size; + uint32_t* mem = enc->transform_mem_; + if (mem == NULL || mem_size > enc->transform_mem_size_) { + ClearTransformBuffer(enc); + mem = (uint32_t*)WebPSafeMalloc(mem_size, sizeof(*mem)); if (mem == NULL) { err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; } - enc->argb_ = mem; - mem = (uint32_t*)WEBP_ALIGN(mem + image_size); - enc->argb_scratch_ = mem; - mem = (uint32_t*)WEBP_ALIGN(mem + argb_scratch_size); - enc->transform_data_ = mem; - enc->current_width_ = width; + enc->transform_mem_ = mem; + enc->transform_mem_size_ = (size_t)mem_size; } + enc->argb_ = mem; + mem = (uint32_t*)WEBP_ALIGN(mem + image_size); + enc->argb_scratch_ = mem; + mem = (uint32_t*)WEBP_ALIGN(mem + argb_scratch_size); + enc->transform_data_ = mem; + + enc->current_width_ = width; Error: return err; } -static void ClearTransformBuffer(VP8LEncoder* const enc) { - WebPSafeFree(enc->argb_); - enc->argb_ = NULL; -} - static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) { WebPEncodingError err = VP8_ENC_OK; const WebPPicture* const picture = enc->pic_; @@ -1176,8 +1157,35 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) { // ----------------------------------------------------------------------------- -static void MapToPalette(const uint32_t palette[], int num_colors, +static int SearchColor(const uint32_t sorted[], uint32_t color, int hi) { + int low = 0; + if (sorted[low] == color) return low; // loop invariant: sorted[low] != color + while (1) { + const int mid = (low + hi) >> 1; + if (sorted[mid] == color) { + return mid; + } else if (sorted[mid] < color) { + low = mid; + } else { + hi = mid; + } + } +} + +// Sort 
palette in increasing order and prepare an inverse mapping array. +static void PrepareMapToPalette(const uint32_t palette[], int num_colors, + uint32_t sorted[], int idx_map[]) { + int i; + memcpy(sorted, palette, num_colors * sizeof(*sorted)); + qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort); + for (i = 0; i < num_colors; ++i) { + idx_map[SearchColor(sorted, palette[i], num_colors)] = i; + } +} + +static void MapToPalette(const uint32_t sorted_palette[], int num_colors, uint32_t* const last_pix, int* const last_idx, + const int idx_map[], const uint32_t* src, uint8_t* dst, int width) { int x; int prev_idx = *last_idx; @@ -1185,14 +1193,8 @@ static void MapToPalette(const uint32_t palette[], int num_colors, for (x = 0; x < width; ++x) { const uint32_t pix = src[x]; if (pix != prev_pix) { - int i; - for (i = 0; i < num_colors; ++i) { - if (pix == palette[i]) { - prev_idx = i; - prev_pix = pix; - break; - } - } + prev_idx = idx_map[SearchColor(sorted_palette, pix, num_colors)]; + prev_pix = pix; } dst[x] = prev_idx; } @@ -1239,11 +1241,16 @@ static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride, } } else { // Use 1 pixel cache for ARGB pixels. 
- uint32_t last_pix = palette[0]; - int last_idx = 0; + uint32_t last_pix; + int last_idx; + uint32_t sorted[MAX_PALETTE_SIZE]; + int idx_map[MAX_PALETTE_SIZE]; + PrepareMapToPalette(palette, palette_size, sorted, idx_map); + last_pix = palette[0]; + last_idx = 0; for (y = 0; y < height; ++y) { - MapToPalette(palette, palette_size, &last_pix, &last_idx, - src, tmp_row, width); + MapToPalette(sorted, palette_size, &last_pix, &last_idx, + idx_map, src, tmp_row, width); VP8LBundleColorMap(tmp_row, width, xbits, dst); src += src_stride; dst += dst_stride; @@ -1376,7 +1383,7 @@ static void VP8LEncoderDelete(VP8LEncoder* enc) { WebPEncodingError VP8LEncodeStream(const WebPConfig* const config, const WebPPicture* const picture, - VP8LBitWriter* const bw) { + VP8LBitWriter* const bw, int use_cache) { WebPEncodingError err = VP8_ENC_OK; const int quality = (int)config->quality; const int low_effort = (config->method == 0); @@ -1403,7 +1410,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config, } // Apply near-lossless preprocessing. - use_near_lossless = !enc->use_palette_ && (config->near_lossless < 100); + use_near_lossless = + (config->near_lossless < 100) && !enc->use_palette_ && !enc->use_predict_; if (use_near_lossless) { if (!VP8ApplyNearLossless(width, height, picture->argb, config->near_lossless)) { @@ -1455,7 +1463,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config, if (enc->use_predict_) { err = ApplyPredictFilter(enc, enc->current_width_, height, quality, - low_effort, bw); + low_effort, enc->use_subtract_green_, bw); if (err != VP8_ENC_OK) goto Error; } @@ -1472,8 +1480,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config, // Encode and write the transformed image. 
err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_, enc->current_width_, height, quality, low_effort, - &enc->cache_bits_, enc->histo_bits_, byte_position, - &hdr_size, &data_size); + use_cache, &enc->cache_bits_, enc->histo_bits_, + byte_position, &hdr_size, &data_size); if (err != VP8_ENC_OK) goto Error; if (picture->stats != NULL) { @@ -1558,7 +1566,7 @@ int VP8LEncodeImage(const WebPConfig* const config, if (!WebPReportProgress(picture, 5, &percent)) goto UserAbort; // Encode main image stream. - err = VP8LEncodeStream(config, picture, &bw); + err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/); if (err != VP8_ENC_OK) goto Error; // TODO(skal): have a fine-grained progress report in VP8LEncodeStream(). diff --git a/drivers/webp/enc/vp8li.h b/drivers/webp/enc/vp8li.h index 4543c3b260..8e6b360d81 100644 --- a/drivers/webp/enc/vp8li.h +++ b/drivers/webp/enc/vp8li.h @@ -25,14 +25,17 @@ extern "C" { #endif typedef struct { - const WebPConfig* config_; // user configuration and parameters - const WebPPicture* pic_; // input picture. + const WebPConfig* config_; // user configuration and parameters + const WebPPicture* pic_; // input picture. - uint32_t* argb_; // Transformed argb image data. - uint32_t* argb_scratch_; // Scratch memory for argb rows - // (used for prediction). - uint32_t* transform_data_; // Scratch memory for transform data. - int current_width_; // Corresponds to packed image width. + uint32_t* argb_; // Transformed argb image data. + uint32_t* argb_scratch_; // Scratch memory for argb rows + // (used for prediction). + uint32_t* transform_data_; // Scratch memory for transform data. + uint32_t* transform_mem_; // Currently allocated memory. + size_t transform_mem_size_; // Currently allocated memory size. + + int current_width_; // Corresponds to packed image width. // Encoding parameters derived from quality parameter. 
int histo_bits_; @@ -64,9 +67,10 @@ int VP8LEncodeImage(const WebPConfig* const config, const WebPPicture* const picture); // Encodes the main image stream using the supplied bit writer. +// If 'use_cache' is false, disables the use of color cache. WebPEncodingError VP8LEncodeStream(const WebPConfig* const config, const WebPPicture* const picture, - VP8LBitWriter* const bw); + VP8LBitWriter* const bw, int use_cache); //------------------------------------------------------------------------------ diff --git a/drivers/webp/enc/webpenc.c b/drivers/webp/enc/webpenc.c index 8ced07a2a3..a7d04ea2ce 100644 --- a/drivers/webp/enc/webpenc.c +++ b/drivers/webp/enc/webpenc.c @@ -79,7 +79,9 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) { //-------------------+---+---+---+---+---+---+---+ // basic rd-opt | | | | x | x | x | x | //-------------------+---+---+---+---+---+---+---+ -// disto-score i4/16 | | | x | | | | | +// disto-refine i4/16| x | x | x | | | | | +//-------------------+---+---+---+---+---+---+---+ +// disto-refine uv | | x | x | | | | | //-------------------+---+---+---+---+---+---+---+ // rd-opt i4/16 | | | ~ | x | x | x | x | //-------------------+---+---+---+---+---+---+---+ @@ -103,6 +105,10 @@ static void MapConfigToTools(VP8Encoder* const enc) { 256 * 16 * 16 * // upper bound: up to 16bit per 4x4 block (limit * limit) / (100 * 100); // ... modulated with a quadratic curve. + // partition0 = 512k max. 
+ enc->mb_header_limit_ = + (score_t)256 * 510 * 8 * 1024 / (enc->mb_w_ * enc->mb_h_); + enc->thread_level_ = config->thread_level; enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0); @@ -137,29 +143,30 @@ static void MapConfigToTools(VP8Encoder* const enc) { static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, WebPPicture* const picture) { + VP8Encoder* enc; const int use_filter = (config->filter_strength > 0) || (config->autofilter > 0); const int mb_w = (picture->width + 15) >> 4; const int mb_h = (picture->height + 15) >> 4; const int preds_w = 4 * mb_w + 1; const int preds_h = 4 * mb_h + 1; - const size_t preds_size = preds_w * preds_h * sizeof(uint8_t); + const size_t preds_size = preds_w * preds_h * sizeof(*enc->preds_); const int top_stride = mb_w * 16; - const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + WEBP_ALIGN_CST; - const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo); - const size_t samples_size = 2 * top_stride * sizeof(uint8_t) // top-luma/u/v - + WEBP_ALIGN_CST; // align all + const size_t nz_size = (mb_w + 1) * sizeof(*enc->nz_) + WEBP_ALIGN_CST; + const size_t info_size = mb_w * mb_h * sizeof(*enc->mb_info_); + const size_t samples_size = + 2 * top_stride * sizeof(*enc->y_top_) // top-luma/u/v + + WEBP_ALIGN_CST; // align all const size_t lf_stats_size = - config->autofilter ? sizeof(LFStats) + WEBP_ALIGN_CST : 0; - VP8Encoder* enc; + config->autofilter ? 
sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0; uint8_t* mem; - const uint64_t size = (uint64_t)sizeof(VP8Encoder) // main struct - + WEBP_ALIGN_CST // cache alignment - + info_size // modes info - + preds_size // prediction modes - + samples_size // top/left samples - + nz_size // coeff context bits - + lf_stats_size; // autofilter stats + const uint64_t size = (uint64_t)sizeof(*enc) // main struct + + WEBP_ALIGN_CST // cache alignment + + info_size // modes info + + preds_size // prediction modes + + samples_size // top/left samples + + nz_size // coeff context bits + + lf_stats_size; // autofilter stats #ifdef PRINT_MEMORY_INFO printf("===================================\n"); @@ -171,7 +178,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, " non-zero: %ld\n" " lf-stats: %ld\n" " total: %ld\n", - sizeof(VP8Encoder) + WEBP_ALIGN_CST, info_size, + sizeof(*enc) + WEBP_ALIGN_CST, info_size, preds_size, samples_size, nz_size, lf_stats_size, size); printf("Transient object sizes:\n" " VP8EncIterator: %ld\n" @@ -201,7 +208,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, enc->mb_info_ = (VP8MBInfo*)mem; mem += info_size; enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_; - mem += preds_w * preds_h * sizeof(uint8_t); + mem += preds_size; enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem); mem += nz_size; enc->lf_stats_ = lf_stats_size ? (LFStats*)WEBP_ALIGN(mem) : NULL; @@ -321,14 +328,15 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) { if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION); - if (!config->exact) { - WebPCleanupTransparentArea(pic); - } - if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats)); if (!config->lossless) { VP8Encoder* enc = NULL; + + if (!config->exact) { + WebPCleanupTransparentArea(pic); + } + if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) { // Make sure we have YUVA samples. 
if (config->preprocessing & 4) { @@ -376,6 +384,10 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) { return 0; } + if (!config->exact) { + WebPCleanupTransparentAreaLossless(pic); + } + ok = VP8LEncodeImage(config, pic); // Sets pic->error in case of problem. } |