22 files changed, 1615 insertions, 2948 deletions
diff --git a/drivers/webp/enc/alpha.c b/drivers/webp/enc/alpha.c
index 21d4b5cbde..0e519b6c66 100644
--- a/drivers/webp/enc/alpha.c
+++ b/drivers/webp/enc/alpha.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane compression.
@@ -19,6 +17,10 @@
 #include "../utils/quant_levels.h"
 #include "../webp/format_constants.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 // -----------------------------------------------------------------------------
 // Encodes the given alpha data via specified compression method 'method'.
 // The pre-processing (quantization) is performed if 'quality' is less than 100.
@@ -67,7 +69,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
     const uint8_t* src = data;
     for (j = 0; j < picture.height; ++j) {
       for (i = 0; i < picture.width; ++i) {
-        dst[i] = src[i] << 8;  // we leave A/R/B channels zero'd.
+        dst[i] = (src[i] << 8) | 0xff000000u;
       }
       src += width;
       dst += picture.argb_stride;
@@ -77,19 +79,18 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   WebPConfigInit(&config);
   config.lossless = 1;
   config.method = effort_level;  // impact is very small
-  // Set a low default quality for encoding alpha. Ensure that Alpha quality at
-  // lower methods (3 and below) is less than the threshold for triggering
-  // costly 'BackwardReferencesTraceBackwards'.
-  config.quality = 8.f * effort_level;
-  assert(config.quality >= 0 && config.quality <= 100.f);
+  // Set moderate default quality setting for alpha. Higher qualities (80 and
+  // above) could be very slow.
+  config.quality = 10.f + 15.f * effort_level;
+  if (config.quality > 100.f) config.quality = 100.f;
 
   ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
   ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
   WebPPictureFree(&picture);
   if (ok) {
-    const uint8_t* const buffer = VP8LBitWriterFinish(&tmp_bw);
-    const size_t buffer_size = VP8LBitWriterNumBytes(&tmp_bw);
-    VP8BitWriterAppend(bw, buffer, buffer_size);
+    const uint8_t* const data = VP8LBitWriterFinish(&tmp_bw);
+    const size_t data_size = VP8LBitWriterNumBytes(&tmp_bw);
+    VP8BitWriterAppend(bw, data, data_size);
   }
   VP8LBitWriterDestroy(&tmp_bw);
   return ok && !bw->error_;
@@ -97,19 +98,12 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
 
 // -----------------------------------------------------------------------------
 
-// Small struct to hold the result of a filter mode compression attempt.
-typedef struct {
-  size_t score;
-  VP8BitWriter bw;
-  WebPAuxStats stats;
-} FilterTrial;
-
-// This function always returns an initialized 'bw' object, even upon error.
 static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
                                int method, int filter, int reduce_levels,
                                int effort_level,  // in [0..6] range
                                uint8_t* const tmp_alpha,
-                               FilterTrial* result) {
+                               VP8BitWriter* const bw,
+                               WebPAuxStats* const stats) {
   int ok = 0;
   const uint8_t* alpha_src;
   WebPFilterFunc filter_func;
@@ -130,26 +124,24 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   header = method | (filter << 2);
   if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
 
-  VP8BitWriterInit(&result->bw, expected_size);
-  VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+  VP8BitWriterInit(bw, expected_size);
+  VP8BitWriterAppend(bw, &header, ALPHA_HEADER_LEN);
 
   filter_func = WebPFilters[filter];
-  if (filter_func != NULL) {
-    filter_func(data, width, height, width, tmp_alpha);
+  if (filter_func) {
+    filter_func(data, width, height, 1, width, tmp_alpha);
     alpha_src = tmp_alpha;
   }  else {
     alpha_src = data;
   }
 
   if (method == ALPHA_NO_COMPRESSION) {
-    ok = VP8BitWriterAppend(&result->bw, alpha_src, width * height);
-    ok = ok && !result->bw.error_;
+    ok = VP8BitWriterAppend(bw, alpha_src, width * height);
+    ok = ok && !bw->error_;
   } else {
-    ok = EncodeLossless(alpha_src, width, height, effort_level,
-                        &result->bw, &result->stats);
-    VP8BitWriterFinish(&result->bw);
+    ok = EncodeLossless(alpha_src, width, height, effort_level, bw, stats);
+    VP8BitWriterFinish(bw);
   }
-  result->score = VP8BitWriterSize(&result->bw);
   return ok;
 }
 
@@ -165,104 +157,6 @@ static void CopyPlane(const uint8_t* src, int src_stride,
   }
 }
 
-static int GetNumColors(const uint8_t* data, int width, int height,
-                        int stride) {
-  int j;
-  int colors = 0;
-  uint8_t color[256] = { 0 };
-
-  for (j = 0; j < height; ++j) {
-    int i;
-    const uint8_t* const p = data + j * stride;
-    for (i = 0; i < width; ++i) {
-      color[p[i]] = 1;
-    }
-  }
-  for (j = 0; j < 256; ++j) {
-    if (color[j] > 0) ++colors;
-  }
-  return colors;
-}
-
-#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
-#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
-
-// Given the input 'filter' option, return an OR'd bit-set of filters to try.
-static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
-                             int filter, int effort_level) {
-  uint32_t bit_map = 0U;
-  if (filter == WEBP_FILTER_FAST) {
-    // Quick estimate of the best candidate.
-    int try_filter_none = (effort_level > 3);
-    const int kMinColorsForFilterNone = 16;
-    const int kMaxColorsForFilterNone = 192;
-    const int num_colors = GetNumColors(alpha, width, height, width);
-    // For low number of colors, NONE yields better compression.
-    filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
-             EstimateBestFilter(alpha, width, height, width);
-    bit_map |= 1 << filter;
-    // For large number of colors, try FILTER_NONE in addition to the best
-    // filter as well.
-    if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
-      bit_map |= FILTER_TRY_NONE;
-    }
-  } else if (filter == WEBP_FILTER_NONE) {
-    bit_map = FILTER_TRY_NONE;
-  } else {  // WEBP_FILTER_BEST -> try all
-    bit_map = FILTER_TRY_ALL;
-  }
-  return bit_map;
-}
-
-static void InitFilterTrial(FilterTrial* const score) {
-  score->score = (size_t)~0U;
-  VP8BitWriterInit(&score->bw, 0);
-}
-
-static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
-                                 size_t data_size, int method, int filter,
-                                 int reduce_levels, int effort_level,
-                                 uint8_t** const output,
-                                 size_t* const output_size,
-                                 WebPAuxStats* const stats) {
-  int ok = 1;
-  FilterTrial best;
-  uint32_t try_map =
-      GetFilterMap(alpha, width, height, filter, effort_level);
-  InitFilterTrial(&best);
-  if (try_map != FILTER_TRY_NONE) {
-    uint8_t* filtered_alpha =  (uint8_t*)malloc(data_size);
-    if (filtered_alpha == NULL) return 0;
-
-    for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
-      if (try_map & 1) {
-        FilterTrial trial;
-        ok = EncodeAlphaInternal(alpha, width, height, method, filter,
-                                 reduce_levels, effort_level, filtered_alpha,
-                                 &trial);
-        if (ok && trial.score < best.score) {
-          VP8BitWriterWipeOut(&best.bw);
-          best = trial;
-        } else {
-          VP8BitWriterWipeOut(&trial.bw);
-        }
-      }
-    }
-    free(filtered_alpha);
-  } else {
-    ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
-                             reduce_levels, effort_level, NULL, &best);
-  }
-  if (ok) {
-    if (stats != NULL) *stats = best.stats;
-    *output_size = VP8BitWriterSize(&best.bw);
-    *output = VP8BitWriterBuf(&best.bw);
-  } else {
-    VP8BitWriterWipeOut(&best.bw);
-  }
-  return ok;
-}
-
 static int EncodeAlpha(VP8Encoder* const enc,
                        int quality, int method, int filter,
                        int effort_level,
@@ -293,11 +187,6 @@ static int EncodeAlpha(VP8Encoder* const enc,
     return 0;
   }
 
-  if (method == ALPHA_NO_COMPRESSION) {
-    // Don't filter, as filtering will make no impact on compressed size.
-    filter = WEBP_FILTER_NONE;
-  }
-
   quant_alpha = (uint8_t*)malloc(data_size);
   if (quant_alpha == NULL) {
     return 0;
@@ -316,95 +205,126 @@ static int EncodeAlpha(VP8Encoder* const enc,
   }
 
   if (ok) {
-    ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
-                               filter, reduce_levels, effort_level, output,
-                               output_size, pic->stats);
-    if (pic->stats != NULL) {  // need stats?
-      pic->stats->coded_size += (int)(*output_size);
-      enc->sse_[3] = sse;
+    VP8BitWriter bw;
+    int test_filter;
+    uint8_t* filtered_alpha = NULL;
+
+    // We always test WEBP_FILTER_NONE first.
+    ok = EncodeAlphaInternal(quant_alpha, width, height,
+                             method, WEBP_FILTER_NONE, reduce_levels,
+                             effort_level, NULL, &bw, pic->stats);
+    if (!ok) {
+      VP8BitWriterWipeOut(&bw);
+      goto End;
     }
-  }
 
+    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of a second candidate?
+      filter = EstimateBestFilter(quant_alpha, width, height, width);
+    }
+    // Stop?
+    if (filter == WEBP_FILTER_NONE) {
+      goto Ok;
+    }
+
+    filtered_alpha = (uint8_t*)malloc(data_size);
+    ok = (filtered_alpha != NULL);
+    if (!ok) {
+      VP8BitWriterWipeOut(&bw);  // don't leak the WEBP_FILTER_NONE result
+      goto End;
+    }
+    // Try the other mode(s).
+    {
+      WebPAuxStats best_stats;
+      size_t best_score = VP8BitWriterSize(&bw);
+
+      memset(&best_stats, 0, sizeof(best_stats));  // prevent spurious warning
+      if (pic->stats != NULL) best_stats = *pic->stats;
+      for (test_filter = WEBP_FILTER_HORIZONTAL;
+           ok && (test_filter <= WEBP_FILTER_GRADIENT);
+           ++test_filter) {
+        VP8BitWriter tmp_bw;
+        if (filter != WEBP_FILTER_BEST && test_filter != filter) {
+          continue;
+        }
+        ok = EncodeAlphaInternal(quant_alpha, width, height,
+                                 method, test_filter, reduce_levels,
+                                 effort_level, filtered_alpha, &tmp_bw,
+                                 pic->stats);
+        if (ok) {
+          const size_t score = VP8BitWriterSize(&tmp_bw);
+          if (score < best_score) {
+            // swap bitwriter objects.
+            VP8BitWriter tmp = tmp_bw;
+            tmp_bw = bw;
+            bw = tmp;
+            best_score = score;
+            if (pic->stats != NULL) best_stats = *pic->stats;
+          }
+        } else {
+          VP8BitWriterWipeOut(&bw);
+        }
+        VP8BitWriterWipeOut(&tmp_bw);
+      }
+      if (pic->stats != NULL) *pic->stats = best_stats;
+    }
+ Ok:
+    if (ok) {
+      *output_size = VP8BitWriterSize(&bw);
+      *output = VP8BitWriterBuf(&bw);
+      if (pic->stats != NULL) {         // need stats?
+        pic->stats->coded_size += (int)(*output_size);
+        enc->sse_[3] = sse;
+      }
+    }
+    free(filtered_alpha);
+  }
+ End:
   free(quant_alpha);
   return ok;
 }
 
+
 //------------------------------------------------------------------------------
 // Main calls
 
-static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
-  const WebPConfig* config = enc->config_;
-  uint8_t* alpha_data = NULL;
-  size_t alpha_size = 0;
-  const int effort_level = config->method;  // maps to [0..6]
-  const WEBP_FILTER_TYPE filter =
-      (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
-      (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
-                                       WEBP_FILTER_BEST;
-  if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
-                   filter, effort_level, &alpha_data, &alpha_size)) {
-    return 0;
-  }
-  if (alpha_size != (uint32_t)alpha_size) {  // Sanity check.
-    free(alpha_data);
-    return 0;
-  }
-  enc->alpha_data_size_ = (uint32_t)alpha_size;
-  enc->alpha_data_ = alpha_data;
-  (void)dummy;
-  return 1;
-}
-
 void VP8EncInitAlpha(VP8Encoder* const enc) {
   enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
-  if (enc->thread_level_ > 0) {
-    WebPWorker* const worker = &enc->alpha_worker_;
-    WebPWorkerInit(worker);
-    worker->data1 = enc;
-    worker->data2 = NULL;
-    worker->hook = (WebPWorkerHook)CompressAlphaJob;
-  }
-}
-
-int VP8EncStartAlpha(VP8Encoder* const enc) {
-  if (enc->has_alpha_) {
-    if (enc->thread_level_ > 0) {
-      WebPWorker* const worker = &enc->alpha_worker_;
-      if (!WebPWorkerReset(worker)) {    // Makes sure worker is good to go.
-        return 0;
-      }
-      WebPWorkerLaunch(worker);
-      return 1;
-    } else {
-      return CompressAlphaJob(enc, NULL);   // just do the job right away
-    }
-  }
-  return 1;
 }
 
 int VP8EncFinishAlpha(VP8Encoder* const enc) {
   if (enc->has_alpha_) {
-    if (enc->thread_level_ > 0) {
-      WebPWorker* const worker = &enc->alpha_worker_;
-      if (!WebPWorkerSync(worker)) return 0;  // error
+    const WebPConfig* config = enc->config_;
+    uint8_t* tmp_data = NULL;
+    size_t tmp_size = 0;
+    const int effort_level = config->method;  // maps to [0..6]
+    const WEBP_FILTER_TYPE filter =
+        (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
+        (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
+                                         WEBP_FILTER_BEST;
+
+    if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
+                     filter, effort_level, &tmp_data, &tmp_size)) {
+      return 0;
+    }
+    if (tmp_size != (uint32_t)tmp_size) {  // Sanity check.
+      free(tmp_data);
+      return 0;
     }
+    enc->alpha_data_size_ = (uint32_t)tmp_size;
+    enc->alpha_data_ = tmp_data;
   }
   return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
 }
 
-int VP8EncDeleteAlpha(VP8Encoder* const enc) {
-  int ok = 1;
-  if (enc->thread_level_ > 0) {
-    WebPWorker* const worker = &enc->alpha_worker_;
-    ok = WebPWorkerSync(worker);  // finish anything left in flight
-    WebPWorkerEnd(worker);  // still need to end the worker, even if !ok
-  }
+void VP8EncDeleteAlpha(VP8Encoder* const enc) {
   free(enc->alpha_data_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
   enc->has_alpha_ = 0;
-  return ok;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/analysis.c b/drivers/webp/enc/analysis.c
index 7d4cfdc190..22cfb492e7 100644
--- a/drivers/webp/enc/analysis.c
+++ b/drivers/webp/enc/analysis.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Macroblock analysis
@@ -19,8 +17,16 @@
 #include "./cost.h"
 #include "../utils/utils.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define MAX_ITERS_K_MEANS  6
 
+static int ClipAlpha(int alpha) {
+  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
+}
+
 //------------------------------------------------------------------------------
 // Smooth the segment map by replacing isolated block by the majority of its
 // neighbours.
@@ -51,7 +57,6 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
       for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
         if (cnt[n] >= majority_cnt_3_x_3_grid) {
           majority_seg = n;
-          break;
         }
       }
       tmp[x + y * w] = majority_seg;
@@ -67,10 +72,50 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
 }
 
 //------------------------------------------------------------------------------
-// set segment susceptibility alpha_ / beta_
+// Finalize Segment probability based on the coding tree
+
+static int GetProba(int a, int b) {
+  int proba;
+  const int total = a + b;
+  if (total == 0) return 255;  // that's the default probability.
+  proba = (255 * a + total / 2) / total;
+  return proba;
+}
+
+static void SetSegmentProbas(VP8Encoder* const enc) {
+  int p[NUM_MB_SEGMENTS] = { 0 };
+  int n;
+
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    const VP8MBInfo* const mb = &enc->mb_info_[n];
+    p[mb->segment_]++;
+  }
+  if (enc->pic_->stats) {
+    for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+      enc->pic_->stats->segment_size[n] = p[n];
+    }
+  }
+  if (enc->segment_hdr_.num_segments_ > 1) {
+    uint8_t* const probas = enc->proba_.segments_;
+    probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
+    probas[1] = GetProba(p[0], p[1]);
+    probas[2] = GetProba(p[2], p[3]);
+
+    enc->segment_hdr_.update_map_ =
+        (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+    enc->segment_hdr_.size_ =
+      p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
+      p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
+      p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
+      p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
+  } else {
+    enc->segment_hdr_.update_map_ = 0;
+    enc->segment_hdr_.size_ = 0;
+  }
+}
 
 static WEBP_INLINE int clip(int v, int m, int M) {
-  return (v < m) ? m : (v > M) ? M : v;
+  return v < m ? m : v > M ? M : v;
 }
 
 static void SetSegmentAlphas(VP8Encoder* const enc,
@@ -97,72 +142,28 @@ static void SetSegmentAlphas(VP8Encoder* const enc,
 }
 
 //------------------------------------------------------------------------------
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-
-#define MAX_ALPHA 255                // 8b of precision for susceptibilities.
-#define ALPHA_SCALE (2 * MAX_ALPHA)  // scaling factor for alpha.
-#define DEFAULT_ALPHA (-1)
-#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))
-
-static int FinalAlphaValue(int alpha) {
-  alpha = MAX_ALPHA - alpha;
-  return clip(alpha, 0, MAX_ALPHA);
-}
-
-static int GetAlpha(const VP8Histogram* const histo) {
-  int max_value = 0, last_non_zero = 1;
-  int k;
-  int alpha;
-  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
-    const int value = histo->distribution[k];
-    if (value > 0) {
-      if (value > max_value) max_value = value;
-      last_non_zero = k;
-    }
-  }
-  // 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
-  // values which happen to be mostly noise. This leaves the maximum precision
-  // for handling the useful small values which contribute most.
-  alpha = (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
-  return alpha;
-}
-
-static void MergeHistograms(const VP8Histogram* const in,
-                            VP8Histogram* const out) {
-  int i;
-  for (i = 0; i <= MAX_COEFF_THRESH; ++i) {
-    out->distribution[i] += in->distribution[i];
-  }
-}
-
-//------------------------------------------------------------------------------
 // Simplified k-Means, to assign Nb segments based on alpha-histogram
 
-static void AssignSegments(VP8Encoder* const enc,
-                           const int alphas[MAX_ALPHA + 1]) {
+static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
   const int nb = enc->segment_hdr_.num_segments_;
   int centers[NUM_MB_SEGMENTS];
   int weighted_average = 0;
-  int map[MAX_ALPHA + 1];
+  int map[256];
   int a, n, k;
-  int min_a = 0, max_a = MAX_ALPHA, range_a;
+  int min_a = 0, max_a = 255, range_a;
   // 'int' type is ok for histo, and won't overflow
   int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
 
-  assert(nb >= 1);
-
   // bracket the input
-  for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
+  for (n = 0; n < 256 && alphas[n] == 0; ++n) {}
   min_a = n;
-  for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
+  for (n = 255; n > min_a && alphas[n] == 0; --n) {}
   max_a = n;
   range_a = max_a - min_a;
 
   // Spread initial centers evenly
-  for (k = 0, n = 1; k < nb; ++k, n += 2) {
-    assert(n < 2 * nb);
-    centers[k] = min_a + (n * range_a) / (2 * nb);
+  for (n = 1, k = 0; n < 2 * nb; n += 2) {
+    centers[k++] = min_a + (n * range_a) / (2 * nb);
   }
 
   for (k = 0; k < MAX_ITERS_K_MEANS; ++k) {     // few iters are enough
@@ -177,7 +178,7 @@ static void AssignSegments(VP8Encoder* const enc,
     n = 0;    // track the nearest center for current 'a'
     for (a = min_a; a <= max_a; ++a) {
       if (alphas[a]) {
-        while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
+        while (n < nb - 1 && abs(a - centers[n + 1]) < abs(a - centers[n])) {
           n++;
         }
         map[a] = n;
@@ -209,7 +210,7 @@ static void AssignSegments(VP8Encoder* const enc,
     VP8MBInfo* const mb = &enc->mb_info_[n];
     const int alpha = mb->alpha_;
     mb->segment_ = map[alpha];
-    mb->alpha_ = centers[map[alpha]];  // for the record.
+    mb->alpha_ = centers[map[alpha]];     // just for the record.
   }
 
   if (nb > 1) {
@@ -217,6 +218,7 @@ static void AssignSegments(VP8Encoder* const enc,
     if (smooth) SmoothSegmentMap(enc);
   }
 
+  SetSegmentProbas(enc);                             // Assign final proba
   SetSegmentAlphas(enc, centers, weighted_average);  // pick some alphas.
 }
 
@@ -225,32 +227,24 @@ static void AssignSegments(VP8Encoder* const enc,
 // susceptibility and set best modes for this macroblock.
 // Segment assignment is done later.
 
-// Number of modes to inspect for alpha_ evaluation. For high-quality settings
-// (method >= FAST_ANALYSIS_METHOD) we don't need to test all the possible modes
-// during the analysis phase.
-#define FAST_ANALYSIS_METHOD 4  // method above which we do partial analysis
+// Number of modes to inspect for alpha_ evaluation. For high-quality settings,
+// we don't need to test all the possible modes during the analysis phase.
 #define MAX_INTRA16_MODE 2
 #define MAX_INTRA4_MODE  2
 #define MAX_UV_MODE      2
 
 static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
-  const int max_mode =
-      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA16_MODE
-                                                  : NUM_PRED_MODES;
+  const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA16_MODE : 4;
   int mode;
-  int best_alpha = DEFAULT_ALPHA;
+  int best_alpha = -1;
   int best_mode = 0;
 
   VP8MakeLuma16Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
-    int alpha;
-
-    VP8CollectHistogram(it->yuv_in_ + Y_OFF,
-                        it->yuv_p_ + VP8I16ModeOffsets[mode],
-                        0, 16, &histo);
-    alpha = GetAlpha(&histo);
-    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+    const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
+                                          it->yuv_p_ + VP8I16ModeOffsets[mode],
+                                          0, 16);
+    if (alpha > best_alpha) {
       best_alpha = alpha;
       best_mode = mode;
     }
@@ -262,63 +256,46 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                    int best_alpha) {
   uint8_t modes[16];
-  const int max_mode =
-      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA4_MODE
-                                                  : NUM_BMODES;
-  int i4_alpha;
-  VP8Histogram total_histo = { { 0 } };
-  int cur_histo = 0;
-
+  const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA4_MODE : NUM_BMODES;
+  int i4_alpha = 0;
   VP8IteratorStartI4(it);
   do {
     int mode;
-    int best_mode_alpha = DEFAULT_ALPHA;
-    VP8Histogram histos[2];
+    int best_mode_alpha = -1;
     const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
 
     VP8MakeIntra4Preds(it);
     for (mode = 0; mode < max_mode; ++mode) {
-      int alpha;
-
-      memset(&histos[cur_histo], 0, sizeof(histos[cur_histo]));
-      VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode],
-                          0, 1, &histos[cur_histo]);
-      alpha = GetAlpha(&histos[cur_histo]);
-      if (IS_BETTER_ALPHA(alpha, best_mode_alpha)) {
+      const int alpha = VP8CollectHistogram(src,
+                                            it->yuv_p_ + VP8I4ModeOffsets[mode],
+                                            0, 1);
+      if (alpha > best_mode_alpha) {
         best_mode_alpha = alpha;
         modes[it->i4_] = mode;
-        cur_histo ^= 1;   // keep track of best histo so far.
       }
     }
-    // accumulate best histogram
-    MergeHistograms(&histos[cur_histo ^ 1], &total_histo);
+    i4_alpha += best_mode_alpha;
     // Note: we reuse the original samples for predictors
   } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
 
-  i4_alpha = GetAlpha(&total_histo);
-  if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) {
+  if (i4_alpha > best_alpha) {
     VP8SetIntra4Mode(it, modes);
-    best_alpha = i4_alpha;
+    best_alpha = ClipAlpha(i4_alpha);
   }
   return best_alpha;
 }
 
 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
-  int best_alpha = DEFAULT_ALPHA;
+  int best_alpha = -1;
   int best_mode = 0;
-  const int max_mode =
-      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_UV_MODE
-                                                  : NUM_PRED_MODES;
+  const int max_mode = (it->enc_->method_ >= 3) ? MAX_UV_MODE : 4;
   int mode;
   VP8MakeChroma8Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
-    int alpha;
-    VP8CollectHistogram(it->yuv_in_ + U_OFF,
-                        it->yuv_p_ + VP8UVModeOffsets[mode],
-                        16, 16 + 4 + 4, &histo);
-    alpha = GetAlpha(&histo);
-    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+    const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
+                                          it->yuv_p_ + VP8UVModeOffsets[mode],
+                                          16, 16 + 4 + 4);
+    if (alpha > best_alpha) {
       best_alpha = alpha;
       best_mode = mode;
     }
@@ -328,8 +305,7 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
 }
 
 static void MBAnalyze(VP8EncIterator* const it,
-                      int alphas[MAX_ALPHA + 1],
-                      int* const alpha, int* const uv_alpha) {
+                      int alphas[256], int* const uv_alpha) {
   const VP8Encoder* const enc = it->enc_;
   int best_alpha, best_uv_alpha;
 
@@ -338,7 +314,7 @@ static void MBAnalyze(VP8EncIterator* const it,
   VP8SetSegment(it, 0);      // default segment, spec-wise.
 
   best_alpha = MBAnalyzeBestIntra16Mode(it);
-  if (enc->method_ >= 5) {
+  if (enc->method_ != 3) {
     // We go and make a fast decision for intra4/intra16.
     // It's usually not a good and definitive pick, but helps seeding the stats
     // about level bit-cost.
@@ -348,22 +324,10 @@ static void MBAnalyze(VP8EncIterator* const it,
   best_uv_alpha = MBAnalyzeBestUVMode(it);
 
   // Final susceptibility mix
-  best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
-  best_alpha = FinalAlphaValue(best_alpha);
+  best_alpha = (best_alpha + best_uv_alpha + 1) / 2;
   alphas[best_alpha]++;
-  it->mb_->alpha_ = best_alpha;   // for later remapping.
-
-  // Accumulate for later complexity analysis.
-  *alpha += best_alpha;   // mixed susceptibility (not just luma)
   *uv_alpha += best_uv_alpha;
-}
-
-static void DefaultMBInfo(VP8MBInfo* const mb) {
-  mb->type_ = 1;     // I16x16
-  mb->uv_mode_ = 0;
-  mb->skip_ = 0;     // not skipped
-  mb->segment_ = 0;  // default segment
-  mb->alpha_ = 0;
+  it->mb_->alpha_ = best_alpha;   // Informative only.
 }
 
 //------------------------------------------------------------------------------
@@ -376,122 +340,25 @@ static void DefaultMBInfo(VP8MBInfo* const mb) {
 // and decide intra4/intra16, but that's usually almost always a bad choice at
 // this stage.
 
-static void ResetAllMBInfo(VP8Encoder* const enc) {
-  int n;
-  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
-    DefaultMBInfo(&enc->mb_info_[n]);
-  }
-  // Default susceptibilities.
-  enc->dqm_[0].alpha_ = 0;
-  enc->dqm_[0].beta_ = 0;
-  // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
-  enc->alpha_ = 0;
-  enc->uv_alpha_ = 0;
-  WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
-}
-
-// struct used to collect job result
-typedef struct {
-  WebPWorker worker;
-  int alphas[MAX_ALPHA + 1];
-  int alpha, uv_alpha;
-  VP8EncIterator it;
-  int delta_progress;
-} SegmentJob;
-
-// main work call
-static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
+int VP8EncAnalyze(VP8Encoder* const enc) {
   int ok = 1;
-  if (!VP8IteratorIsDone(it)) {
-    uint8_t tmp[32 + ALIGN_CST];
-    uint8_t* const scratch = (uint8_t*)DO_ALIGN(tmp);
-    do {
-      // Let's pretend we have perfect lossless reconstruction.
-      VP8IteratorImport(it, scratch);
-      MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
-      ok = VP8IteratorProgress(it, job->delta_progress);
-    } while (ok && VP8IteratorNext(it));
-  }
-  return ok;
-}
-
-static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
-  int i;
-  for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
-  dst->alpha += src->alpha;
-  dst->uv_alpha += src->uv_alpha;
-}
+  int alphas[256] = { 0 };
+  VP8EncIterator it;
 
-// initialize the job struct with some TODOs
-static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
-                           int start_row, int end_row) {
-  WebPWorkerInit(&job->worker);
-  job->worker.data1 = job;
-  job->worker.data2 = &job->it;
-  job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
-  VP8IteratorInit(enc, &job->it);
-  VP8IteratorSetRow(&job->it, start_row);
-  VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
-  memset(job->alphas, 0, sizeof(job->alphas));
-  job->alpha = 0;
-  job->uv_alpha = 0;
-  // only one of both jobs can record the progress, since we don't
-  // expect the user's hook to be multi-thread safe
-  job->delta_progress = (start_row == 0) ? 20 : 0;
-}
+  VP8IteratorInit(enc, &it);
+  enc->uv_alpha_ = 0;
+  do {
+    VP8IteratorImport(&it);
+    MBAnalyze(&it, alphas, &enc->uv_alpha_);
+    ok = VP8IteratorProgress(&it, 20);
+    // Let's pretend we have perfect lossless reconstruction.
+  } while (ok && VP8IteratorNext(&it, it.yuv_in_));
+  enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
+  if (ok) AssignSegments(enc, alphas);
 
-// main entry point
-int VP8EncAnalyze(VP8Encoder* const enc) {
-  int ok = 1;
-  const int do_segments =
-      enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
-      (enc->segment_hdr_.num_segments_ > 1) ||
-      (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
-  if (do_segments) {
-    const int last_row = enc->mb_h_;
-    // We give a little more than a half work to the main thread.
-    const int split_row = (9 * last_row + 15) >> 4;
-    const int total_mb = last_row * enc->mb_w_;
-#ifdef WEBP_USE_THREAD
-    const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
-    const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
-#else
-    const int do_mt = 0;
-#endif
-    SegmentJob main_job;
-    if (do_mt) {
-      SegmentJob side_job;
-      // Note the use of '&' instead of '&&' because we must call the functions
-      // no matter what.
-      InitSegmentJob(enc, &main_job, 0, split_row);
-      InitSegmentJob(enc, &side_job, split_row, last_row);
-      // we don't need to call Reset() on main_job.worker, since we're calling
-      // WebPWorkerExecute() on it
-      ok &= WebPWorkerReset(&side_job.worker);
-      // launch the two jobs in parallel
-      if (ok) {
-        WebPWorkerLaunch(&side_job.worker);
-        WebPWorkerExecute(&main_job.worker);
-        ok &= WebPWorkerSync(&side_job.worker);
-        ok &= WebPWorkerSync(&main_job.worker);
-      }
-      WebPWorkerEnd(&side_job.worker);
-      if (ok) MergeJobs(&side_job, &main_job);  // merge results together
-    } else {
-      // Even for single-thread case, we use the generic Worker tools.
-      InitSegmentJob(enc, &main_job, 0, last_row);
-      WebPWorkerExecute(&main_job.worker);
-      ok &= WebPWorkerSync(&main_job.worker);
-    }
-    WebPWorkerEnd(&main_job.worker);
-    if (ok) {
-      enc->alpha_ = main_job.alpha / total_mb;
-      enc->uv_alpha_ = main_job.uv_alpha / total_mb;
-      AssignSegments(enc, main_job.alphas);
-    }
-  } else {   // Use only one default segment.
-    ResetAllMBInfo(enc);
-  }
   return ok;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/backward_references.c b/drivers/webp/enc/backward_references.c
index 77b4be7432..b8c8ece806 100644
--- a/drivers/webp/enc/backward_references.c
+++ b/drivers/webp/enc/backward_references.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -143,95 +141,74 @@ static void HashChainInsert(HashChain* const p,
   p->hash_to_first_index_[hash_code] = pos;
 }
 
-static void GetParamsForHashChainFindCopy(int quality, int xsize,
-                                          int cache_bits, int* window_size,
-                                          int* iter_pos, int* iter_limit) {
-  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
-  const int iter_neg = -iter_mult * (quality >> 1);
-  // Limit the backward-ref window size for lower qualities.
-  const int max_window_size = (quality > 50) ? WINDOW_SIZE
-                            : (quality > 25) ? (xsize << 8)
-                            : (xsize << 4);
-  assert(xsize > 0);
-  *window_size = (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE
-               : max_window_size;
-  *iter_pos = 8 + (quality >> 3);
-  // For lower entropy images, the rigorous search loop in HashChainFindCopy
-  // can be relaxed.
-  *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
-}
-
 static int HashChainFindCopy(const HashChain* const p,
-                             int base_position, int xsize_signed,
-                             const uint32_t* const argb, int max_len,
-                             int window_size, int iter_pos, int iter_limit,
+                             int quality, int index, int xsize,
+                             const uint32_t* const argb, int maxlen,
                              int* const distance_ptr,
                              int* const length_ptr) {
-  const uint32_t* const argb_start = argb + base_position;
-  uint64_t best_val = 0;
-  uint32_t best_length = 1;
-  uint32_t best_distance = 0;
-  const uint32_t xsize = (uint32_t)xsize_signed;
-  const int min_pos =
-      (base_position > window_size) ? base_position - window_size : 0;
+  const uint64_t hash_code = GetPixPairHash64(&argb[index]);
+  int prev_length = 0;
+  int64_t best_val = 0;
+  int best_length = 0;
+  int best_distance = 0;
+  const uint32_t* const argb_start = argb + index;
+  const int iter_min_mult = (quality < 50) ? 2 : (quality < 75) ? 4 : 8;
+  const int iter_min = -quality * iter_min_mult;
+  int iter_cnt = 10 + (quality >> 1);
+  const int min_pos = (index > WINDOW_SIZE) ? index - WINDOW_SIZE : 0;
   int pos;
+
   assert(xsize > 0);
-  if (max_len > MAX_LENGTH) {
-    max_len = MAX_LENGTH;
-  }
-  for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
+  for (pos = p->hash_to_first_index_[hash_code];
        pos >= min_pos;
        pos = p->chain_[pos]) {
-    uint64_t val;
-    uint32_t curr_length;
-    uint32_t distance;
-    const uint64_t* const ptr1 =
-        (const uint64_t*)(argb + pos + best_length - 1);
-    const uint64_t* const ptr2 =
-        (const uint64_t*)(argb_start + best_length - 1);
-
-    if (iter_pos < 0) {
-      if (iter_pos < iter_limit || best_val >= 0xff0000) {
+    int64_t val;
+    int curr_length;
+    if (iter_cnt < 0) {
+      if (iter_cnt < iter_min || best_val >= 0xff0000) {
         break;
       }
     }
-    --iter_pos;
-
-    // Before 'expensive' linear match, check if the two arrays match at the
-    // current best length index and also for the succeeding elements.
-    if (*ptr1 != *ptr2) continue;
-
-    curr_length = FindMatchLength(argb + pos, argb_start, max_len);
-    if (curr_length < best_length) continue;
-
-    distance = (uint32_t)(base_position - pos);
-    val = curr_length << 16;
+    --iter_cnt;
+    if (best_length != 0 &&
+        argb[pos + best_length - 1] != argb_start[best_length - 1]) {
+      continue;
+    }
+    curr_length = FindMatchLength(argb + pos, argb_start, maxlen);
+    if (curr_length < prev_length) {
+      continue;
+    }
+    val = 65536 * curr_length;
     // Favoring 2d locality here gives savings for certain images.
-    if (distance < 9 * xsize) {
-      const uint32_t y = distance / xsize;
-      uint32_t x = distance % xsize;
-      if (x > (xsize >> 1)) {
+    if (index - pos < 9 * xsize) {
+      const int y = (index - pos) / xsize;
+      int x = (index - pos) % xsize;
+      if (x > xsize / 2) {
         x = xsize - x;
       }
-      if (x <= 7) {
-        val += 9 * 9 + 9 * 9;
+      if (x <= 7 && x >= -8) {
         val -= y * y + x * x;
+      } else {
+        val -= 9 * 9 + 9 * 9;
       }
+    } else {
+      val -= 9 * 9 + 9 * 9;
     }
     if (best_val < val) {
+      prev_length = curr_length;
       best_val = val;
       best_length = curr_length;
-      best_distance = distance;
-      if (curr_length >= (uint32_t)max_len) {
+      best_distance = index - pos;
+      if (curr_length >= MAX_LENGTH) {
         break;
       }
-      if ((best_distance == 1 || distance == xsize) &&
+      if ((best_distance == 1 || best_distance == xsize) &&
           best_length >= 128) {
         break;
       }
     }
   }
-  *distance_ptr = (int)best_distance;
+  *distance_ptr = best_distance;
   *length_ptr = best_length;
   return (best_length >= MIN_LENGTH);
 }
@@ -280,9 +257,6 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
   const int pix_count = xsize * ysize;
   HashChain* const hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
 
   if (hash_chain == NULL) return 0;
   if (use_color_cache) {
@@ -293,16 +267,16 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
   if (!HashChainInit(hash_chain, pix_count)) goto Error;
 
   refs->size = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
   for (i = 0; i < pix_count; ) {
     // Alternative#1: Code the pixels starting at 'i' using backward reference.
     int offset = 0;
     int len = 0;
     if (i < pix_count - 1) {  // FindCopy(i,..) reads pixels at [i] and [i + 1].
-      int max_len = pix_count - i;
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                        window_size, iter_pos, iter_limit,
+      int maxlen = pix_count - i;
+      if (maxlen > MAX_LENGTH) {
+        maxlen = MAX_LENGTH;
+      }
+      HashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen,
                         &offset, &len);
     }
     if (len >= MIN_LENGTH) {
@@ -313,10 +287,12 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
       int k;
       HashChainInsert(hash_chain, &argb[i], i);
       if (i < pix_count - 2) {  // FindCopy(i+1,..) reads [i + 1] and [i + 2].
-        int max_len = pix_count - (i + 1);
-        HashChainFindCopy(hash_chain, i + 1, xsize, argb, max_len,
-                          window_size, iter_pos, iter_limit,
-                          &offset2, &len2);
+        int maxlen = pix_count - (i + 1);
+        if (maxlen > MAX_LENGTH) {
+          maxlen = MAX_LENGTH;
+        }
+        HashChainFindCopy(hash_chain, quality,
+                          i + 1, xsize, argb, maxlen, &offset2, &len2);
         if (len2 > len + 1) {
           const uint32_t pixel = argb[i];
           // Alternative#2 is a better match. So push pixel at 'i' as literal.
@@ -324,10 +300,10 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
             const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
             refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
           } else {
-            if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
             refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
           }
           ++refs->size;
+          if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
           i++;  // Backward reference to be done for next pixel.
           len = len2;
           offset = offset2;
@@ -357,10 +333,10 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
         const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
         refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
       } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
         refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
       }
       ++refs->size;
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
@@ -386,8 +362,7 @@ typedef struct {
 
 static int BackwardReferencesTraceBackwards(
     int xsize, int ysize, int recursive_cost_model,
-    const uint32_t* const argb, int quality, int cache_bits,
-    VP8LBackwardRefs* const refs);
+    const uint32_t* const argb, int cache_bits, VP8LBackwardRefs* const refs);
 
 static void ConvertPopulationCountTableToBitEstimates(
     int num_symbols, const int population_counts[], double output[]) {
@@ -412,16 +387,17 @@ static void ConvertPopulationCountTableToBitEstimates(
 
 static int CostModelBuild(CostModel* const m, int xsize, int ysize,
                           int recursion_level, const uint32_t* const argb,
-                          int quality, int cache_bits) {
+                          int cache_bits) {
   int ok = 0;
   VP8LHistogram histo;
   VP8LBackwardRefs refs;
+  const int quality = 100;
 
   if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize)) goto Error;
 
   if (recursion_level > 0) {
     if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
-                                          argb, quality, cache_bits, &refs)) {
+                                          argb, cache_bits, &refs)) {
       goto Error;
     }
   } else {
@@ -462,37 +438,34 @@ static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
 
 static WEBP_INLINE double GetLengthCost(const CostModel* const m,
                                         uint32_t length) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(length, &code, &extra_bits);
-  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(length, &code, &extra_bits_count, &extra_bits_value);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits_count;
 }
 
 static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
                                           uint32_t distance) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
-  return m->distance_[code] + extra_bits;
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(distance, &code, &extra_bits_count, &extra_bits_value);
+  return m->distance_[code] + extra_bits_count;
 }
 
 static int BackwardReferencesHashChainDistanceOnly(
     int xsize, int ysize, int recursive_cost_model, const uint32_t* const argb,
-    int quality, int cache_bits, uint32_t* const dist_array) {
+    int cache_bits, uint32_t* const dist_array) {
   int i;
   int ok = 0;
   int cc_init = 0;
+  const int quality = 100;
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
-  float* const cost =
-      (float*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
+  double* const cost =
+      (double*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
   CostModel* cost_model = (CostModel*)malloc(sizeof(*cost_model));
   HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
   const double mul0 = (recursive_cost_model != 0) ? 1.0 : 0.68;
   const double mul1 = (recursive_cost_model != 0) ? 1.0 : 0.82;
-  const int min_distance_code = 2;  // TODO(vikasa): tune as function of quality
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
 
   if (cost == NULL || cost_model == NULL || hash_chain == NULL) goto Error;
 
@@ -504,17 +477,15 @@ static int BackwardReferencesHashChainDistanceOnly(
   }
 
   if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
-                      quality, cache_bits)) {
+                      cache_bits)) {
     goto Error;
   }
 
-  for (i = 0; i < pix_count; ++i) cost[i] = 1e38f;
+  for (i = 0; i < pix_count; ++i) cost[i] = 1e100;
 
   // We loop one pixel at a time, but store all currently best points to
   // non-processed locations from this point.
   dist_array[0] = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
   for (i = 0; i < pix_count; ++i) {
     double prev_cost = 0.0;
     int shortmax;
@@ -525,9 +496,11 @@ static int BackwardReferencesHashChainDistanceOnly(
       int offset = 0;
       int len = 0;
       if (i < pix_count - 1) {  // FindCopy reads pixels at [i] and [i + 1].
-        int max_len = shortmax ? 2 : pix_count - i;
-        HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                          window_size, iter_pos, iter_limit,
+        int maxlen = shortmax ? 2 : MAX_LENGTH;
+        if (maxlen > pix_count - i) {
+          maxlen = pix_count - i;
+        }
+        HashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen,
                           &offset, &len);
       }
       if (len >= MIN_LENGTH) {
@@ -536,15 +509,16 @@ static int BackwardReferencesHashChainDistanceOnly(
             prev_cost + GetDistanceCost(cost_model, code);
         int k;
         for (k = 1; k < len; ++k) {
-          const double cost_val = distance_cost + GetLengthCost(cost_model, k);
+          const double cost_val =
+              distance_cost + GetLengthCost(cost_model, k);
           if (cost[i + k] > cost_val) {
-            cost[i + k] = (float)cost_val;
+            cost[i + k] = cost_val;
             dist_array[i + k] = k + 1;
           }
         }
         // This if is for speedup only. It roughly doubles the speed, and
         // makes compression worse by .1 %.
-        if (len >= 128 && code <= min_distance_code) {
+        if (len >= 128 && code < 2) {
           // Long copy for short distances, let's skip the middle
           // lookups for better copies.
           // 1) insert the hashes.
@@ -555,10 +529,10 @@ static int BackwardReferencesHashChainDistanceOnly(
           }
           // 2) Add to the hash_chain (but cannot add the last pixel)
           {
-            const int last = (len + i < pix_count - 1) ? len + i
-                                                       : pix_count - 1;
-            for (k = i; k < last; ++k) {
-              HashChainInsert(hash_chain, &argb[k], k);
+            const int last = (len < pix_count - 1 - i) ? len
+                                                       : pix_count - 1 - i;
+            for (k = 0; k < last; ++k) {
+              HashChainInsert(hash_chain, &argb[i + k], i + k);
             }
           }
           // 3) jump.
@@ -577,13 +551,13 @@ static int BackwardReferencesHashChainDistanceOnly(
         const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
         cost_val += GetCacheCost(cost_model, ix) * mul0;
       } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
         cost_val += GetLiteralCost(cost_model, argb[i]) * mul1;
       }
       if (cost[i] > cost_val) {
-        cost[i] = (float)cost_val;
+        cost[i] = cost_val;
         dist_array[i] = 1;  // only one is inserted.
       }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
     }
  next_symbol: ;
   }
@@ -598,30 +572,40 @@ Error:
   return ok;
 }
 
-// We pack the path at the end of *dist_array and return
-// a pointer to this part of the array. Example:
-// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
-static void TraceBackwards(uint32_t* const dist_array,
-                           int dist_array_size,
-                           uint32_t** const chosen_path,
-                           int* const chosen_path_size) {
-  uint32_t* path = dist_array + dist_array_size;
-  uint32_t* cur = dist_array + dist_array_size - 1;
-  while (cur >= dist_array) {
-    const int k = *cur;
-    --path;
-    *path = k;
-    cur -= k;
-  }
-  *chosen_path = path;
-  *chosen_path_size = (int)(dist_array + dist_array_size - path);
+static int TraceBackwards(const uint32_t* const dist_array,
+                          int dist_array_size,
+                          uint32_t** const chosen_path,
+                          int* const chosen_path_size) {
+  int i;
+  // Count how many.
+  int count = 0;
+  for (i = dist_array_size - 1; i >= 0; ) {
+    int k = dist_array[i];
+    assert(k >= 1);
+    ++count;
+    i -= k;
+  }
+  // Allocate.
+  *chosen_path_size = count;
+  *chosen_path =
+      (uint32_t*)WebPSafeMalloc((uint64_t)count, sizeof(**chosen_path));
+  if (*chosen_path == NULL) return 0;
+
+  // Write in reverse order.
+  for (i = dist_array_size - 1; i >= 0; ) {
+    int k = dist_array[i];
+    assert(k >= 1);
+    (*chosen_path)[--count] = k;
+    i -= k;
+  }
+  return 1;
 }
 
 static int BackwardReferencesHashChainFollowChosenPath(
-    int xsize, int ysize, const uint32_t* const argb,
-    int quality, int cache_bits,
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
     const uint32_t* const chosen_path, int chosen_path_size,
     VP8LBackwardRefs* const refs) {
+  const int quality = 100;
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
   int size = 0;
@@ -630,9 +614,6 @@ static int BackwardReferencesHashChainFollowChosenPath(
   int ix;
   int ok = 0;
   int cc_init = 0;
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
   HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
 
@@ -645,17 +626,14 @@ static int BackwardReferencesHashChainFollowChosenPath(
   }
 
   refs->size = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
   for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
     int offset = 0;
     int len = 0;
-    int max_len = chosen_path[ix];
-    if (max_len != 1) {
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                        window_size, iter_pos, iter_limit,
-                        &offset, &len);
-      assert(len == max_len);
+    int maxlen = chosen_path[ix];
+    if (maxlen != 1) {
+      HashChainFindCopy(hash_chain, quality,
+                        i, xsize, argb, maxlen, &offset, &len);
+      assert(len == maxlen);
       refs->refs[size] = PixOrCopyCreateCopy(offset, len);
       if (use_color_cache) {
         for (k = 0; k < len; ++k) {
@@ -675,9 +653,9 @@ static int BackwardReferencesHashChainFollowChosenPath(
         const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
         refs->refs[size] = PixOrCopyCreateCacheIdx(idx);
       } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
         refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
       }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
@@ -697,7 +675,7 @@ Error:
 static int BackwardReferencesTraceBackwards(int xsize, int ysize,
                                             int recursive_cost_model,
                                             const uint32_t* const argb,
-                                            int quality, int cache_bits,
+                                            int cache_bits,
                                             VP8LBackwardRefs* const refs) {
   int ok = 0;
   const int dist_array_size = xsize * ysize;
@@ -709,18 +687,22 @@ static int BackwardReferencesTraceBackwards(int xsize, int ysize,
   if (dist_array == NULL) goto Error;
 
   if (!BackwardReferencesHashChainDistanceOnly(
-      xsize, ysize, recursive_cost_model, argb, quality, cache_bits,
-      dist_array)) {
+      xsize, ysize, recursive_cost_model, argb, cache_bits, dist_array)) {
+    goto Error;
+  }
+  if (!TraceBackwards(dist_array, dist_array_size,
+                      &chosen_path, &chosen_path_size)) {
     goto Error;
   }
-  TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
+  free(dist_array);   // no need to retain this memory any longer
+  dist_array = NULL;
   if (!BackwardReferencesHashChainFollowChosenPath(
-      xsize, ysize, argb, quality, cache_bits, chosen_path, chosen_path_size,
-      refs)) {
+      xsize, ysize, argb, cache_bits, chosen_path, chosen_path_size, refs)) {
     goto Error;
   }
   ok = 1;
  Error:
+  free(chosen_path);
   free(dist_array);
   return ok;
 }
@@ -780,20 +762,18 @@ int VP8LGetBackwardReferences(int width, int height,
 
   // Choose appropriate backward reference.
   if (lz77_is_useful) {
-    // TraceBackwards is costly. Don't execute it at lower quality.
-    const int try_lz77_trace_backwards = (quality >= 25);
+    // TraceBackwards is costly. Run it for higher qualities.
+    const int try_lz77_trace_backwards = (quality >= 75);
     *best = refs_lz77;   // default guess: lz77 is better
     VP8LClearBackwardRefs(&refs_rle);
     if (try_lz77_trace_backwards) {
-      // Set recursion level for large images using a color cache.
-      const int recursion_level =
-          (num_pix < 320 * 200) && (cache_bits > 0) ? 1 : 0;
+      const int recursion_level = (num_pix < 320 * 200) ? 1 : 0;
       VP8LBackwardRefs refs_trace;
       if (!VP8LBackwardRefsAlloc(&refs_trace, num_pix)) {
         goto End;
       }
-      if (BackwardReferencesTraceBackwards(width, height, recursion_level, argb,
-                                           quality, cache_bits, &refs_trace)) {
+      if (BackwardReferencesTraceBackwards(
+          width, height, recursion_level, argb, cache_bits, &refs_trace)) {
         VP8LClearBackwardRefs(&refs_lz77);
         *best = refs_trace;
       }
diff --git a/drivers/webp/enc/backward_references.h b/drivers/webp/enc/backward_references.h
index e1c75f04f9..91c03361ed 100644
--- a/drivers/webp/enc/backward_references.h
+++ b/drivers/webp/enc/backward_references.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -18,7 +16,7 @@
 #include "../webp/types.h"
 #include "../webp/format_constants.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -31,6 +29,68 @@ extern "C" {
     (NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << MAX_COLOR_CACHE_BITS))
 
 // -----------------------------------------------------------------------------
+// PrefixEncode()
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  return n == 0 ? -1 : 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long first_set_bit;
+  return _BitScanReverse(&first_set_bit, n) ? first_set_bit : -1;
+}
+#else
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int log = 0;
+  uint32_t value = n;
+  int i;
+
+  if (value == 0) return -1;
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const uint32_t x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  const int floor = BitsLog2Floor(n);
+  if (n == (n & ~(n - 1)))  // zero or a power of two.
+    return floor;
+  else
+    return floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void PrefixEncode(int distance, int* const code,
+                                     int* const extra_bits_count,
+                                     int* const extra_bits_value) {
+  // Collect the two most significant bits where the highest bit is 1.
+  const int highest_bit = BitsLog2Floor(--distance);
+  // & 0x3f is to make behavior well defined when highest_bit
+  // does not exist or is the least significant bit.
+  const int second_highest_bit =
+      (distance >> ((highest_bit - 1) & 0x3f)) & 1;
+  *extra_bits_count = (highest_bit > 0) ? (highest_bit - 1) : 0;
+  *extra_bits_value = distance & ((1 << *extra_bits_count) - 1);
+  *code = (highest_bit > 0) ? (2 * highest_bit + second_highest_bit)
+                            : (highest_bit == 0) ? 1 : 0;
+}
+
+// -----------------------------------------------------------------------------
 // PixOrCopy
 
 enum Mode {
@@ -145,7 +205,7 @@ int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
                                       int xsize, int ysize,
                                       int* const best_cache_bits);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
 
diff --git a/drivers/webp/enc/config.c b/drivers/webp/enc/config.c
index af7f0b09e8..1a26113554 100644
--- a/drivers/webp/enc/config.c
+++ b/drivers/webp/enc/config.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Coding tools configuration
@@ -13,6 +11,10 @@
 
 #include "../webp/encode.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // WebPConfig
 //------------------------------------------------------------------------------
@@ -29,9 +31,9 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->target_PSNR = 0.;
   config->method = 4;
   config->sns_strength = 50;
-  config->filter_strength = 60;   // mid-filtering
+  config->filter_strength = 20;   // default: light filtering
   config->filter_sharpness = 0;
-  config->filter_type = 1;        // default: strong (so U/V is filtered too)
+  config->filter_type = 0;        // default: simple
   config->partitions = 0;
   config->segments = 4;
   config->pass = 1;
@@ -44,9 +46,6 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->alpha_quality = 100;
   config->lossless = 0;
   config->image_hint = WEBP_HINT_DEFAULT;
-  config->emulate_jpeg_size = 0;
-  config->thread_level = 0;
-  config->low_memory = 0;
 
   // TODO(skal): tune.
   switch (preset) {
@@ -54,13 +53,11 @@ int WebPConfigInitInternal(WebPConfig* config,
       config->sns_strength = 80;
       config->filter_sharpness = 4;
       config->filter_strength = 35;
-      config->preprocessing &= ~2;   // no dithering
       break;
     case WEBP_PRESET_PHOTO:
       config->sns_strength = 80;
       config->filter_sharpness = 3;
       config->filter_strength = 30;
-      config->preprocessing |= 2;
       break;
     case WEBP_PRESET_DRAWING:
       config->sns_strength = 25;
@@ -70,12 +67,10 @@ int WebPConfigInitInternal(WebPConfig* config,
     case WEBP_PRESET_ICON:
       config->sns_strength = 0;
       config->filter_strength = 0;   // disable filtering to retain sharpness
-      config->preprocessing &= ~2;   // no dithering
       break;
     case WEBP_PRESET_TEXT:
       config->sns_strength = 0;
       config->filter_strength = 0;   // disable filtering to retain sharpness
-      config->preprocessing &= ~2;   // no dithering
       config->segments = 2;
       break;
     case WEBP_PRESET_DEFAULT:
@@ -111,7 +106,7 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->show_compressed < 0 || config->show_compressed > 1)
     return 0;
-  if (config->preprocessing < 0 || config->preprocessing > 3)
+  if (config->preprocessing < 0 || config->preprocessing > 1)
     return 0;
   if (config->partitions < 0 || config->partitions > 3)
     return 0;
@@ -127,14 +122,11 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->image_hint >= WEBP_HINT_LAST)
     return 0;
-  if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1)
-    return 0;
-  if (config->thread_level < 0 || config->thread_level > 1)
-    return 0;
-  if (config->low_memory < 0 || config->low_memory > 1)
-    return 0;
   return 1;
 }
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/cost.c b/drivers/webp/enc/cost.c
index 09699f8044..92e0cc713c 100644
--- a/drivers/webp/enc/cost.c
+++ b/drivers/webp/enc/cost.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Cost tables for level and modes
@@ -13,6 +11,10 @@
 
 #include "./cost.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Boolean-cost cost table
 
@@ -73,7 +75,7 @@ const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
 
 // fixed costs for coding levels, deduce from the coding tree.
 // This is only the part that doesn't depend on the probability state.
-const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
+const uint16_t VP8LevelFixedCosts[2048] = {
      0,  256,  256,  256,  256,  432,  618,  630,
    731,  640,  640,  828,  901,  948, 1021, 1101,
   1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
@@ -357,7 +359,7 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
 
   for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
     for (band = 0; band < NUM_BANDS; ++band) {
-      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+      for(ctx = 0; ctx < NUM_CTX; ++ctx) {
         const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
         uint16_t* const table = proba->level_cost_[ctype][band][ctx];
         const int cost_base = VP8BitCost(1, p[1]);
@@ -383,107 +385,110 @@ const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
 // note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
 const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
 const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
-  { {   40, 1151, 1723, 1874, 2103, 2019, 1628, 1777, 2226, 2137 },
-    {  192,  469, 1296, 1308, 1849, 1794, 1781, 1703, 1713, 1522 },
-    {  142,  910,  762, 1684, 1849, 1576, 1460, 1305, 1801, 1657 },
-    {  559,  641, 1370,  421, 1182, 1569, 1612, 1725,  863, 1007 },
-    {  299, 1059, 1256, 1108,  636, 1068, 1581, 1883,  869, 1142 },
-    {  277, 1111,  707, 1362, 1089,  672, 1603, 1541, 1545, 1291 },
-    {  214,  781, 1609, 1303, 1632, 2229,  726, 1560, 1713,  918 },
-    {  152, 1037, 1046, 1759, 1983, 2174, 1358,  742, 1740, 1390 },
-    {  512, 1046, 1420,  753,  752, 1297, 1486, 1613,  460, 1207 },
-    {  424,  827, 1362,  719, 1462, 1202, 1199, 1476, 1199,  538 } },
-  { {  240,  402, 1134, 1491, 1659, 1505, 1517, 1555, 1979, 2099 },
-    {  467,  242,  960, 1232, 1714, 1620, 1834, 1570, 1676, 1391 },
-    {  500,  455,  463, 1507, 1699, 1282, 1564,  982, 2114, 2114 },
-    {  672,  643, 1372,  331, 1589, 1667, 1453, 1938,  996,  876 },
-    {  458,  783, 1037,  911,  738,  968, 1165, 1518,  859, 1033 },
-    {  504,  815,  504, 1139, 1219,  719, 1506, 1085, 1268, 1268 },
-    {  333,  630, 1445, 1239, 1883, 3672,  799, 1548, 1865,  598 },
-    {  399,  644,  746, 1342, 1856, 1350, 1493,  613, 1855, 1015 },
-    {  622,  749, 1205,  608, 1066, 1408, 1290, 1406,  546,  971 },
-    {  500,  753, 1041,  668, 1230, 1617, 1297, 1425, 1383,  523 } },
-  { {  394,  553,  523, 1502, 1536,  981, 1608, 1142, 1666, 2181 },
-    {  655,  430,  375, 1411, 1861, 1220, 1677, 1135, 1978, 1553 },
-    {  690,  640,  245, 1954, 2070, 1194, 1528,  982, 1972, 2232 },
-    {  559,  834,  741,  867, 1131,  980, 1225,  852, 1092,  784 },
-    {  690,  875,  516,  959,  673,  894, 1056, 1190, 1528, 1126 },
-    {  740,  951,  384, 1277, 1177,  492, 1579, 1155, 1846, 1513 },
-    {  323,  775, 1062, 1776, 3062, 1274,  813, 1188, 1372,  655 },
-    {  488,  971,  484, 1767, 1515, 1775, 1115,  503, 1539, 1461 },
-    {  740, 1006,  998,  709,  851, 1230, 1337,  788,  741,  721 },
-    {  522, 1073,  573, 1045, 1346,  887, 1046, 1146, 1203,  697 } },
-  { {  105,  864, 1442, 1009, 1934, 1840, 1519, 1920, 1673, 1579 },
-    {  534,  305, 1193,  683, 1388, 2164, 1802, 1894, 1264, 1170 },
-    {  305,  518,  877, 1108, 1426, 3215, 1425, 1064, 1320, 1242 },
-    {  683,  732, 1927,  257, 1493, 2048, 1858, 1552, 1055,  947 },
-    {  394,  814, 1024,  660,  959, 1556, 1282, 1289,  893, 1047 },
-    {  528,  615,  996,  940, 1201,  635, 1094, 2515,  803, 1358 },
-    {  347,  614, 1609, 1187, 3133, 1345, 1007, 1339, 1017,  667 },
-    {  218,  740,  878, 1605, 3650, 3650, 1345,  758, 1357, 1617 },
-    {  672,  750, 1541,  558, 1257, 1599, 1870, 2135,  402, 1087 },
-    {  592,  684, 1161,  430, 1092, 1497, 1475, 1489, 1095,  822 } },
-  { {  228, 1056, 1059, 1368,  752,  982, 1512, 1518,  987, 1782 },
-    {  494,  514,  818,  942,  965,  892, 1610, 1356, 1048, 1363 },
-    {  512,  648,  591, 1042,  761,  991, 1196, 1454, 1309, 1463 },
-    {  683,  749, 1043,  676,  841, 1396, 1133, 1138,  654,  939 },
-    {  622, 1101, 1126,  994,  361, 1077, 1203, 1318,  877, 1219 },
-    {  631, 1068,  857, 1650,  651,  477, 1650, 1419,  828, 1170 },
-    {  555,  727, 1068, 1335, 3127, 1339,  820, 1331, 1077,  429 },
-    {  504,  879,  624, 1398,  889,  889, 1392,  808,  891, 1406 },
-    {  683, 1602, 1289,  977,  578,  983, 1280, 1708,  406, 1122 },
-    {  399,  865, 1433, 1070, 1072,  764,  968, 1477, 1223,  678 } },
-  { {  333,  760,  935, 1638, 1010,  529, 1646, 1410, 1472, 2219 },
-    {  512,  494,  750, 1160, 1215,  610, 1870, 1868, 1628, 1169 },
-    {  572,  646,  492, 1934, 1208,  603, 1580, 1099, 1398, 1995 },
-    {  786,  789,  942,  581, 1018,  951, 1599, 1207,  731,  768 },
-    {  690, 1015,  672, 1078,  582,  504, 1693, 1438, 1108, 2897 },
-    {  768, 1267,  571, 2005, 1243,  244, 2881, 1380, 1786, 1453 },
-    {  452,  899, 1293,  903, 1311, 3100,  465, 1311, 1319,  813 },
-    {  394,  927,  942, 1103, 1358, 1104,  946,  593, 1363, 1109 },
-    {  559, 1005, 1007, 1016,  658, 1173, 1021, 1164,  623, 1028 },
-    {  564,  796,  632, 1005, 1014,  863, 2316, 1268,  938,  764 } },
-  { {  266,  606, 1098, 1228, 1497, 1243,  948, 1030, 1734, 1461 },
-    {  366,  585,  901, 1060, 1407, 1247,  876, 1134, 1620, 1054 },
-    {  452,  565,  542, 1729, 1479, 1479, 1016,  886, 2938, 1150 },
-    {  555, 1088, 1533,  950, 1354,  895,  834, 1019, 1021,  496 },
-    {  704,  815, 1193,  971,  973,  640, 1217, 2214,  832,  578 },
-    {  672, 1245,  579,  871,  875,  774,  872, 1273, 1027,  949 },
-    {  296, 1134, 2050, 1784, 1636, 3425,  442, 1550, 2076,  722 },
-    {  342,  982, 1259, 1846, 1848, 1848,  622,  568, 1847, 1052 },
-    {  555, 1064, 1304,  828,  746, 1343, 1075, 1329, 1078,  494 },
-    {  288, 1167, 1285, 1174, 1639, 1639,  833, 2254, 1304,  509 } },
-  { {  342,  719,  767, 1866, 1757, 1270, 1246,  550, 1746, 2151 },
-    {  483,  653,  694, 1509, 1459, 1410, 1218,  507, 1914, 1266 },
-    {  488,  757,  447, 2979, 1813, 1268, 1654,  539, 1849, 2109 },
-    {  522, 1097, 1085,  851, 1365, 1111,  851,  901,  961,  605 },
-    {  709,  716,  841,  728,  736,  945,  941,  862, 2845, 1057 },
-    {  512, 1323,  500, 1336, 1083,  681, 1342,  717, 1604, 1350 },
-    {  452, 1155, 1372, 1900, 1501, 3290,  311,  944, 1919,  922 },
-    {  403, 1520,  977, 2132, 1733, 3522, 1076,  276, 3335, 1547 },
-    {  559, 1374, 1101,  615,  673, 2462,  974,  795,  984,  984 },
-    {  547, 1122, 1062,  812, 1410,  951, 1140,  622, 1268,  651 } },
-  { {  165,  982, 1235,  938, 1334, 1366, 1659, 1578,  964, 1612 },
-    {  592,  422,  925,  847, 1139, 1112, 1387, 2036,  861, 1041 },
-    {  403,  837,  732,  770,  941, 1658, 1250,  809, 1407, 1407 },
-    {  896,  874, 1071,  381, 1568, 1722, 1437, 2192,  480, 1035 },
-    {  640, 1098, 1012, 1032,  684, 1382, 1581, 2106,  416,  865 },
-    {  559, 1005,  819,  914,  710,  770, 1418,  920,  838, 1435 },
-    {  415, 1258, 1245,  870, 1278, 3067,  770, 1021, 1287,  522 },
-    {  406,  990,  601, 1009, 1265, 1265, 1267,  759, 1017, 1277 },
-    {  968, 1182, 1329,  788, 1032, 1292, 1705, 1714,  203, 1403 },
-    {  732,  877, 1279,  471,  901, 1161, 1545, 1294,  755,  755 } },
-  { {  111,  931, 1378, 1185, 1933, 1648, 1148, 1714, 1873, 1307 },
-    {  406,  414, 1030, 1023, 1910, 1404, 1313, 1647, 1509,  793 },
-    {  342,  640,  575, 1088, 1241, 1349, 1161, 1350, 1756, 1502 },
-    {  559,  766, 1185,  357, 1682, 1428, 1329, 1897, 1219,  802 },
-    {  473,  909, 1164,  771,  719, 2508, 1427, 1432,  722,  782 },
-    {  342,  892,  785, 1145, 1150,  794, 1296, 1550,  973, 1057 },
-    {  208, 1036, 1326, 1343, 1606, 3395,  815, 1455, 1618,  712 },
-    {  228,  928,  890, 1046, 3499, 1711,  994,  829, 1720, 1318 },
-    {  768,  724, 1058,  636,  991, 1075, 1319, 1324,  616,  825 },
-    {  305, 1167, 1358,  899, 1587, 1587,  987, 1988, 1332,  501 } }
+  { {  251, 1362, 1934, 2085, 2314, 2230, 1839, 1988, 2437, 2348 },
+    {  403,  680, 1507, 1519, 2060, 2005, 1992, 1914, 1924, 1733 },
+    {  353, 1121,  973, 1895, 2060, 1787, 1671, 1516, 2012, 1868 },
+    {  770,  852, 1581,  632, 1393, 1780, 1823, 1936, 1074, 1218 },
+    {  510, 1270, 1467, 1319,  847, 1279, 1792, 2094, 1080, 1353 },
+    {  488, 1322,  918, 1573, 1300,  883, 1814, 1752, 1756, 1502 },
+    {  425,  992, 1820, 1514, 1843, 2440,  937, 1771, 1924, 1129 },
+    {  363, 1248, 1257, 1970, 2194, 2385, 1569,  953, 1951, 1601 },
+    {  723, 1257, 1631,  964,  963, 1508, 1697, 1824,  671, 1418 },
+    {  635, 1038, 1573,  930, 1673, 1413, 1410, 1687, 1410,  749 } },
+  { {  451,  613, 1345, 1702, 1870, 1716, 1728, 1766, 2190, 2310 },
+    {  678,  453, 1171, 1443, 1925, 1831, 2045, 1781, 1887, 1602 },
+    {  711,  666,  674, 1718, 1910, 1493, 1775, 1193, 2325, 2325 },
+    {  883,  854, 1583,  542, 1800, 1878, 1664, 2149, 1207, 1087 },
+    {  669,  994, 1248, 1122,  949, 1179, 1376, 1729, 1070, 1244 },
+    {  715, 1026,  715, 1350, 1430,  930, 1717, 1296, 1479, 1479 },
+    {  544,  841, 1656, 1450, 2094, 3883, 1010, 1759, 2076,  809 },
+    {  610,  855,  957, 1553, 2067, 1561, 1704,  824, 2066, 1226 },
+    {  833,  960, 1416,  819, 1277, 1619, 1501, 1617,  757, 1182 },
+    {  711,  964, 1252,  879, 1441, 1828, 1508, 1636, 1594,  734 } },
+  { {  605,  764,  734, 1713, 1747, 1192, 1819, 1353, 1877, 2392 },
+    {  866,  641,  586, 1622, 2072, 1431, 1888, 1346, 2189, 1764 },
+    {  901,  851,  456, 2165, 2281, 1405, 1739, 1193, 2183, 2443 },
+    {  770, 1045,  952, 1078, 1342, 1191, 1436, 1063, 1303,  995 },
+    {  901, 1086,  727, 1170,  884, 1105, 1267, 1401, 1739, 1337 },
+    {  951, 1162,  595, 1488, 1388,  703, 1790, 1366, 2057, 1724 },
+    {  534,  986, 1273, 1987, 3273, 1485, 1024, 1399, 1583,  866 },
+    {  699, 1182,  695, 1978, 1726, 1986, 1326,  714, 1750, 1672 },
+    {  951, 1217, 1209,  920, 1062, 1441, 1548,  999,  952,  932 },
+    {  733, 1284,  784, 1256, 1557, 1098, 1257, 1357, 1414,  908 } },
+  { {  316, 1075, 1653, 1220, 2145, 2051, 1730, 2131, 1884, 1790 },
+    {  745,  516, 1404,  894, 1599, 2375, 2013, 2105, 1475, 1381 },
+    {  516,  729, 1088, 1319, 1637, 3426, 1636, 1275, 1531, 1453 },
+    {  894,  943, 2138,  468, 1704, 2259, 2069, 1763, 1266, 1158 },
+    {  605, 1025, 1235,  871, 1170, 1767, 1493, 1500, 1104, 1258 },
+    {  739,  826, 1207, 1151, 1412,  846, 1305, 2726, 1014, 1569 },
+    {  558,  825, 1820, 1398, 3344, 1556, 1218, 1550, 1228,  878 },
+    {  429,  951, 1089, 1816, 3861, 3861, 1556,  969, 1568, 1828 },
+    {  883,  961, 1752,  769, 1468, 1810, 2081, 2346,  613, 1298 },
+    {  803,  895, 1372,  641, 1303, 1708, 1686, 1700, 1306, 1033 } },
+  { {  439, 1267, 1270, 1579,  963, 1193, 1723, 1729, 1198, 1993 },
+    {  705,  725, 1029, 1153, 1176, 1103, 1821, 1567, 1259, 1574 },
+    {  723,  859,  802, 1253,  972, 1202, 1407, 1665, 1520, 1674 },
+    {  894,  960, 1254,  887, 1052, 1607, 1344, 1349,  865, 1150 },
+    {  833, 1312, 1337, 1205,  572, 1288, 1414, 1529, 1088, 1430 },
+    {  842, 1279, 1068, 1861,  862,  688, 1861, 1630, 1039, 1381 },
+    {  766,  938, 1279, 1546, 3338, 1550, 1031, 1542, 1288,  640 },
+    {  715, 1090,  835, 1609, 1100, 1100, 1603, 1019, 1102, 1617 },
+    {  894, 1813, 1500, 1188,  789, 1194, 1491, 1919,  617, 1333 },
+    {  610, 1076, 1644, 1281, 1283,  975, 1179, 1688, 1434,  889 } },
+  { {  544,  971, 1146, 1849, 1221,  740, 1857, 1621, 1683, 2430 },
+    {  723,  705,  961, 1371, 1426,  821, 2081, 2079, 1839, 1380 },
+    {  783,  857,  703, 2145, 1419,  814, 1791, 1310, 1609, 2206 },
+    {  997, 1000, 1153,  792, 1229, 1162, 1810, 1418,  942,  979 },
+    {  901, 1226,  883, 1289,  793,  715, 1904, 1649, 1319, 3108 },
+    {  979, 1478,  782, 2216, 1454,  455, 3092, 1591, 1997, 1664 },
+    {  663, 1110, 1504, 1114, 1522, 3311,  676, 1522, 1530, 1024 },
+    {  605, 1138, 1153, 1314, 1569, 1315, 1157,  804, 1574, 1320 },
+    {  770, 1216, 1218, 1227,  869, 1384, 1232, 1375,  834, 1239 },
+    {  775, 1007,  843, 1216, 1225, 1074, 2527, 1479, 1149,  975 } },
+  { {  477,  817, 1309, 1439, 1708, 1454, 1159, 1241, 1945, 1672 },
+    {  577,  796, 1112, 1271, 1618, 1458, 1087, 1345, 1831, 1265 },
+    {  663,  776,  753, 1940, 1690, 1690, 1227, 1097, 3149, 1361 },
+    {  766, 1299, 1744, 1161, 1565, 1106, 1045, 1230, 1232,  707 },
+    {  915, 1026, 1404, 1182, 1184,  851, 1428, 2425, 1043,  789 },
+    {  883, 1456,  790, 1082, 1086,  985, 1083, 1484, 1238, 1160 },
+    {  507, 1345, 2261, 1995, 1847, 3636,  653, 1761, 2287,  933 },
+    {  553, 1193, 1470, 2057, 2059, 2059,  833,  779, 2058, 1263 },
+    {  766, 1275, 1515, 1039,  957, 1554, 1286, 1540, 1289,  705 },
+    {  499, 1378, 1496, 1385, 1850, 1850, 1044, 2465, 1515,  720 } },
+  { {  553,  930,  978, 2077, 1968, 1481, 1457,  761, 1957, 2362 },
+    {  694,  864,  905, 1720, 1670, 1621, 1429,  718, 2125, 1477 },
+    {  699,  968,  658, 3190, 2024, 1479, 1865,  750, 2060, 2320 },
+    {  733, 1308, 1296, 1062, 1576, 1322, 1062, 1112, 1172,  816 },
+    {  920,  927, 1052,  939,  947, 1156, 1152, 1073, 3056, 1268 },
+    {  723, 1534,  711, 1547, 1294,  892, 1553,  928, 1815, 1561 },
+    {  663, 1366, 1583, 2111, 1712, 3501,  522, 1155, 2130, 1133 },
+    {  614, 1731, 1188, 2343, 1944, 3733, 1287,  487, 3546, 1758 },
+    {  770, 1585, 1312,  826,  884, 2673, 1185, 1006, 1195, 1195 },
+    {  758, 1333, 1273, 1023, 1621, 1162, 1351,  833, 1479,  862 } },
+  { {  376, 1193, 1446, 1149, 1545, 1577, 1870, 1789, 1175, 1823 },
+    {  803,  633, 1136, 1058, 1350, 1323, 1598, 2247, 1072, 1252 },
+    {  614, 1048,  943,  981, 1152, 1869, 1461, 1020, 1618, 1618 },
+    { 1107, 1085, 1282,  592, 1779, 1933, 1648, 2403,  691, 1246 },
+    {  851, 1309, 1223, 1243,  895, 1593, 1792, 2317,  627, 1076 },
+    {  770, 1216, 1030, 1125,  921,  981, 1629, 1131, 1049, 1646 },
+    {  626, 1469, 1456, 1081, 1489, 3278,  981, 1232, 1498,  733 },
+    {  617, 1201,  812, 1220, 1476, 1476, 1478,  970, 1228, 1488 },
+    { 1179, 1393, 1540,  999, 1243, 1503, 1916, 1925,  414, 1614 },
+    {  943, 1088, 1490,  682, 1112, 1372, 1756, 1505,  966,  966 } },
+  { {  322, 1142, 1589, 1396, 2144, 1859, 1359, 1925, 2084, 1518 },
+    {  617,  625, 1241, 1234, 2121, 1615, 1524, 1858, 1720, 1004 },
+    {  553,  851,  786, 1299, 1452, 1560, 1372, 1561, 1967, 1713 },
+    {  770,  977, 1396,  568, 1893, 1639, 1540, 2108, 1430, 1013 },
+    {  684, 1120, 1375,  982,  930, 2719, 1638, 1643,  933,  993 },
+    {  553, 1103,  996, 1356, 1361, 1005, 1507, 1761, 1184, 1268 },
+    {  419, 1247, 1537, 1554, 1817, 3606, 1026, 1666, 1829,  923 },
+    {  439, 1139, 1101, 1257, 3710, 1922, 1205, 1040, 1931, 1529 },
+    {  979,  935, 1269,  847, 1202, 1286, 1530, 1535,  827, 1036 },
+    {  516, 1378, 1569, 1110, 1798, 1798, 1198, 2199, 1543,  712 } },
 };
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/cost.h b/drivers/webp/enc/cost.h
index 3cbad1ae4c..09b75b699d 100644
--- a/drivers/webp/enc/cost.h
+++ b/drivers/webp/enc/cost.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Cost tables for level and modes.
@@ -16,12 +14,11 @@
 
 #include "./vp8enci.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-// approximate cost per level:
-extern const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1];
+extern const uint16_t VP8LevelFixedCosts[2048];   // approximate cost per level
 extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)
 
 // Cost of coding one event with probability 'proba'.
@@ -44,7 +41,7 @@ extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/enc/filter.c b/drivers/webp/enc/filter.c
index dd27804b55..7fb78a3949 100644
--- a/drivers/webp/enc/filter.c
+++ b/drivers/webp/enc/filter.c
@@ -1,67 +1,20 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Selecting filter level
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#include <assert.h>
 #include "./vp8enci.h"
 
-// This table gives, for a given sharpness, the filtering strength to be
-// used (at least) in order to filter a given edge step delta.
-// This is constructed by brute force inspection: for all delta, we iterate
-// over all possible filtering strength / thresh until needs_filter() returns
-// true.
-#define MAX_DELTA_SIZE 64
-static const uint8_t kLevelsFromDelta[8][MAX_DELTA_SIZE] = {
-  { 0,   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
-  { 0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 17, 18,
-    20, 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42,
-    44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 16, 17, 19,
-    20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43,
-    44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19,
-    21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43,
-    45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20,
-    21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44,
-    45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 17, 19, 20,
-    22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44,
-    46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19, 21,
-    22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45,
-    46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20, 21,
-    23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45,
-    47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }
-};
-
-int VP8FilterStrengthFromDelta(int sharpness, int delta) {
-  const int pos = (delta < MAX_DELTA_SIZE) ? delta : MAX_DELTA_SIZE - 1;
-  assert(sharpness >= 0 && sharpness <= 7);
-  return kLevelsFromDelta[sharpness][pos];
-}
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
-// -----------------------------------------------------------------------------
 // NOTE: clip1, tables and InitTables are repeated entries of dsp.c
 static uint8_t abs0[255 + 255 + 1];     // abs(i)
 static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
@@ -385,29 +338,28 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
 // loop filter strength
 
 void VP8InitFilter(VP8EncIterator* const it) {
-  if (it->lf_stats_ != NULL) {
-    int s, i;
-    InitTables();
-    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-      for (i = 0; i < MAX_LF_LEVELS; i++) {
-        (*it->lf_stats_)[s][i] = 0;
-      }
+  int s, i;
+  if (!it->lf_stats_) return;
+
+  InitTables();
+  for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+    for (i = 0; i < MAX_LF_LEVELS; i++) {
+      (*it->lf_stats_)[s][i] = 0;
     }
   }
 }
 
 void VP8StoreFilterStats(VP8EncIterator* const it) {
   int d;
-  VP8Encoder* const enc = it->enc_;
   const int s = it->mb_->segment_;
-  const int level0 = enc->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
+  const int level0 = it->enc_->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
 
   // explore +/-quant range of values around level0
-  const int delta_min = -enc->dqm_[s].quant_;
-  const int delta_max = enc->dqm_[s].quant_;
+  const int delta_min = -it->enc_->dqm_[s].quant_;
+  const int delta_max = it->enc_->dqm_[s].quant_;
   const int step_size = (delta_max - delta_min >= 4) ? 4 : 1;
 
-  if (it->lf_stats_ == NULL) return;
+  if (!it->lf_stats_) return;
 
   // NOTE: Currently we are applying filter only across the sublock edges
   // There are two reasons for that.
@@ -431,41 +383,27 @@ void VP8StoreFilterStats(VP8EncIterator* const it) {
 }
 
 void VP8AdjustFilterStrength(VP8EncIterator* const it) {
+  int s;
   VP8Encoder* const enc = it->enc_;
-  if (it->lf_stats_ != NULL) {
-    int s;
-    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-      int i, best_level = 0;
-      // Improvement over filter level 0 should be at least 1e-5 (relatively)
-      double best_v = 1.00001 * (*it->lf_stats_)[s][0];
-      for (i = 1; i < MAX_LF_LEVELS; i++) {
-        const double v = (*it->lf_stats_)[s][i];
-        if (v > best_v) {
-          best_v = v;
-          best_level = i;
-        }
-      }
-      enc->dqm_[s].fstrength_ = best_level;
-    }
-  } else if (enc->config_->filter_strength > 0) {
-    int max_level = 0;
-    int s;
-    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-      VP8SegmentInfo* const dqm = &enc->dqm_[s];
-      // this '>> 3' accounts for some inverse WHT scaling
-      const int delta = (dqm->max_edge_ * dqm->y2_.q_[1]) >> 3;
-      const int level =
-          VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, delta);
-      if (level > dqm->fstrength_) {
-        dqm->fstrength_ = level;
-      }
-      if (max_level < dqm->fstrength_) {
-        max_level = dqm->fstrength_;
+
+  if (!it->lf_stats_) {
+    return;
+  }
+  for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+    int i, best_level = 0;
+    // Improvement over filter level 0 should be at least 1e-5 (relatively)
+    double best_v = 1.00001 * (*it->lf_stats_)[s][0];
+    for (i = 1; i < MAX_LF_LEVELS; i++) {
+      const double v = (*it->lf_stats_)[s][i];
+      if (v > best_v) {
+        best_v = v;
+        best_level = i;
       }
     }
-    enc->filter_hdr_.level_ = max_level;
+    enc->dqm_[s].fstrength_ = best_level;
   }
 }
 
-// -----------------------------------------------------------------------------
-
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/frame.c b/drivers/webp/enc/frame.c
index 2582244c6c..bdd360069b 100644
--- a/drivers/webp/enc/frame.c
+++ b/drivers/webp/enc/frame.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //   frame coding and analysis
@@ -18,7 +16,10 @@
 
 #include "./vp8enci.h"
 #include "./cost.h"
-#include "../webp/format_constants.h"  // RIFF constants
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 #define SEGMENT_VISU 0
 #define DEBUG_SEARCH 0    // useful to track search convergence
@@ -37,63 +38,6 @@ typedef struct {
 } VP8Residual;
 
 //------------------------------------------------------------------------------
-// multi-pass convergence
-
-#define HEADER_SIZE_ESTIMATE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE +  \
-                              VP8_FRAME_HEADER_SIZE)
-#define DQ_LIMIT 0.4  // convergence is considered reached if dq < DQ_LIMIT
-// we allow 2k of extra head-room in PARTITION0 limit.
-#define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
-
-typedef struct {  // struct for organizing convergence in either size or PSNR
-  int is_first;
-  float dq;
-  float q, last_q;
-  double value, last_value;   // PSNR or size
-  double target;
-  int do_size_search;
-} PassStats;
-
-static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
-  const uint64_t target_size = (uint64_t)enc->config_->target_size;
-  const int do_size_search = (target_size != 0);
-  const float target_PSNR = enc->config_->target_PSNR;
-
-  s->is_first = 1;
-  s->dq = 10.f;
-  s->q = s->last_q = enc->config_->quality;
-  s->target = do_size_search ? (double)target_size
-            : (target_PSNR > 0.) ? target_PSNR
-            : 40.;   // default, just in case
-  s->value = s->last_value = 0.;
-  s->do_size_search = do_size_search;
-  return do_size_search;
-}
-
-static float Clamp(float v, float min, float max) {
-  return (v < min) ? min : (v > max) ? max : v;
-}
-
-static float ComputeNextQ(PassStats* const s) {
-  float dq;
-  if (s->is_first) {
-    dq = (s->value > s->target) ? -s->dq : s->dq;
-    s->is_first = 0;
-  } else if (s->value != s->last_value) {
-    const double slope = (s->target - s->value) / (s->last_value - s->value);
-    dq = (float)(slope * (s->last_q - s->q));
-  } else {
-    dq = 0.;  // we're done?!
-  }
-  // Limit variable to avoid large swings.
-  s->dq = Clamp(dq, -30.f, 30.f);
-  s->last_q = s->q;
-  s->last_value = s->value;
-  s->q = Clamp(s->q + s->dq, 0.f, 100.f);
-  return s->q;
-}
-
-//------------------------------------------------------------------------------
 // Tables for level coding
 
 const uint8_t VP8EncBands[16 + 1] = {
@@ -101,10 +45,10 @@ const uint8_t VP8EncBands[16 + 1] = {
   0  // sentinel
 };
 
-const uint8_t VP8Cat3[] = { 173, 148, 140 };
-const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
-const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
-const uint8_t VP8Cat6[] =
+static const uint8_t kCat3[] = { 173, 148, 140 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130 };
+static const uint8_t kCat6[] =
     { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
 
 //------------------------------------------------------------------------------
@@ -169,15 +113,14 @@ static int Record(int bit, proba_t* const stats) {
 // Note: no need to record the fixed probas.
 static int RecordCoeffs(int ctx, const VP8Residual* const res) {
   int n = res->first;
-  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  proba_t* s = res->stats[n][ctx];
+  proba_t* s = res->stats[VP8EncBands[n]][ctx];
   if (res->last  < 0) {
     Record(0, s + 0);
     return 0;
   }
   while (n <= res->last) {
     int v;
-    Record(1, s + 0);  // order of record doesn't matter
+    Record(1, s + 0);
     while ((v = res->coeffs[n++]) == 0) {
       Record(0, s + 1);
       s = res->stats[VP8EncBands[n]][0];
@@ -231,7 +174,8 @@ static int BranchCost(int nb, int total, int proba) {
   return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
 }
 
-static int FinalizeTokenProbas(VP8Proba* const proba) {
+static int FinalizeTokenProbas(VP8Encoder* const enc) {
+  VP8Proba* const proba = &enc->proba_;
   int has_changed = 0;
   int size = 0;
   int t, b, c, p;
@@ -268,47 +212,6 @@ static int FinalizeTokenProbas(VP8Proba* const proba) {
 }
 
 //------------------------------------------------------------------------------
-// Finalize Segment probability based on the coding tree
-
-static int GetProba(int a, int b) {
-  const int total = a + b;
-  return (total == 0) ? 255     // that's the default probability.
-                      : (255 * a + total / 2) / total;  // rounded proba
-}
-
-static void SetSegmentProbas(VP8Encoder* const enc) {
-  int p[NUM_MB_SEGMENTS] = { 0 };
-  int n;
-
-  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
-    const VP8MBInfo* const mb = &enc->mb_info_[n];
-    p[mb->segment_]++;
-  }
-  if (enc->pic_->stats != NULL) {
-    for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
-      enc->pic_->stats->segment_size[n] = p[n];
-    }
-  }
-  if (enc->segment_hdr_.num_segments_ > 1) {
-    uint8_t* const probas = enc->proba_.segments_;
-    probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
-    probas[1] = GetProba(p[0], p[1]);
-    probas[2] = GetProba(p[2], p[3]);
-
-    enc->segment_hdr_.update_map_ =
-        (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
-    enc->segment_hdr_.size_ =
-        p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
-        p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
-        p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
-        p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
-  } else {
-    enc->segment_hdr_.update_map_ = 0;
-    enc->segment_hdr_.size_ = 0;
-  }
-}
-
-//------------------------------------------------------------------------------
 // helper functions for residuals struct VP8Residual.
 
 static void InitResidual(int first, int coeff_type,
@@ -336,38 +239,39 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
 //------------------------------------------------------------------------------
 // Mode costs
 
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost(int ctx, const VP8Residual* const res) {
   int n = res->first;
-  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
+  int p0 = res->prob[VP8EncBands[n]][ctx][0];
+  const uint16_t* t = res->cost[VP8EncBands[n]][ctx];
   int cost;
 
   if (res->last < 0) {
     return VP8BitCost(0, p0);
   }
-  cost = VP8BitCost(1, p0);
-  for (; n < res->last; ++n) {
-    const int v = abs(res->coeffs[n]);
+  cost = 0;
+  while (n <= res->last) {
+    const int v = res->coeffs[n];
     const int b = VP8EncBands[n + 1];
-    const int ctx = (v >= 2) ? 2 : v;
-    cost += VP8LevelCost(t, v);
-    t = res->cost[b][ctx];
-    // the masking trick is faster than "if (v) cost += ..." with clang
-    cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]);
-  }
-  // Last coefficient is always non-zero
-  {
-    const int v = abs(res->coeffs[n]);
-    assert(v != 0);
-    cost += VP8LevelCost(t, v);
-    if (n < 15) {
-      const int b = VP8EncBands[n + 1];
-      const int ctx = (v == 1) ? 1 : 2;
-      const int last_p0 = res->prob[b][ctx][0];
-      cost += VP8BitCost(0, last_p0);
+    ++n;
+    if (v == 0) {
+      // short-case for VP8LevelCost(t, 0) (note: VP8LevelFixedCosts[0] == 0):
+      cost += t[0];
+      t = res->cost[b][0];
+      continue;
+    }
+    cost += VP8BitCost(1, p0);
+    if (2u >= (unsigned int)(v + 1)) {   // v = -1 or 1
+      // short-case for "VP8LevelCost(t, 1)" (256 is VP8LevelFixedCosts[1]):
+      cost += 256 + t[1];
+      p0 = res->prob[b][1][0];
+      t = res->cost[b][1];
+    } else {
+      cost += VP8LevelCost(t, abs(v));
+      p0 = res->prob[b][2][0];
+      t = res->cost[b][2];
     }
   }
+  if (n < 16) cost += VP8BitCost(0, p0);
   return cost;
 }
 
@@ -438,8 +342,7 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
 
 static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
   int n = res->first;
-  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  const uint8_t* p = res->prob[n][ctx];
+  const uint8_t* p = res->prob[VP8EncBands[n]][ctx];
   if (!VP8PutBit(bw, res->last >= 0, p[0])) {
     return 0;
   }
@@ -468,30 +371,30 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
       } else {
         int mask;
         const uint8_t* tab;
-        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
+        if (v < 3 + (8 << 1)) {          // kCat3  (3b)
           VP8PutBit(bw, 0, p[8]);
           VP8PutBit(bw, 0, p[9]);
           v -= 3 + (8 << 0);
           mask = 1 << 2;
-          tab = VP8Cat3;
-        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
+          tab = kCat3;
+        } else if (v < 3 + (8 << 2)) {   // kCat4  (4b)
           VP8PutBit(bw, 0, p[8]);
           VP8PutBit(bw, 1, p[9]);
           v -= 3 + (8 << 1);
           mask = 1 << 3;
-          tab = VP8Cat4;
-        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
+          tab = kCat4;
+        } else if (v < 3 + (8 << 3)) {   // kCat5  (5b)
           VP8PutBit(bw, 1, p[8]);
           VP8PutBit(bw, 0, p[10]);
           v -= 3 + (8 << 2);
           mask = 1 << 4;
-          tab = VP8Cat5;
-        } else {                         // VP8Cat6 (11b)
+          tab = kCat5;
+        } else {                         // kCat6 (11b)
           VP8PutBit(bw, 1, p[8]);
           VP8PutBit(bw, 1, p[10]);
           v -= 3 + (8 << 3);
           mask = 1 << 10;
-          tab = VP8Cat6;
+          tab = kCat6;
         }
         while (mask) {
           VP8PutBit(bw, !!(v & mask), *tab++);
@@ -508,7 +411,8 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
   return 1;
 }
 
-static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
+static void CodeResiduals(VP8BitWriter* const bw,
+                          VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
   int x, y, ch;
   VP8Residual res;
@@ -608,23 +512,146 @@ static void RecordResiduals(VP8EncIterator* const it,
 //------------------------------------------------------------------------------
 // Token buffer
 
-#if !defined(DISABLE_TOKEN_BUFFER)
+#ifdef USE_TOKEN_BUFFER
+
+void VP8TBufferInit(VP8TBuffer* const b) {
+  b->rows_ = NULL;
+  b->tokens_ = NULL;
+  b->last_ = &b->rows_;
+  b->left_ = 0;
+  b->error_ = 0;
+}
+
+int VP8TBufferNewPage(VP8TBuffer* const b) {
+  VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
+  if (page == NULL) {
+    b->error_ = 1;
+    return 0;
+  }
+  *b->last_ = page;
+  b->last_ = &page->next_;
+  b->left_ = MAX_NUM_TOKEN;
+  b->tokens_ = page->tokens_;
+  return 1;
+}
+
+void VP8TBufferClear(VP8TBuffer* const b) {
+  if (b != NULL) {
+    const VP8Tokens* p = b->rows_;
+    while (p != NULL) {
+      const VP8Tokens* const next = p->next_;
+      free((void*)p);
+      p = next;
+    }
+    VP8TBufferInit(b);
+  }
+}
+
+int VP8EmitTokens(const VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas) {
+  VP8Tokens* p = b->rows_;
+  if (b->error_) return 0;
+  while (p != NULL) {
+    const int N = (p->next_ == NULL) ? b->left_ : 0;
+    int n = MAX_NUM_TOKEN;
+    while (n-- > N) {
+      VP8PutBit(bw, (p->tokens_[n] >> 15) & 1, probas[p->tokens_[n] & 0x7fff]);
+    }
+    p = p->next_;
+  }
+  return 1;
+}
+
+#define TOKEN_ID(b, ctx, p) ((p) + NUM_PROBAS * ((ctx) + (b) * NUM_CTX))
+
+static int RecordCoeffTokens(int ctx, const VP8Residual* const res,
+                             VP8TBuffer* tokens) {
+  int n = res->first;
+  int b = VP8EncBands[n];
+  if (!VP8AddToken(tokens, res->last >= 0, TOKEN_ID(b, ctx, 0))) {
+    return 0;
+  }
+
+  while (n < 16) {
+    const int c = res->coeffs[n++];
+    const int sign = c < 0;
+    int v = sign ? -c : c;
+    const int base_id = TOKEN_ID(b, ctx, 0);
+    if (!VP8AddToken(tokens, v != 0, base_id + 1)) {
+      b = VP8EncBands[n];
+      ctx = 0;
+      continue;
+    }
+    if (!VP8AddToken(tokens, v > 1, base_id + 2)) {
+      b = VP8EncBands[n];
+      ctx = 1;
+    } else {
+      if (!VP8AddToken(tokens, v > 4, base_id + 3)) {
+        if (VP8AddToken(tokens, v != 2, base_id + 4))
+          VP8AddToken(tokens, v == 4, base_id + 5);
+      } else if (!VP8AddToken(tokens, v > 10, base_id + 6)) {
+        if (!VP8AddToken(tokens, v > 6, base_id + 7)) {
+//          VP8AddToken(tokens, v == 6, 159);
+        } else {
+//          VP8AddToken(tokens, v >= 9, 165);
+//          VP8AddToken(tokens, !(v & 1), 145);
+        }
+      } else {
+        int mask;
+        const uint8_t* tab;
+        if (v < 3 + (8 << 1)) {          // kCat3  (3b)
+          VP8AddToken(tokens, 0, base_id + 8);
+          VP8AddToken(tokens, 0, base_id + 9);
+          v -= 3 + (8 << 0);
+          mask = 1 << 2;
+          tab = kCat3;
+        } else if (v < 3 + (8 << 2)) {   // kCat4  (4b)
+          VP8AddToken(tokens, 0, base_id + 8);
+          VP8AddToken(tokens, 1, base_id + 9);
+          v -= 3 + (8 << 1);
+          mask = 1 << 3;
+          tab = kCat4;
+        } else if (v < 3 + (8 << 3)) {   // kCat5  (5b)
+          VP8AddToken(tokens, 1, base_id + 8);
+          VP8AddToken(tokens, 0, base_id + 10);
+          v -= 3 + (8 << 2);
+          mask = 1 << 4;
+          tab = kCat5;
+        } else {                         // kCat6 (11b)
+          VP8AddToken(tokens, 1, base_id + 8);
+          VP8AddToken(tokens, 1, base_id + 10);
+          v -= 3 + (8 << 3);
+          mask = 1 << 10;
+          tab = kCat6;
+        }
+        while (mask) {
+          // VP8AddToken(tokens, !!(v & mask), *tab++);
+          mask >>= 1;
+        }
+      }
+      ctx = 2;
+    }
+    b = VP8EncBands[n];
+    // VP8PutBitUniform(bw, sign);
+    if (n == 16 || !VP8AddToken(tokens, n <= res->last, TOKEN_ID(b, ctx, 0))) {
+      return 1;   // EOB
+    }
+  }
+  return 1;
+}
 
-static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
-                         VP8TBuffer* const tokens) {
+static void RecordTokens(VP8EncIterator* const it,
+                         const VP8ModeScore* const rd, VP8TBuffer tokens[2]) {
   int x, y, ch;
   VP8Residual res;
   VP8Encoder* const enc = it->enc_;
 
   VP8IteratorNzToBytes(it);
   if (it->mb_->type_ == 1) {   // i16x16
-    const int ctx = it->top_nz_[8] + it->left_nz_[8];
     InitResidual(0, 1, enc, &res);
     SetResidualCoeffs(rd->y_dc_levels, &res);
-    it->top_nz_[8] = it->left_nz_[8] =
-        VP8RecordCoeffTokens(ctx, 1,
-                             res.first, res.last, res.coeffs, tokens);
-    RecordCoeffs(ctx, &res);
+// TODO(skal): FIX ->    it->top_nz_[8] = it->left_nz_[8] =
+      RecordCoeffTokens(it->top_nz_[8] + it->left_nz_[8], &res, &tokens[0]);
     InitResidual(1, 0, enc, &res);
   } else {
     InitResidual(0, 3, enc, &res);
@@ -636,9 +663,7 @@ static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
       const int ctx = it->top_nz_[x] + it->left_nz_[y];
       SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
       it->top_nz_[x] = it->left_nz_[y] =
-          VP8RecordCoeffTokens(ctx, res.coeff_type,
-                               res.first, res.last, res.coeffs, tokens);
-      RecordCoeffs(ctx, &res);
+          RecordCoeffTokens(ctx, &res, &tokens[0]);
     }
   }
 
@@ -650,16 +675,13 @@ static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
         const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
         SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
         it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
-            VP8RecordCoeffTokens(ctx, 2,
-                                 res.first, res.last, res.coeffs, tokens);
-        RecordCoeffs(ctx, &res);
+            RecordCoeffTokens(ctx, &res, &tokens[1]);
       }
     }
   }
-  VP8IteratorBytesToNz(it);
 }
 
-#endif    // !DISABLE_TOKEN_BUFFER
+#endif    // USE_TOKEN_BUFFER
 
 //------------------------------------------------------------------------------
 // ExtraInfo map / Debug function
@@ -675,10 +697,7 @@ static void SetBlock(uint8_t* p, int value, int size) {
 #endif
 
 static void ResetSSE(VP8Encoder* const enc) {
-  enc->sse_[0] = 0;
-  enc->sse_[1] = 0;
-  enc->sse_[2] = 0;
-  // Note: enc->sse_[3] is managed by alpha.c
+  memset(enc->sse_, 0, sizeof(enc->sse_));
   enc->sse_count_ = 0;
 }
 
@@ -717,7 +736,6 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
         const int b = (int)((it->luma_bits_ + it->uv_bits_ + 7) >> 3);
         *info = (b > 255) ? 255 : b; break;
       }
-      case 7: *info = mb->alpha_; break;
       default: *info = 0; break;
     };
   }
@@ -728,149 +746,62 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }
 
-static double GetPSNR(uint64_t mse, uint64_t size) {
-  return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
-}
-
 //------------------------------------------------------------------------------
-//  StatLoop(): only collect statistics (number of skips, token usage, ...).
-//  This is used for deciding optimal probabilities. It also modifies the
-//  quantizer value if some target (size, PSNR) was specified.
+// Main loops
+//
+//  VP8EncLoop(): does the final bitstream coding.
 
-static void SetLoopParams(VP8Encoder* const enc, float q) {
-  // Make sure the quality parameter is inside valid bounds
-  q = Clamp(q, 0.f, 100.f);
+static void ResetAfterSkip(VP8EncIterator* const it) {
+  if (it->mb_->type_ == 1) {
+    *it->nz_ = 0;  // reset all predictors
+    it->left_nz_[8] = 0;
+  } else {
+    *it->nz_ &= (1 << 24);  // preserve the dc_nz bit
+  }
+}
 
-  VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
-  SetSegmentProbas(enc);            // compute segment probabilities
+int VP8EncLoop(VP8Encoder* const enc) {
+  int i, s, p;
+  int ok = 1;
+  VP8EncIterator it;
+  VP8ModeScore info;
+  const int dont_use_skip = !enc->proba_.use_skip_proba_;
+  const int rd_opt = enc->rd_opt_level_;
+  const int kAverageBytesPerMB = 5;     // TODO: have a kTable[quality/10]
+  const int bytes_per_parts =
+    enc->mb_w_ * enc->mb_h_ * kAverageBytesPerMB / enc->num_parts_;
+
+  // Initialize the bit-writers
+  for (p = 0; p < enc->num_parts_; ++p) {
+    VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
+  }
 
   ResetStats(enc);
   ResetSSE(enc);
-}
-
-static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
-                            int nb_mbs, int percent_delta,
-                            PassStats* const s) {
-  VP8EncIterator it;
-  uint64_t size = 0;
-  uint64_t size_p0 = 0;
-  uint64_t distortion = 0;
-  const uint64_t pixel_count = nb_mbs * 384;
 
   VP8IteratorInit(enc, &it);
-  SetLoopParams(enc, s->q);
+  VP8InitFilter(&it);
   do {
-    VP8ModeScore info;
-    VP8IteratorImport(&it, NULL);
-    if (VP8Decimate(&it, &info, rd_opt)) {
-      // Just record the number of skips and act like skip_proba is not used.
-      enc->proba_.nb_skip_++;
+    VP8IteratorImport(&it);
+    // Warning! order is important: first call VP8Decimate() and
+    // *then* decide how to code the skip decision if there's one.
+    if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
+      CodeResiduals(it.bw_, &it, &info);
+    } else {   // reset predictors after a skip
+      ResetAfterSkip(&it);
     }
-    RecordResiduals(&it, &info);
-    size += info.R + info.H;
-    size_p0 += info.H;
-    distortion += info.D;
-    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
-      return 0;
-    VP8IteratorSaveBoundary(&it);
-  } while (VP8IteratorNext(&it) && --nb_mbs > 0);
-
-  size_p0 += enc->segment_hdr_.size_;
-  if (s->do_size_search) {
-    size += FinalizeSkipProba(enc);
-    size += FinalizeTokenProbas(&enc->proba_);
-    size = ((size + size_p0 + 1024) >> 11) + HEADER_SIZE_ESTIMATE;
-    s->value = (double)size;
-  } else {
-    s->value = GetPSNR(distortion, pixel_count);
-  }
-  return size_p0;
-}
-
-static int StatLoop(VP8Encoder* const enc) {
-  const int method = enc->method_;
-  const int do_search = enc->do_search_;
-  const int fast_probe = ((method == 0 || method == 3) && !do_search);
-  int num_pass_left = enc->config_->pass;
-  const int task_percent = 20;
-  const int percent_per_pass =
-      (task_percent + num_pass_left / 2) / num_pass_left;
-  const int final_percent = enc->percent_ + task_percent;
-  const VP8RDLevel rd_opt =
-      (method >= 3 || do_search) ? RD_OPT_BASIC : RD_OPT_NONE;
-  int nb_mbs = enc->mb_w_ * enc->mb_h_;
-  PassStats stats;
-
-  InitPassStats(enc, &stats);
-  ResetTokenStats(enc);
-
-  // Fast mode: quick analysis pass over few mbs. Better than nothing.
-  if (fast_probe) {
-    if (method == 3) {  // we need more stats for method 3 to be reliable.
-      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
-    } else {
-      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 2 : 50;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (enc->use_layer_) {
+      VP8EncCodeLayerBlock(&it);
     }
-  }
-
-  while (num_pass_left-- > 0) {
-    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
-                             (num_pass_left == 0) ||
-                             (enc->max_i4_header_bits_ == 0);
-    const uint64_t size_p0 =
-        OneStatPass(enc, rd_opt, nb_mbs, percent_per_pass, &stats);
-    if (size_p0 == 0) return 0;
-#if (DEBUG_SEARCH > 0)
-    printf("#%d value:%.1lf -> %.1lf   q:%.2f -> %.2f\n",
-           num_pass_left, stats.last_value, stats.value, stats.last_q, stats.q);
 #endif
-    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
-      ++num_pass_left;
-      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
-      continue;                        // ...and start over
-    }
-    if (is_last_pass) {
-      break;
-    }
-    // If no target size: just do several pass without changing 'q'
-    if (do_search) {
-      ComputeNextQ(&stats);
-      if (fabs(stats.dq) <= DQ_LIMIT) break;
-    }
-  }
-  if (!do_search || !stats.do_size_search) {
-    // Need to finalize probas now, since it wasn't done during the search.
-    FinalizeSkipProba(enc);
-    FinalizeTokenProbas(&enc->proba_);
-  }
-  VP8CalculateLevelCosts(&enc->proba_);  // finalize costs
-  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
-}
-
-//------------------------------------------------------------------------------
-// Main loops
-//
-
-static const int kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
-
-static int PreLoopInitialize(VP8Encoder* const enc) {
-  int p;
-  int ok = 1;
-  const int average_bytes_per_MB = kAverageBytesPerMB[enc->base_quant_ >> 4];
-  const int bytes_per_parts =
-      enc->mb_w_ * enc->mb_h_ * average_bytes_per_MB / enc->num_parts_;
-  // Initialize the bit-writers
-  for (p = 0; ok && p < enc->num_parts_; ++p) {
-    ok = VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
-  }
-  if (!ok) VP8EncFreeBitWriters(enc);  // malloc error occurred
-  return ok;
-}
+    StoreSideInfo(&it);
+    VP8StoreFilterStats(&it);
+    VP8IteratorExport(&it);
+    ok = VP8IteratorProgress(&it, 20);
+  } while (ok && VP8IteratorNext(&it, it.yuv_out_));
 
-static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
-  VP8Encoder* const enc = it->enc_;
   if (ok) {      // Finalize the partitions, check for extra errors.
-    int p;
     for (p = 0; p < enc->num_parts_; ++p) {
       VP8BitWriterFinish(enc->parts_ + p);
       ok &= !enc->parts_[p].error_;
@@ -878,191 +809,131 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
   }
 
   if (ok) {      // All good. Finish up.
-    if (enc->pic_->stats != NULL) {  // finalize byte counters...
-      int i, s;
+    if (enc->pic_->stats) {           // finalize byte counters...
       for (i = 0; i <= 2; ++i) {
         for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-          enc->residual_bytes_[i][s] = (int)((it->bit_count_[s][i] + 7) >> 3);
+          enc->residual_bytes_[i][s] = (int)((it.bit_count_[s][i] + 7) >> 3);
         }
       }
     }
-    VP8AdjustFilterStrength(it);     // ...and store filter stats.
+    VP8AdjustFilterStrength(&it);     // ...and store filter stats.
   } else {
     // Something bad happened -> need to do some memory cleanup.
     VP8EncFreeBitWriters(enc);
   }
+
   return ok;
 }
 
 //------------------------------------------------------------------------------
-//  VP8EncLoop(): does the final bitstream coding.
+//  VP8StatLoop(): only collect statistics (number of skips, token usage, ...)
+//                 This is used for deciding optimal probabilities. It also
+//                 modifies the quantizer value if some target (size, PSNR)
+//                 was specified.
 
-static void ResetAfterSkip(VP8EncIterator* const it) {
-  if (it->mb_->type_ == 1) {
-    *it->nz_ = 0;  // reset all predictors
-    it->left_nz_[8] = 0;
-  } else {
-    *it->nz_ &= (1 << 24);  // preserve the dc_nz bit
-  }
-}
+#define kHeaderSizeEstimate (15 + 20 + 10)      // TODO: fix better
 
-int VP8EncLoop(VP8Encoder* const enc) {
+static int OneStatPass(VP8Encoder* const enc, float q, int rd_opt, int nb_mbs,
+                       float* const PSNR, int percent_delta) {
   VP8EncIterator it;
-  int ok = PreLoopInitialize(enc);
-  if (!ok) return 0;
+  uint64_t size = 0;
+  uint64_t distortion = 0;
+  const uint64_t pixel_count = nb_mbs * 384;
 
-  StatLoop(enc);  // stats-collection loop
+  // Make sure the quality parameter is inside valid bounds
+  if (q < 0.) {
+    q = 0;
+  } else if (q > 100.) {
+    q = 100;
+  }
+
+  VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
+
+  ResetStats(enc);
+  ResetTokenStats(enc);
 
   VP8IteratorInit(enc, &it);
-  VP8InitFilter(&it);
   do {
     VP8ModeScore info;
-    const int dont_use_skip = !enc->proba_.use_skip_proba_;
-    const VP8RDLevel rd_opt = enc->rd_opt_level_;
-
-    VP8IteratorImport(&it, NULL);
-    // Warning! order is important: first call VP8Decimate() and
-    // *then* decide how to code the skip decision if there's one.
-    if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
-      CodeResiduals(it.bw_, &it, &info);
-    } else {   // reset predictors after a skip
-      ResetAfterSkip(&it);
-    }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (enc->use_layer_) {
-      VP8EncCodeLayerBlock(&it);
+    VP8IteratorImport(&it);
+    if (VP8Decimate(&it, &info, rd_opt)) {
+      // Just record the number of skips and act like skip_proba is not used.
+      enc->proba_.nb_skip_++;
     }
-#endif
-    StoreSideInfo(&it);
-    VP8StoreFilterStats(&it);
-    VP8IteratorExport(&it);
-    ok = VP8IteratorProgress(&it, 20);
-    VP8IteratorSaveBoundary(&it);
-  } while (ok && VP8IteratorNext(&it));
+    RecordResiduals(&it, &info);
+    size += info.R;
+    distortion += info.D;
+    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
+      return 0;
+  } while (VP8IteratorNext(&it, it.yuv_out_) && --nb_mbs > 0);
+  size += FinalizeSkipProba(enc);
+  size += FinalizeTokenProbas(enc);
+  size += enc->segment_hdr_.size_;
+  size = ((size + 1024) >> 11) + kHeaderSizeEstimate;
 
-  return PostLoopFinalize(&it, ok);
+  if (PSNR) {
+    *PSNR = (float)(10.* log10(255. * 255. * pixel_count / distortion));
+  }
+  return (int)size;
 }
 
-//------------------------------------------------------------------------------
-// Single pass using Token Buffer.
-
-#if !defined(DISABLE_TOKEN_BUFFER)
+// successive refinement increments.
+static const int dqs[] = { 20, 15, 10, 8, 6, 4, 2, 1, 0 };
 
-#define MIN_COUNT 96  // minimum number of macroblocks before updating stats
+int VP8StatLoop(VP8Encoder* const enc) {
+  const int do_search =
+    (enc->config_->target_size > 0 || enc->config_->target_PSNR > 0);
+  const int fast_probe = (enc->method_ < 2 && !do_search);
+  float q = enc->config_->quality;
+  const int max_passes = enc->config_->pass;
+  const int task_percent = 20;
+  const int percent_per_pass = (task_percent + max_passes / 2) / max_passes;
+  const int final_percent = enc->percent_ + task_percent;
+  int pass;
+  int nb_mbs;
 
-int VP8EncTokenLoop(VP8Encoder* const enc) {
-  // Roughly refresh the proba eight times per pass
-  int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
-  int num_pass_left = enc->config_->pass;
-  const int do_search = enc->do_search_;
-  VP8EncIterator it;
-  VP8Proba* const proba = &enc->proba_;
-  const VP8RDLevel rd_opt = enc->rd_opt_level_;
-  const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
-  PassStats stats;
-  int ok;
-
-  InitPassStats(enc, &stats);
-  ok = PreLoopInitialize(enc);
-  if (!ok) return 0;
-
-  if (max_count < MIN_COUNT) max_count = MIN_COUNT;
-
-  assert(enc->num_parts_ == 1);
-  assert(enc->use_tokens_);
-  assert(proba->use_skip_proba_ == 0);
-  assert(rd_opt >= RD_OPT_BASIC);   // otherwise, token-buffer won't be useful
-  assert(num_pass_left > 0);
-
-  while (ok && num_pass_left-- > 0) {
-    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
-                             (num_pass_left == 0) ||
-                             (enc->max_i4_header_bits_ == 0);
-    uint64_t size_p0 = 0;
-    uint64_t distortion = 0;
-    int cnt = max_count;
-    VP8IteratorInit(enc, &it);
-    SetLoopParams(enc, stats.q);
-    if (is_last_pass) {
-      ResetTokenStats(enc);
-      VP8InitFilter(&it);  // don't collect stats until last pass (too costly)
-    }
-    VP8TBufferClear(&enc->tokens_);
-    do {
-      VP8ModeScore info;
-      VP8IteratorImport(&it, NULL);
-      if (--cnt < 0) {
-        FinalizeTokenProbas(proba);
-        VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
-        cnt = max_count;
-      }
-      VP8Decimate(&it, &info, rd_opt);
-      RecordTokens(&it, &info, &enc->tokens_);
-      size_p0 += info.H;
-      distortion += info.D;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      if (enc->use_layer_) {
-        VP8EncCodeLayerBlock(&it);
-      }
-#endif
-      if (is_last_pass) {
-        StoreSideInfo(&it);
-        VP8StoreFilterStats(&it);
-        VP8IteratorExport(&it);
-        ok = VP8IteratorProgress(&it, 20);
+  // Fast mode: quick analysis pass over few mbs. Better than nothing.
+  nb_mbs = enc->mb_w_ * enc->mb_h_;
+  if (fast_probe && nb_mbs > 100) nb_mbs = 100;
+
+  // No target size: just do several passes without changing 'q'
+  if (!do_search) {
+    for (pass = 0; pass < max_passes; ++pass) {
+      const int rd_opt = (enc->method_ > 2);
+      if (!OneStatPass(enc, q, rd_opt, nb_mbs, NULL, percent_per_pass)) {
+        return 0;
       }
-      VP8IteratorSaveBoundary(&it);
-    } while (ok && VP8IteratorNext(&it));
-    if (!ok) break;
-
-    size_p0 += enc->segment_hdr_.size_;
-    if (stats.do_size_search) {
-      uint64_t size = FinalizeTokenProbas(&enc->proba_);
-      size += VP8EstimateTokenSize(&enc->tokens_,
-                                   (const uint8_t*)proba->coeffs_);
-      size = (size + size_p0 + 1024) >> 11;  // -> size in bytes
-      size += HEADER_SIZE_ESTIMATE;
-      stats.value = (double)size;
-    } else {  // compute and store PSNR
-      stats.value = GetPSNR(distortion, pixel_count);
     }
-
-#if (DEBUG_SEARCH > 0)
-    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf\n",
-           num_pass_left, stats.last_value, stats.value,
-           stats.last_q, stats.q, stats.dq);
+  } else {
+    // binary search for a size close to target
+    for (pass = 0; pass < max_passes && (dqs[pass] > 0); ++pass) {
+      const int rd_opt = 1;
+      float PSNR;
+      int criterion;
+      const int size = OneStatPass(enc, q, rd_opt, nb_mbs, &PSNR,
+                                   percent_per_pass);
+#if DEBUG_SEARCH
+      printf("#%d size=%d PSNR=%.2f q=%.2f\n", pass, size, PSNR, q);
 #endif
-    if (size_p0 > PARTITION0_SIZE_LIMIT) {
-      ++num_pass_left;
-      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
-      continue;                        // ...and start over
-    }
-    if (is_last_pass) {
-      break;   // done
-    }
-    if (do_search) {
-      ComputeNextQ(&stats);  // Adjust q
-    }
-  }
-  if (ok) {
-    if (!stats.do_size_search) {
-      FinalizeTokenProbas(&enc->proba_);
+      if (!size) return 0;
+      if (enc->config_->target_PSNR > 0) {
+        criterion = (PSNR < enc->config_->target_PSNR);
+      } else {
+        criterion = (size < enc->config_->target_size);
+      }
+      // dichotomize
+      if (criterion) {
+        q += dqs[pass];
+      } else {
+        q -= dqs[pass];
+      }
     }
-    ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
-                       (const uint8_t*)proba->coeffs_, 1);
   }
-  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
-  return PostLoopFinalize(&it, ok);
-}
-
-#else
-
-int VP8EncTokenLoop(VP8Encoder* const enc) {
-  (void)enc;
-  return 0;   // we shouldn't be here.
+  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
 }
 
-#endif    // DISABLE_TOKEN_BUFFER
-
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/histogram.c b/drivers/webp/enc/histogram.c
index abd253bd7c..ca838e064d 100644
--- a/drivers/webp/enc/histogram.c
+++ b/drivers/webp/enc/histogram.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -57,9 +55,9 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   int i;
   VP8LHistogramSet* set;
   VP8LHistogram* bulk;
-  const uint64_t total_size = sizeof(*set)
-                            + (uint64_t)size * sizeof(*set->histograms)
-                            + (uint64_t)size * sizeof(**set->histograms);
+  const uint64_t total_size = (uint64_t)sizeof(*set)
+                            + size * sizeof(*set->histograms)
+                            + size * sizeof(**set->histograms);
   uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
   if (memory == NULL) return NULL;
 
@@ -90,14 +88,18 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
     int literal_ix = 256 + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
     ++histo->literal_[literal_ix];
   } else {
-    int code, extra_bits;
-    VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
+    int code, extra_bits_count, extra_bits_value;
+    PrefixEncode(PixOrCopyLength(v),
+                 &code, &extra_bits_count, &extra_bits_value);
     ++histo->literal_[256 + code];
-    VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    PrefixEncode(PixOrCopyDistance(v),
+                 &code, &extra_bits_count, &extra_bits_value);
     ++histo->distance_[code];
   }
 }
 
+
+
 static double BitsEntropy(const int* const array, int n) {
   double retval = 0.;
   int sum = 0;
@@ -147,6 +149,25 @@ static double BitsEntropy(const int* const array, int n) {
   }
 }
 
+double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
+  double retval = BitsEntropy(&p->literal_[0], VP8LHistogramNumCodes(p))
+                + BitsEntropy(&p->red_[0], 256)
+                + BitsEntropy(&p->blue_[0], 256)
+                + BitsEntropy(&p->alpha_[0], 256)
+                + BitsEntropy(&p->distance_[0], NUM_DISTANCE_CODES);
+  // Compute the extra bits cost.
+  int i;
+  for (i = 2; i < NUM_LENGTH_CODES - 2; ++i) {
+    retval +=
+        (i >> 1) * p->literal_[256 + i + 2];
+  }
+  for (i = 2; i < NUM_DISTANCE_CODES - 2; ++i) {
+    retval += (i >> 1) * p->distance_[i + 2];
+  }
+  return retval;
+}
+
+
 // Returns the cost encode the rle-encoded entropy code.
 // The constants in this function are experimental.
 static double HuffmanCost(const int* const population, int length) {
@@ -186,150 +207,19 @@ static double HuffmanCost(const int* const population, int length) {
   return retval;
 }
 
-static double PopulationCost(const int* const population, int length) {
-  return BitsEntropy(population, length) + HuffmanCost(population, length);
-}
-
-static double ExtraCost(const int* const population, int length) {
-  int i;
-  double cost = 0.;
-  for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
-  return cost;
+// Estimates the Huffman dictionary + other block overhead size.
+static double HistogramEstimateBitsHeader(const VP8LHistogram* const p) {
+  return HuffmanCost(&p->alpha_[0], 256) +
+         HuffmanCost(&p->red_[0], 256) +
+         HuffmanCost(&p->literal_[0], VP8LHistogramNumCodes(p)) +
+         HuffmanCost(&p->blue_[0], 256) +
+         HuffmanCost(&p->distance_[0], NUM_DISTANCE_CODES);
 }
 
-// Estimates the Entropy + Huffman + other block overhead size cost.
 double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
-  return PopulationCost(p->literal_, VP8LHistogramNumCodes(p))
-       + PopulationCost(p->red_, 256)
-       + PopulationCost(p->blue_, 256)
-       + PopulationCost(p->alpha_, 256)
-       + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
-       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
-       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
-}
-
-double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
-  return BitsEntropy(p->literal_, VP8LHistogramNumCodes(p))
-       + BitsEntropy(p->red_, 256)
-       + BitsEntropy(p->blue_, 256)
-       + BitsEntropy(p->alpha_, 256)
-       + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
-       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
-       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
-}
-
-// -----------------------------------------------------------------------------
-// Various histogram combine/cost-eval functions
-
-// Adds 'in' histogram to 'out'
-static void HistogramAdd(const VP8LHistogram* const in,
-                         VP8LHistogram* const out) {
-  int i;
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    out->literal_[i] += in->literal_[i];
-  }
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    out->distance_[i] += in->distance_[i];
-  }
-  for (i = 0; i < 256; ++i) {
-    out->red_[i] += in->red_[i];
-    out->blue_[i] += in->blue_[i];
-    out->alpha_[i] += in->alpha_[i];
-  }
-}
-
-// Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
-// to the threshold value 'cost_threshold'. The score returned is
-//  Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
-// Since the previous score passed is 'cost_threshold', we only need to compare
-// the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
-// early.
-static double HistogramAddEval(const VP8LHistogram* const a,
-                               const VP8LHistogram* const b,
-                               VP8LHistogram* const out,
-                               double cost_threshold) {
-  double cost = 0;
-  const double sum_cost = a->bit_cost_ + b->bit_cost_;
-  int i;
-
-  cost_threshold += sum_cost;
-
-  // palette_code_bits_ is part of the cost evaluation for literal_.
-  // TODO(skal): remove/simplify this palette_code_bits_?
-  out->palette_code_bits_ =
-      (a->palette_code_bits_ > b->palette_code_bits_) ? a->palette_code_bits_ :
-                                                        b->palette_code_bits_;
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    out->literal_[i] = a->literal_[i] + b->literal_[i];
-  }
-  cost += PopulationCost(out->literal_, VP8LHistogramNumCodes(out));
-  cost += ExtraCost(out->literal_ + 256, NUM_LENGTH_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) out->red_[i] = a->red_[i] + b->red_[i];
-  cost += PopulationCost(out->red_, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) out->blue_[i] = a->blue_[i] + b->blue_[i];
-  cost += PopulationCost(out->blue_, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    out->distance_[i] = a->distance_[i] + b->distance_[i];
-  }
-  cost += PopulationCost(out->distance_, NUM_DISTANCE_CODES);
-  cost += ExtraCost(out->distance_, NUM_DISTANCE_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
-  cost += PopulationCost(out->alpha_, 256);
-
-  out->bit_cost_ = cost;
-  return cost - sum_cost;
+  return HistogramEstimateBitsHeader(p) + VP8LHistogramEstimateBitsBulk(p);
 }
 
-// Same as HistogramAddEval(), except that the resulting histogram
-// is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
-// the term C(b) which is constant over all the evaluations.
-static double HistogramAddThresh(const VP8LHistogram* const a,
-                                 const VP8LHistogram* const b,
-                                 double cost_threshold) {
-  int tmp[PIX_OR_COPY_CODES_MAX];  // <= max storage we'll need
-  int i;
-  double cost = -a->bit_cost_;
-
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    tmp[i] = a->literal_[i] + b->literal_[i];
-  }
-  // note that the tests are ordered so that the usually largest
-  // cost shares come first.
-  cost += PopulationCost(tmp, VP8LHistogramNumCodes(a));
-  cost += ExtraCost(tmp + 256, NUM_LENGTH_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) tmp[i] = a->red_[i] + b->red_[i];
-  cost += PopulationCost(tmp, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) tmp[i] = a->blue_[i] + b->blue_[i];
-  cost += PopulationCost(tmp, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    tmp[i] = a->distance_[i] + b->distance_[i];
-  }
-  cost += PopulationCost(tmp, NUM_DISTANCE_CODES);
-  cost += ExtraCost(tmp, NUM_DISTANCE_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) tmp[i] = a->alpha_[i] + b->alpha_[i];
-  cost += PopulationCost(tmp, 256);
-
-  return cost;
-}
-
-// -----------------------------------------------------------------------------
-
 static void HistogramBuildImage(int xsize, int histo_bits,
                                 const VP8LBackwardRefs* const backward_refs,
                                 VP8LHistogramSet* const image) {
@@ -359,15 +249,14 @@ static uint32_t MyRand(uint32_t *seed) {
 }
 
 static int HistogramCombine(const VP8LHistogramSet* const in,
-                            VP8LHistogramSet* const out, int iter_mult,
-                            int num_pairs, int num_tries_no_success) {
+                            VP8LHistogramSet* const out, int num_pairs) {
   int ok = 0;
   int i, iter;
   uint32_t seed = 0;
   int tries_with_no_success = 0;
-  int out_size = in->size;
-  const int outer_iters = in->size * iter_mult;
   const int min_cluster_size = 2;
+  int out_size = in->size;
+  const int outer_iters = in->size * 3;
   VP8LHistogram* const histos = (VP8LHistogram*)malloc(2 * sizeof(*histos));
   VP8LHistogram* cur_combo = histos + 0;    // trial merged histogram
   VP8LHistogram* best_combo = histos + 1;   // best merged histogram so far
@@ -382,26 +271,29 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
 
   // Collapse similar histograms in 'out'.
   for (iter = 0; iter < outer_iters && out_size >= min_cluster_size; ++iter) {
+    // We pick the best pair to be combined out of 'num_pairs' candidate pairs.
     double best_cost_diff = 0.;
-    int best_idx1 = -1, best_idx2 = 1;
+    int best_idx1 = 0, best_idx2 = 1;
     int j;
-    const int num_tries = (num_pairs < out_size) ? num_pairs : out_size;
     seed += iter;
-    for (j = 0; j < num_tries; ++j) {
+    for (j = 0; j < num_pairs; ++j) {
       double curr_cost_diff;
       // Choose two histograms at random and try to combine them.
       const uint32_t idx1 = MyRand(&seed) % out_size;
-      const uint32_t tmp = (j & 7) + 1;
+      const uint32_t tmp = ((j & 7) + 1) % (out_size - 1);
       const uint32_t diff = (tmp < 3) ? tmp : MyRand(&seed) % (out_size - 1);
       const uint32_t idx2 = (idx1 + diff + 1) % out_size;
       if (idx1 == idx2) {
         continue;
       }
+      *cur_combo = *out->histograms[idx1];
+      VP8LHistogramAdd(cur_combo, out->histograms[idx2]);
+      cur_combo->bit_cost_ = VP8LHistogramEstimateBits(cur_combo);
       // Calculate cost reduction on combining.
-      curr_cost_diff = HistogramAddEval(out->histograms[idx1],
-                                        out->histograms[idx2],
-                                        cur_combo, best_cost_diff);
-      if (curr_cost_diff < best_cost_diff) {    // found a better pair?
+      curr_cost_diff = cur_combo->bit_cost_
+                     - out->histograms[idx1]->bit_cost_
+                     - out->histograms[idx2]->bit_cost_;
+      if (best_cost_diff > curr_cost_diff) {    // found a better pair?
         {     // swap cur/best combo histograms
           VP8LHistogram* const tmp_histo = cur_combo;
           cur_combo = best_combo;
@@ -413,7 +305,7 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
       }
     }
 
-    if (best_idx1 >= 0) {
+    if (best_cost_diff < 0.0) {
       *out->histograms[best_idx1] = *best_combo;
       // swap best_idx2 slot with last one (which is now unused)
       --out_size;
@@ -423,7 +315,7 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
       }
       tries_with_no_success = 0;
     }
-    if (++tries_with_no_success >= num_tries_no_success) {
+    if (++tries_with_no_success >= 50) {
       break;
     }
   }
@@ -438,11 +330,20 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
 // -----------------------------------------------------------------------------
 // Histogram refinement
 
-// What is the bit cost of moving square_histogram from cur_symbol to candidate.
+// What is the bit cost of moving square_histogram from
+// cur_symbol to candidate_symbol.
+// TODO(skal): we don't really need to copy the histogram and Add(). Instead
+// we just need VP8LDualHistogramEstimateBits(A, B) estimation function.
 static double HistogramDistance(const VP8LHistogram* const square_histogram,
-                                const VP8LHistogram* const candidate,
-                                double cost_threshold) {
-  return HistogramAddThresh(candidate, square_histogram, cost_threshold);
+                                const VP8LHistogram* const candidate) {
+  const double previous_bit_cost = candidate->bit_cost_;
+  double new_bit_cost;
+  VP8LHistogram modified_histo;
+  modified_histo = *candidate;
+  VP8LHistogramAdd(&modified_histo, square_histogram);
+  new_bit_cost = VP8LHistogramEstimateBits(&modified_histo);
+
+  return new_bit_cost - previous_bit_cost;
 }
 
 // Find the best 'out' histogram for each of the 'in' histograms.
@@ -453,12 +354,11 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
   int i;
   for (i = 0; i < in->size; ++i) {
     int best_out = 0;
-    double best_bits =
-        HistogramDistance(in->histograms[i], out->histograms[0], 1.e38);
+    double best_bits = HistogramDistance(in->histograms[i], out->histograms[0]);
     int k;
     for (k = 1; k < out->size; ++k) {
       const double cur_bits =
-          HistogramDistance(in->histograms[i], out->histograms[k], best_bits);
+          HistogramDistance(in->histograms[i], out->histograms[k]);
       if (cur_bits < best_bits) {
         best_bits = cur_bits;
         best_out = k;
@@ -472,7 +372,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
     HistogramClear(out->histograms[i]);
   }
   for (i = 0; i < in->size; ++i) {
-    HistogramAdd(in->histograms[i], out->histograms[symbols[i]]);
+    VP8LHistogramAdd(out->histograms[symbols[i]], in->histograms[i]);
   }
 }
 
@@ -484,13 +384,8 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   int ok = 0;
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
   const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
+  const int num_histo_pairs = 10 + quality / 2;  // For HistogramCombine().
   const int histo_image_raw_size = histo_xsize * histo_ysize;
-
-  // Heuristic params for HistogramCombine().
-  const int num_tries_no_success = 8 + (quality >> 1);
-  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
-  const int num_pairs = (quality < 25) ? 10 : (5 * quality) >> 3;
-
   VP8LHistogramSet* const image_out =
       VP8LAllocateHistogramSet(histo_image_raw_size, cache_bits);
   if (image_out == NULL) return 0;
@@ -498,8 +393,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   // Build histogram image.
   HistogramBuildImage(xsize, histo_bits, refs, image_out);
   // Collapse similar histograms.
-  if (!HistogramCombine(image_out, image_in, iter_mult, num_pairs,
-                        num_tries_no_success)) {
+  if (!HistogramCombine(image_out, image_in, num_histo_pairs)) {
     goto Error;
   }
   // Find the optimal map from original histograms to the final ones.
diff --git a/drivers/webp/enc/histogram.h b/drivers/webp/enc/histogram.h
index 4d346a857b..ec573c5c85 100644
--- a/drivers/webp/enc/histogram.h
+++ b/drivers/webp/enc/histogram.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@@ -24,7 +22,7 @@
 #include "../webp/format_constants.h"
 #include "../webp/types.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -82,6 +80,22 @@ double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
 // represent the entropy code itself.
 double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);
 
+static WEBP_INLINE void VP8LHistogramAdd(VP8LHistogram* const p,
+                                         const VP8LHistogram* const a) {
+  int i;
+  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
+    p->literal_[i] += a->literal_[i];
+  }
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    p->distance_[i] += a->distance_[i];
+  }
+  for (i = 0; i < 256; ++i) {
+    p->red_[i] += a->red_[i];
+    p->blue_[i] += a->blue_[i];
+    p->alpha_[i] += a->alpha_[i];
+  }
+}
+
 static WEBP_INLINE int VP8LHistogramNumCodes(const VP8LHistogram* const p) {
   return 256 + NUM_LENGTH_CODES +
       ((p->palette_code_bits_ > 0) ? (1 << p->palette_code_bits_) : 0);
@@ -94,7 +108,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              VP8LHistogramSet* const image_in,
                              uint16_t* const histogram_symbols);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
 
diff --git a/drivers/webp/enc/iterator.c b/drivers/webp/enc/iterator.c
index e42ad001ac..86e473bcf0 100644
--- a/drivers/webp/enc/iterator.c
+++ b/drivers/webp/enc/iterator.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // VP8Iterator: block iterator
@@ -15,16 +13,21 @@
 
 #include "./vp8enci.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // VP8Iterator
 //------------------------------------------------------------------------------
 
 static void InitLeft(VP8EncIterator* const it) {
-  it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] =
+  const VP8Encoder* const enc = it->enc_;
+  enc->y_left_[-1] = enc->u_left_[-1] = enc->v_left_[-1] =
       (it->y_ > 0) ? 129 : 127;
-  memset(it->y_left_, 129, 16);
-  memset(it->u_left_, 129, 8);
-  memset(it->v_left_, 129, 8);
+  memset(enc->y_left_, 129, 16);
+  memset(enc->u_left_, 129, 8);
+  memset(enc->v_left_, 129, 8);
   it->left_nz_[8] = 0;
 }
 
@@ -35,60 +38,43 @@ static void InitTop(VP8EncIterator* const it) {
   memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
 }
 
-void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
+void VP8IteratorReset(VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
   it->x_ = 0;
-  it->y_ = y;
-  it->bw_ = &enc->parts_[y & (enc->num_parts_ - 1)];
-  it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
+  it->y_ = 0;
+  it->y_offset_ = 0;
+  it->uv_offset_ = 0;
+  it->mb_ = enc->mb_info_;
+  it->preds_ = enc->preds_;
   it->nz_ = enc->nz_;
-  it->mb_ = enc->mb_info_ + y * enc->mb_w_;
-  it->y_top_ = enc->y_top_;
-  it->uv_top_ = enc->uv_top_;
-  InitLeft(it);
-}
-
-void VP8IteratorReset(VP8EncIterator* const it) {
-  VP8Encoder* const enc = it->enc_;
-  VP8IteratorSetRow(it, 0);
-  VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
+  it->bw_ = &enc->parts_[0];
+  it->done_ = enc->mb_w_* enc->mb_h_;
   InitTop(it);
   InitLeft(it);
   memset(it->bit_count_, 0, sizeof(it->bit_count_));
   it->do_trellis_ = 0;
 }
 
-void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
-  it->count_down_ = it->count_down0_ = count_down;
-}
-
-int VP8IteratorIsDone(const VP8EncIterator* const it) {
-  return (it->count_down_ <= 0);
-}
-
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->enc_ = enc;
   it->y_stride_  = enc->pic_->y_stride;
   it->uv_stride_ = enc->pic_->uv_stride;
-  it->yuv_in_   = (uint8_t*)DO_ALIGN(it->yuv_mem_);
-  it->yuv_out_  = it->yuv_in_ + YUV_SIZE;
-  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE;
-  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE;
+  // TODO(later): for multithreading, these should be owned by 'it'.
+  it->yuv_in_   = enc->yuv_in_;
+  it->yuv_out_  = enc->yuv_out_;
+  it->yuv_out2_ = enc->yuv_out2_;
+  it->yuv_p_    = enc->yuv_p_;
   it->lf_stats_ = enc->lf_stats_;
   it->percent0_ = enc->percent_;
-  it->y_left_ = (uint8_t*)DO_ALIGN(it->yuv_left_mem_ + 1);
-  it->u_left_ = it->y_left_ + 16 + 16;
-  it->v_left_ = it->u_left_ + 16;
   VP8IteratorReset(it);
 }
 
 int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
   VP8Encoder* const enc = it->enc_;
-  if (delta && enc->pic_->progress_hook != NULL) {
-    const int done = it->count_down0_ - it->count_down_;
-    const int percent = (it->count_down0_ <= 0)
+  if (delta && enc->pic_->progress_hook) {
+    const int percent = (enc->mb_h_ <= 1)
                       ? it->percent0_
-                      : it->percent0_ + delta * done / it->count_down0_;
+                      : it->percent0_ + delta * it->y_ / (enc->mb_h_ - 1);
     return WebPReportProgress(enc->pic_, percent, &enc->percent_);
   }
   return 1;
@@ -98,8 +84,6 @@ int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
 // Import the source samples into the cache. Takes care of replicating
 // boundary pixels if necessary.
 
-static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; }
-
 static void ImportBlock(const uint8_t* src, int src_stride,
                         uint8_t* dst, int w, int h, int size) {
   int i;
@@ -117,55 +101,30 @@ static void ImportBlock(const uint8_t* src, int src_stride,
   }
 }
 
-static void ImportLine(const uint8_t* src, int src_stride,
-                       uint8_t* dst, int len, int total_len) {
-  int i;
-  for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src;
-  for (; i < total_len; ++i) dst[i] = dst[len - 1];
-}
-
-void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) {
+void VP8IteratorImport(const VP8EncIterator* const it) {
   const VP8Encoder* const enc = it->enc_;
   const int x = it->x_, y = it->y_;
   const WebPPicture* const pic = enc->pic_;
-  const uint8_t* const ysrc = pic->y + (y * pic->y_stride  + x) * 16;
+  const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
   const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
   const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
-  const int w = MinSize(pic->width - x * 16, 16);
-  const int h = MinSize(pic->height - y * 16, 16);
-  const int uv_w = (w + 1) >> 1;
-  const int uv_h = (h + 1) >> 1;
-
-  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF, w, h, 16);
-  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF, uv_w, uv_h, 8);
-  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF, uv_w, uv_h, 8);
-
-  if (tmp_32 == NULL) return;
-
-  // Import source (uncompressed) samples into boundary.
-  if (x == 0) {
-    InitLeft(it);
-  } else {
-    if (y == 0) {
-      it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
-    } else {
-      it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
-      it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
-      it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
-    }
-    ImportLine(ysrc - 1, pic->y_stride,  it->y_left_, h,   16);
-    ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
-    ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
-  }
-
-  it->y_top_  = tmp_32 + 0;
-  it->uv_top_ = tmp_32 + 16;
-  if (y == 0) {
-    memset(tmp_32, 127, 32 * sizeof(*tmp_32));
-  } else {
-    ImportLine(ysrc - pic->y_stride,  1, tmp_32,          w,   16);
-    ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16,     uv_w, 8);
-    ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
+  uint8_t* const ydst = it->yuv_in_ + Y_OFF;
+  uint8_t* const udst = it->yuv_in_ + U_OFF;
+  uint8_t* const vdst = it->yuv_in_ + V_OFF;
+  int w = (pic->width - x * 16);
+  int h = (pic->height - y * 16);
+
+  if (w > 16) w = 16;
+  if (h > 16) h = 16;
+
+  // Luma plane
+  ImportBlock(ysrc, pic->y_stride, ydst, w, h, 16);
+
+  {   // U/V planes
+    const int uv_w = (w + 1) >> 1;
+    const int uv_h = (h + 1) >> 1;
+    ImportBlock(usrc, pic->uv_stride, udst, uv_w, uv_h, 8);
+    ImportBlock(vsrc, pic->uv_stride, vdst, uv_w, uv_h, 8);
   }
 }
 
@@ -281,44 +240,48 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
 #undef BIT
 
 //------------------------------------------------------------------------------
-// Advance to the next position, doing the bookkeeping.
+// Advance to the next position, doing the bookkeeping.
 
-void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
+int VP8IteratorNext(VP8EncIterator* const it,
+                    const uint8_t* const block_to_save) {
   VP8Encoder* const enc = it->enc_;
-  const int x = it->x_, y = it->y_;
-  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF;
-  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF;
-  if (x < enc->mb_w_ - 1) {   // left
-    int i;
-    for (i = 0; i < 16; ++i) {
-      it->y_left_[i] = ysrc[15 + i * BPS];
+  if (block_to_save) {
+    const int x = it->x_, y = it->y_;
+    const uint8_t* const ysrc = block_to_save + Y_OFF;
+    const uint8_t* const usrc = block_to_save + U_OFF;
+    if (x < enc->mb_w_ - 1) {   // left
+      int i;
+      for (i = 0; i < 16; ++i) {
+        enc->y_left_[i] = ysrc[15 + i * BPS];
+      }
+      for (i = 0; i < 8; ++i) {
+        enc->u_left_[i] = usrc[7 + i * BPS];
+        enc->v_left_[i] = usrc[15 + i * BPS];
+      }
+      // top-left (before 'top'!)
+      enc->y_left_[-1] = enc->y_top_[x * 16 + 15];
+      enc->u_left_[-1] = enc->uv_top_[x * 16 + 0 + 7];
+      enc->v_left_[-1] = enc->uv_top_[x * 16 + 8 + 7];
     }
-    for (i = 0; i < 8; ++i) {
-      it->u_left_[i] = uvsrc[7 + i * BPS];
-      it->v_left_[i] = uvsrc[15 + i * BPS];
+    if (y < enc->mb_h_ - 1) {  // top
+      memcpy(enc->y_top_ + x * 16, ysrc + 15 * BPS, 16);
+      memcpy(enc->uv_top_ + x * 16, usrc + 7 * BPS, 8 + 8);
     }
-    // top-left (before 'top'!)
-    it->y_left_[-1] = it->y_top_[15];
-    it->u_left_[-1] = it->uv_top_[0 + 7];
-    it->v_left_[-1] = it->uv_top_[8 + 7];
   }
-  if (y < enc->mb_h_ - 1) {  // top
-    memcpy(it->y_top_, ysrc + 15 * BPS, 16);
-    memcpy(it->uv_top_, uvsrc + 7 * BPS, 8 + 8);
-  }
-}
 
-int VP8IteratorNext(VP8EncIterator* const it) {
+  it->mb_++;
   it->preds_ += 4;
-  it->mb_ += 1;
-  it->nz_ += 1;
-  it->y_top_ += 16;
-  it->uv_top_ += 16;
-  it->x_ += 1;
-  if (it->x_ == it->enc_->mb_w_) {
-    VP8IteratorSetRow(it, ++it->y_);
+  it->nz_++;
+  it->x_++;
+  if (it->x_ == enc->mb_w_) {
+    it->x_ = 0;
+    it->y_++;
+    it->bw_ = &enc->parts_[it->y_ & (enc->num_parts_ - 1)];
+    it->preds_ = enc->preds_ + it->y_ * 4 * enc->preds_w_;
+    it->nz_ = enc->nz_;
+    InitLeft(it);
   }
-  return (0 < --it->count_down_);
+  return (0 < --it->done_);
 }
 
 //------------------------------------------------------------------------------
@@ -405,15 +368,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {
 
   // Import the boundary samples
   for (i = 0; i < 17; ++i) {    // left
-    it->i4_boundary_[i] = it->y_left_[15 - i];
+    it->i4_boundary_[i] = enc->y_left_[15 - i];
   }
   for (i = 0; i < 16; ++i) {    // top
-    it->i4_boundary_[17 + i] = it->y_top_[i];
+    it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
   }
   // top-right samples have a special case on the far right of the picture
   if (it->x_ < enc->mb_w_ - 1) {
     for (i = 16; i < 16 + 4; ++i) {
-      it->i4_boundary_[17 + i] = it->y_top_[i];
+      it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
     }
   } else {    // else, replicate the last valid pixel four times
     for (i = 16; i < 16 + 4; ++i) {
@@ -454,3 +417,6 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/layer.c b/drivers/webp/enc/layer.c
index 2402362359..423127df63 100644
--- a/drivers/webp/enc/layer.c
+++ b/drivers/webp/enc/layer.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Enhancement layer (for YUV444/422)
@@ -15,6 +13,10 @@
 
 #include "./vp8enci.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 
 void VP8EncInitLayer(VP8Encoder* const enc) {
@@ -42,3 +44,6 @@ void VP8EncDeleteLayer(VP8Encoder* enc) {
   free(enc->layer_data_);
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/picture.c b/drivers/webp/enc/picture.c
index 011690d065..44eed06083 100644
--- a/drivers/webp/enc/picture.c
+++ b/drivers/webp/enc/picture.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // WebPPicture utils: colorspace conversion, crop, ...
@@ -16,15 +14,14 @@
 #include <math.h>
 
 #include "./vp8enci.h"
-#include "../utils/alpha_processing.h"
-#include "../utils/random.h"
 #include "../utils/rescaler.h"
 #include "../utils/utils.h"
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"
 
-// Uncomment to disable gamma-compression during RGB->U/V averaging
-#define USE_GAMMA_COMPRESSION
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 #define HALVE(x) (((x) + 1) >> 1)
 #define IS_YUV_CSP(csp, YUV_CSP) (((csp) & WEBP_CSP_UV_MASK) == (YUV_CSP))
@@ -35,10 +32,6 @@ static const union {
 } test_endian = { 0xff000000u };
 #define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
 
-static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
-  return (0xff000000u | (r << 16) | (g << 8) | b);
-}
-
 //------------------------------------------------------------------------------
 // WebPPicture
 //------------------------------------------------------------------------------
@@ -123,7 +116,6 @@ int WebPPictureAlloc(WebPPicture* picture) {
         picture->v0 = mem;
         mem += uv0_size;
       }
-      (void)mem;  // makes the static analyzer happy
     } else {
       void* memory;
       const uint64_t argb_size = (uint64_t)width * height;
@@ -298,11 +290,8 @@ int WebPPictureView(const WebPPicture* src,
     dst->y = src->y + top * src->y_stride + left;
     dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
     dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
-    dst->y_stride = src->y_stride;
-    dst->uv_stride = src->uv_stride;
     if (src->a != NULL) {
       dst->a = src->a + top * src->a_stride + left;
-      dst->a_stride = src->a_stride;
     }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
     if (src->u0 != NULL) {
@@ -310,12 +299,10 @@ int WebPPictureView(const WebPPicture* src,
           IS_YUV_CSP(dst->colorspace, WEBP_YUV422) ? (left >> 1) : left;
       dst->u0 = src->u0 + top * src->uv0_stride + left_pos;
       dst->v0 = src->v0 + top * src->uv0_stride + left_pos;
-      dst->uv0_stride = src->uv0_stride;
     }
 #endif
   } else {
     dst->argb = src->argb + top * src->argb_stride + left;
-    dst->argb_stride = src->argb_stride;
   }
   return 1;
 }
@@ -401,28 +388,6 @@ static void RescalePlane(const uint8_t* src,
   }
 }
 
-static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
-  uint32_t* ptr = pic->argb;
-  int y;
-  for (y = 0; y < pic->height; ++y) {
-    WebPMultARGBRow(ptr, pic->width, inverse);
-    ptr += pic->argb_stride;
-  }
-}
-
-static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
-  const uint8_t* ptr_a = pic->a;
-  if (ptr_a != NULL) {
-    uint8_t* ptr_y = pic->y;
-    int y;
-    for (y = 0; y < pic->height; ++y) {
-      WebPMultRow(ptr_y, ptr_a, pic->width, inverse);
-      ptr_y += pic->y_stride;
-      ptr_a += pic->a_stride;
-    }
-  }
-}
-
 int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   WebPPicture tmp;
   int prev_width, prev_height;
@@ -453,19 +418,9 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
       WebPPictureFree(&tmp);
       return 0;
     }
-    // If present, we need to rescale alpha first (for AlphaMultiplyY).
-    if (pic->a != NULL) {
-      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
-                   tmp.a, width, height, tmp.a_stride, work, 1);
-    }
 
-    // We take transparency into account on the luma plane only. That's not
-    // totally exact blending, but still is a good approximation.
-    AlphaMultiplyY(pic, 0);
     RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
                  tmp.y, width, height, tmp.y_stride, work, 1);
-    AlphaMultiplyY(&tmp, 1);
-
     RescalePlane(pic->u,
                  HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
                  tmp.u,
@@ -475,6 +430,10 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
                  tmp.v,
                  HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
 
+    if (tmp.a != NULL) {
+      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+                   tmp.a, width, height, tmp.a_stride, work, 1);
+    }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
     if (tmp.u0 != NULL) {
       const int s = IS_YUV_CSP(tmp.colorspace, WEBP_YUV422) ? 2 : 1;
@@ -492,16 +451,13 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
       WebPPictureFree(&tmp);
       return 0;
     }
-    // In order to correctly interpolate colors, we need to apply the alpha
-    // weighting first (black-matting), scale the RGB values, and remove
-    // the premultiplication afterward (while preserving the alpha channel).
-    AlphaMultiplyARGB(pic, 0);
+
     RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
                  pic->argb_stride * 4,
                  (uint8_t*)tmp.argb, width, height,
                  tmp.argb_stride * 4,
                  work, 4);
-    AlphaMultiplyARGB(&tmp, 1);
+
   }
   WebPPictureFree(pic);
   free(work);
@@ -590,101 +546,20 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
 
-static int RGBToY(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
-}
-
-static int RGBToU(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-static int RGBToV(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-//------------------------------------------------------------------------------
-
-#if defined(USE_GAMMA_COMPRESSION)
-
-// gamma-compensates loss of resolution during chroma subsampling
-#define kGamma 0.80
-#define kGammaFix 12     // fixed-point precision for linear values
-#define kGammaScale ((1 << kGammaFix) - 1)
-#define kGammaTabFix 7   // fixed-point fractional bits precision
-#define kGammaTabScale (1 << kGammaTabFix)
-#define kGammaTabRounder (kGammaTabScale >> 1)
-#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
-
-static int kLinearToGammaTab[kGammaTabSize + 1];
-static uint16_t kGammaToLinearTab[256];
-static int kGammaTablesOk = 0;
-
-static void InitGammaTables(void) {
-  if (!kGammaTablesOk) {
-    int v;
-    const double scale = 1. / kGammaScale;
-    for (v = 0; v <= 255; ++v) {
-      kGammaToLinearTab[v] =
-          (uint16_t)(pow(v / 255., kGamma) * kGammaScale + .5);
-    }
-    for (v = 0; v <= kGammaTabSize; ++v) {
-      const double x = scale * (v << kGammaTabFix);
-      kLinearToGammaTab[v] = (int)(pow(x, 1. / kGamma) * 255. + .5);
-    }
-    kGammaTablesOk = 1;
-  }
-}
-
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
-  return kGammaToLinearTab[v];
-}
-
-// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
-// U/V value, suitable for RGBToU/V calls.
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  const int v = base_value << shift;              // final uplifted value
-  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
-  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
-  const int v0 = kLinearToGammaTab[tab_pos];
-  const int v1 = kLinearToGammaTab[tab_pos + 1];
-  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
-  return (y + kGammaTabRounder) >> kGammaTabFix;             // descale
-}
-
-#else
-
-static void InitGammaTables(void) {}
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  (void)shift;
-  return v;
-}
-
-#endif    // USE_GAMMA_COMPRESSION
-
-//------------------------------------------------------------------------------
-
-#define SUM4(ptr) LinearToGamma(                         \
-    GammaToLinear((ptr)[0]) +                            \
-    GammaToLinear((ptr)[step]) +                         \
-    GammaToLinear((ptr)[rgb_stride]) +                   \
-    GammaToLinear((ptr)[rgb_stride + step]), 0)          \
-
-#define SUM2H(ptr) \
-    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[step]), 1)
-#define SUM2V(ptr) \
-    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
-#define SUM1(ptr)  \
-    LinearToGamma(GammaToLinear((ptr)[0]), 2)
-
+// TODO: we can do better than simply 2x2 averaging on U/V samples.
+#define SUM4(ptr) ((ptr)[0] + (ptr)[step] + \
+                   (ptr)[rgb_stride] + (ptr)[rgb_stride + step])
+#define SUM2H(ptr) (2 * (ptr)[0] + 2 * (ptr)[step])
+#define SUM2V(ptr) (2 * (ptr)[0] + 2 * (ptr)[rgb_stride])
+#define SUM1(ptr)  (4 * (ptr)[0])
 #define RGB_TO_UV(x, y, SUM) {                           \
   const int src = (2 * (step * (x) + (y) * rgb_stride)); \
   const int dst = (x) + (y) * picture->uv_stride;        \
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u[dst] = RGBToU(r, g, b, &rg);                \
-  picture->v[dst] = RGBToV(r, g, b, &rg);                \
+  picture->u[dst] = VP8RGBToU(r, g, b);                  \
+  picture->v[dst] = VP8RGBToV(r, g, b);                  \
 }
 
 #define RGB_TO_UV0(x_in, x_out, y, SUM) {                \
@@ -693,8 +568,8 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u0[dst] = RGBToU(r, g, b, &rg);               \
-  picture->v0[dst] = RGBToV(r, g, b, &rg);               \
+  picture->u0[dst] = VP8RGBToU(r, g, b);                 \
+  picture->v0[dst] = VP8RGBToV(r, g, b);                 \
 }
 
 static void MakeGray(WebPPicture* const picture) {
@@ -713,14 +588,12 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
                               const uint8_t* const a_ptr,
                               int step,         // bytes per pixel
                               int rgb_stride,   // bytes per scanline
-                              float dithering,
                               WebPPicture* const picture) {
   const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
   int x, y;
   const int width = picture->width;
   const int height = picture->height;
   const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
-  VP8Random rg;
 
   picture->colorspace = uv_csp;
   picture->use_argb = 0;
@@ -729,15 +602,12 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
   }
   if (!WebPPictureAlloc(picture)) return 0;
 
-  VP8InitRandom(&rg, dithering);
-  InitGammaTables();
-
   // Import luma plane
   for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
       const int offset = step * x + y * rgb_stride;
       picture->y[x + y * picture->y_stride] =
-          RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset], &rg);
+          VP8RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
     }
   }
 
@@ -785,7 +655,6 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
 
   if (has_alpha) {
     assert(step >= 4);
-    assert(picture->a != NULL);
     for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         picture->a[x + y * picture->a_stride] =
@@ -808,7 +677,7 @@ static int Import(WebPPicture* const picture,
 
   if (!picture->use_argb) {
     return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
-                              0.f /* no dithering */, picture);
+                              picture);
   }
   if (import_alpha) {
     picture->colorspace |= WEBP_CSP_ALPHA_BIT;
@@ -823,7 +692,10 @@ static int Import(WebPPicture* const picture,
       for (x = 0; x < width; ++x) {
         const int offset = step * x + y * rgb_stride;
         const uint32_t argb =
-            MakeARGB32(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+            0xff000000u |
+            (r_ptr[offset] << 16) |
+            (g_ptr[offset] <<  8) |
+            (b_ptr[offset]);
         picture->argb[x + y * picture->argb_stride] = argb;
       }
     }
@@ -833,7 +705,7 @@ static int Import(WebPPicture* const picture,
     for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         const int offset = step * x + y * rgb_stride;
-        const uint32_t argb = ((uint32_t)a_ptr[offset] << 24) |
+        const uint32_t argb = (a_ptr[offset] << 24) |
                               (r_ptr[offset] << 16) |
                               (g_ptr[offset] <<  8) |
                               (b_ptr[offset]);
@@ -884,7 +756,8 @@ int WebPPictureImportBGRX(WebPPicture* picture,
 
 int WebPPictureYUVAToARGB(WebPPicture* picture) {
   if (picture == NULL) return 0;
-  if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
+  if (picture->memory_ == NULL || picture->y == NULL ||
+      picture->u == NULL || picture->v == NULL) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
   }
   if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
@@ -907,7 +780,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
 
     // First row, with replicated top samples.
-    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, width);
     cur_y += picture->y_stride;
     dst += argb_stride;
     // Center rows.
@@ -928,11 +801,11 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     // Insert alpha values if needed, in replacement for the default 0xff ones.
     if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
       for (y = 0; y < height; ++y) {
-        uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
+        uint32_t* const dst = picture->argb + y * picture->argb_stride;
         const uint8_t* const src = picture->a + y * picture->a_stride;
         int x;
         for (x = 0; x < width; ++x) {
-          argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);
+          dst[x] = (dst[x] & 0x00ffffffu) | (src[x] << 24);
         }
       }
     }
@@ -940,8 +813,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
   return 1;
 }
 
-int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
-                                  float dithering) {
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
   if (picture == NULL) return 0;
   if (picture->argb == NULL) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
@@ -957,8 +829,7 @@ int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
     PictureResetARGB(&tmp);  // reset ARGB buffer so that it's not free()'d.
     tmp.use_argb = 0;
     tmp.colorspace = colorspace & WEBP_CSP_UV_MASK;
-    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, dithering,
-                            &tmp)) {
+    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, &tmp)) {
       return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     }
     // Copy back the YUV specs into 'picture'.
@@ -970,10 +841,6 @@ int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
   return 1;
 }
 
-int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
-  return WebPPictureARGBToYUVADithered(picture, colorspace, 0.f);
-}
-
 //------------------------------------------------------------------------------
 // Helper: clean up fully transparent area to help compressibility.
 
@@ -1039,220 +906,67 @@ void WebPCleanupTransparentArea(WebPPicture* pic) {
 #undef SIZE
 #undef SIZE2
 
-//------------------------------------------------------------------------------
-// Blend color and remove transparency info
-
-#define BLEND(V0, V1, ALPHA) \
-    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
-#define BLEND_10BIT(V0, V1, ALPHA) \
-    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
-
-void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
-  const int red = (background_rgb >> 16) & 0xff;
-  const int green = (background_rgb >> 8) & 0xff;
-  const int blue = (background_rgb >> 0) & 0xff;
-  VP8Random rg;
-  int x, y;
-  if (pic == NULL) return;
-  VP8InitRandom(&rg, 0.f);
-  if (!pic->use_argb) {
-    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
-    const int Y0 = RGBToY(red, green, blue, &rg);
-    // VP8RGBToU/V expects the u/v values summed over four pixels
-    const int U0 = RGBToU(4 * red, 4 * green, 4 * blue, &rg);
-    const int V0 = RGBToV(4 * red, 4 * green, 4 * blue, &rg);
-    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
-    if (!has_alpha || pic->a == NULL) return;    // nothing to do
-    for (y = 0; y < pic->height; ++y) {
-      // Luma blending
-      uint8_t* const y_ptr = pic->y + y * pic->y_stride;
-      uint8_t* const a_ptr = pic->a + y * pic->a_stride;
-      for (x = 0; x < pic->width; ++x) {
-        const int alpha = a_ptr[x];
-        if (alpha < 0xff) {
-          y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
-        }
-      }
-      // Chroma blending every even line
-      if ((y & 1) == 0) {
-        uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
-        uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
-        uint8_t* const a_ptr2 =
-            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
-        for (x = 0; x < uv_width; ++x) {
-          // Average four alpha values into a single blending weight.
-          // TODO(skal): might lead to visible contouring. Can we do better?
-          const int alpha =
-              a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
-              a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
-          u[x] = BLEND_10BIT(U0, u[x], alpha);
-          v[x] = BLEND_10BIT(V0, v[x], alpha);
-        }
-        if (pic->width & 1) {   // rightmost pixel
-          const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
-          u[x] = BLEND_10BIT(U0, u[x], alpha);
-          v[x] = BLEND_10BIT(V0, v[x], alpha);
-        }
-      }
-      memset(a_ptr, 0xff, pic->width);
-    }
-  } else {
-    uint32_t* argb = pic->argb;
-    const uint32_t background = MakeARGB32(red, green, blue);
-    for (y = 0; y < pic->height; ++y) {
-      for (x = 0; x < pic->width; ++x) {
-        const int alpha = (argb[x] >> 24) & 0xff;
-        if (alpha != 0xff) {
-          if (alpha > 0) {
-            int r = (argb[x] >> 16) & 0xff;
-            int g = (argb[x] >>  8) & 0xff;
-            int b = (argb[x] >>  0) & 0xff;
-            r = BLEND(red, r, alpha);
-            g = BLEND(green, g, alpha);
-            b = BLEND(blue, b, alpha);
-            argb[x] = MakeARGB32(r, g, b);
-          } else {
-            argb[x] = background;
-          }
-        }
-      }
-      argb += pic->argb_stride;
-    }
-  }
-}
-
-#undef BLEND
-#undef BLEND_10BIT
-
-//------------------------------------------------------------------------------
-// local-min distortion
-//
-// For every pixel in the *reference* picture, we search for the local best
-// match in the compressed image. This is not a symmetrical measure.
-
-// search radius. Shouldn't be too large.
-#define RADIUS 2
-
-static float AccumulateLSIM(const uint8_t* src, int src_stride,
-                            const uint8_t* ref, int ref_stride,
-                            int w, int h) {
-  int x, y;
-  double total_sse = 0.;
-  for (y = 0; y < h; ++y) {
-    const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
-    const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
-    for (x = 0; x < w; ++x) {
-      const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
-      const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
-      double best_sse = 255. * 255.;
-      const double value = (double)ref[y * ref_stride + x];
-      int i, j;
-      for (j = y_0; j < y_1; ++j) {
-        const uint8_t* s = src + j * src_stride;
-        for (i = x_0; i < x_1; ++i) {
-          const double sse = (double)(s[i] - value) * (s[i] - value);
-          if (sse < best_sse) best_sse = sse;
-        }
-      }
-      total_sse += best_sse;
-    }
-  }
-  return (float)total_sse;
-}
-#undef RADIUS
 
 //------------------------------------------------------------------------------
 // Distortion
 
 // Max value returned in case of exact similarity.
 static const double kMinDistortion_dB = 99.;
-static float GetPSNR(const double v) {
-  return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
-                          : kMinDistortion_dB);
-}
 
-int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+int WebPPictureDistortion(const WebPPicture* pic1, const WebPPicture* pic2,
                           int type, float result[5]) {
+  int c;
   DistoStats stats[5];
   int has_alpha;
-  int uv_w, uv_h;
 
-  if (src == NULL || ref == NULL ||
-      src->width != ref->width || src->height != ref->height ||
-      src->y == NULL || ref->y == NULL ||
-      src->u == NULL || ref->u == NULL ||
-      src->v == NULL || ref->v == NULL ||
+  if (pic1 == NULL || pic2 == NULL ||
+      pic1->width != pic2->width || pic1->height != pic2->height ||
+      pic1->y == NULL || pic2->y == NULL ||
+      pic1->u == NULL || pic2->u == NULL ||
+      pic1->v == NULL || pic2->v == NULL ||
       result == NULL) {
     return 0;
   }
   // TODO(skal): provide distortion for ARGB too.
-  if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
+  if (pic1->use_argb == 1 || pic1->use_argb != pic2->use_argb) {
     return 0;
   }
 
-  has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
-  if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
-      (has_alpha && (src->a == NULL || ref->a == NULL))) {
+  has_alpha = !!(pic1->colorspace & WEBP_CSP_ALPHA_BIT);
+  if (has_alpha != !!(pic2->colorspace & WEBP_CSP_ALPHA_BIT) ||
+      (has_alpha && (pic1->a == NULL || pic2->a == NULL))) {
     return 0;
   }
 
   memset(stats, 0, sizeof(stats));
-
-  uv_w = HALVE(src->width);
-  uv_h = HALVE(src->height);
-  if (type >= 2) {
-    float sse[4];
-    sse[0] = AccumulateLSIM(src->y, src->y_stride,
-                            ref->y, ref->y_stride, src->width, src->height);
-    sse[1] = AccumulateLSIM(src->u, src->uv_stride,
-                            ref->u, ref->uv_stride, uv_w, uv_h);
-    sse[2] = AccumulateLSIM(src->v, src->uv_stride,
-                            ref->v, ref->uv_stride, uv_w, uv_h);
-    sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
-                                        ref->a, ref->a_stride,
-                                        src->width, src->height)
-                       : 0.f;
-    result[0] = GetPSNR(sse[0] / (src->width * src->height));
-    result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
-    result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
-    result[3] = GetPSNR(sse[3] / (src->width * src->height));
-    {
-      double total_sse = sse[0] + sse[1] + sse[2];
-      int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
-      if (has_alpha) {
-        total_pixels += src->width * src->height;
-        total_sse += sse[3];
-      }
-      result[4] = GetPSNR(total_sse / total_pixels);
-    }
-  } else {
-    int c;
-    VP8SSIMAccumulatePlane(src->y, src->y_stride,
-                           ref->y, ref->y_stride,
-                           src->width, src->height, &stats[0]);
-    VP8SSIMAccumulatePlane(src->u, src->uv_stride,
-                           ref->u, ref->uv_stride,
-                           uv_w, uv_h, &stats[1]);
-    VP8SSIMAccumulatePlane(src->v, src->uv_stride,
-                           ref->v, ref->uv_stride,
-                           uv_w, uv_h, &stats[2]);
-    if (has_alpha) {
-      VP8SSIMAccumulatePlane(src->a, src->a_stride,
-                             ref->a, ref->a_stride,
-                             src->width, src->height, &stats[3]);
-    }
-    for (c = 0; c <= 4; ++c) {
-      if (type == 1) {
-        const double v = VP8SSIMGet(&stats[c]);
-        result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
-                                     : kMinDistortion_dB);
-      } else {
-        const double v = VP8SSIMGetSquaredError(&stats[c]);
-        result[c] = GetPSNR(v);
-      }
-      // Accumulate forward
-      if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
+  VP8SSIMAccumulatePlane(pic1->y, pic1->y_stride,
+                         pic2->y, pic2->y_stride,
+                         pic1->width, pic1->height, &stats[0]);
+  VP8SSIMAccumulatePlane(pic1->u, pic1->uv_stride,
+                         pic2->u, pic2->uv_stride,
+                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
+                         &stats[1]);
+  VP8SSIMAccumulatePlane(pic1->v, pic1->uv_stride,
+                         pic2->v, pic2->uv_stride,
+                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
+                         &stats[2]);
+  if (has_alpha) {
+    VP8SSIMAccumulatePlane(pic1->a, pic1->a_stride,
+                           pic2->a, pic2->a_stride,
+                           pic1->width, pic1->height, &stats[3]);
+  }
+  for (c = 0; c <= 4; ++c) {
+    if (type == 1) {
+      const double v = VP8SSIMGet(&stats[c]);
+      result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
+                                   : kMinDistortion_dB);
+    } else {
+      const double v = VP8SSIMGetSquaredError(&stats[c]);
+      result[c] = (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
+                                   : kMinDistortion_dB);
     }
+    // Accumulate forward
+    if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
   }
   return 1;
 }
@@ -1300,10 +1014,10 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, float q,          \
   return Encode(in, w, h, bps, IMPORTER, q, 0, out);                    \
 }
 
-ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
-ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
-ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
-ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
+ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB);
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR);
+ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA);
+ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA);
 
 #undef ENCODE_FUNC
 
@@ -1313,12 +1027,15 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) {       \
   return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out);  \
 }
 
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA);
 
 #undef LOSSLESS_ENCODE_FUNC
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/quant.c b/drivers/webp/enc/quant.c
index e1d202b5a3..ea153849c8 100644
--- a/drivers/webp/enc/quant.c
+++ b/drivers/webp/enc/quant.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //   Quantization
@@ -13,7 +11,6 @@
 
 #include <assert.h>
 #include <math.h>
-#include <stdlib.h>  // for abs()
 
 #include "./vp8enci.h"
 #include "./cost.h"
@@ -25,78 +22,16 @@
 
 #define MID_ALPHA 64      // neutral value for susceptibility
 #define MIN_ALPHA 30      // lowest usable value for susceptibility
-#define MAX_ALPHA 100     // higher meaningful value for susceptibility
+#define MAX_ALPHA 100     // higher meaningful value for susceptibility
 
 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                           // power-law modulation. Must be strictly less than 1.
 
-#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
-
-// number of non-zero coeffs below which we consider the block very flat
-// (and apply a penalty to complex predictions)
-#define FLATNESS_LIMIT_I16 10      // I16 mode
-#define FLATNESS_LIMIT_I4  3       // I4 mode
-#define FLATNESS_LIMIT_UV  2       // UV mode
-#define FLATNESS_PENALTY   140     // roughly ~1bit per block
-
 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
 
-// #define DEBUG_BLOCK
-
-//------------------------------------------------------------------------------
-
-#if defined(DEBUG_BLOCK)
-
-#include <stdio.h>
-#include <stdlib.h>
-
-static void PrintBlockInfo(const VP8EncIterator* const it,
-                           const VP8ModeScore* const rd) {
-  int i, j;
-  const int is_i16 = (it->mb_->type_ == 1);
-  printf("SOURCE / OUTPUT / ABS DELTA\n");
-  for (j = 0; j < 24; ++j) {
-    if (j == 16) printf("\n");   // newline before the U/V block
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
-    printf("     ");
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
-    printf("     ");
-    for (i = 0; i < 16; ++i) {
-      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
-    }
-    printf("\n");
-  }
-  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
-    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
-    (int)rd->score);
-  if (is_i16) {
-    printf("Mode: %d\n", rd->mode_i16);
-    printf("y_dc_levels:");
-    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
-    printf("\n");
-  } else {
-    printf("Modes[16]: ");
-    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
-    printf("\n");
-  }
-  printf("y_ac_levels:\n");
-  for (j = 0; j < 16; ++j) {
-    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
-      printf("%4d ", rd->y_ac_levels[j][i]);
-    }
-    printf("\n");
-  }
-  printf("\n");
-  printf("uv_levels (mode=%d):\n", rd->mode_uv);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 16; ++i) {
-      printf("%4d ", rd->uv_levels[j][i]);
-    }
-    printf("\n");
-  }
-}
-
-#endif   // DEBUG_BLOCK
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
 
 //------------------------------------------------------------------------------
 
@@ -165,13 +100,31 @@ static const uint16_t kAcTable2[128] = {
   385, 393, 401, 409, 416, 424, 432, 440
 };
 
-static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
-  { 96, 110 }, { 96, 108 }, { 110, 115 }
+static const uint16_t kCoeffThresh[16] = {
+  0,  10, 20, 30,
+  10, 20, 30, 30,
+  20, 30, 30, 30,
+  30, 30, 30, 30
+};
+
+// TODO(skal): tune more. Coeff thresholding?
+static const uint8_t kBiasMatrices[3][16] = {  // [3] = [luma-ac,luma-dc,chroma]
+  { 96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96 },
+  { 96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96 },
+  { 96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96 }
 };
 
-// Sharpening by (slightly) raising the hi-frequency coeffs.
+// Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
 // Hack-ish but helpful for mid-bitrate range. Use with care.
-#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
 static const uint8_t kFreqSharpening[16] = {
   0,  30, 60, 90,
   30, 60, 90, 90,
@@ -184,30 +137,20 @@ static const uint8_t kFreqSharpening[16] = {
 
 // Returns the average quantizer
 static int ExpandMatrix(VP8Matrix* const m, int type) {
-  int i, sum;
-  for (i = 0; i < 2; ++i) {
-    const int is_ac_coeff = (i > 0);
-    const int bias = kBiasMatrices[type][is_ac_coeff];
-    m->iq_[i] = (1 << QFIX) / m->q_[i];
-    m->bias_[i] = BIAS(bias);
-    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
-    //   * zero if coeff <= zthresh
-    //   * non-zero if coeff > zthresh
-    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
-  }
+  int i;
+  int sum = 0;
   for (i = 2; i < 16; ++i) {
     m->q_[i] = m->q_[1];
-    m->iq_[i] = m->iq_[1];
-    m->bias_[i] = m->bias_[1];
-    m->zthresh_[i] = m->zthresh_[1];
   }
-  for (sum = 0, i = 0; i < 16; ++i) {
-    if (type == 0) {  // we only use sharpening for AC luma coeffs
-      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
-    } else {
-      m->sharpen_[i] = 0;
-    }
-    sum += m->q_[i];
+  for (i = 0; i < 16; ++i) {
+    const int j = kZigzag[i];
+    const int bias = kBiasMatrices[type][j];
+    m->iq_[j] = (1 << QFIX) / m->q_[j];
+    m->bias_[j] = BIAS(bias);
+    // TODO(skal): tune kCoeffThresh[]
+    m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
+    m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
+    sum += m->q_[j];
   }
   return (sum + 8) >> 4;
 }
@@ -235,17 +178,17 @@ static void SetupMatrices(VP8Encoder* enc) {
     q16 = ExpandMatrix(&m->y2_, 1);
     quv = ExpandMatrix(&m->uv_, 2);
 
-    m->lambda_i4_          = (3 * q4 * q4) >> 7;
-    m->lambda_i16_         = (3 * q16 * q16);
-    m->lambda_uv_          = (3 * quv * quv) >> 6;
-    m->lambda_mode_        = (1 * q4 * q4) >> 7;
-    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
-    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
-    m->lambda_trellis_uv_  = (quv *quv) << 1;
-    m->tlambda_            = (tlambda_scale * q4) >> 5;
-
-    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
-    m->max_edge_  = 0;
+    // TODO: Switch to kLambda*[] tables?
+    {
+      m->lambda_i4_  = (3 * q4 * q4) >> 7;
+      m->lambda_i16_ = (3 * q16 * q16);
+      m->lambda_uv_  = (3 * quv * quv) >> 6;
+      m->lambda_mode_    = (1 * q4 * q4) >> 7;
+      m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
+      m->lambda_trellis_i16_ = (q16 * q16) >> 2;
+      m->lambda_trellis_uv_  = (quv *quv) << 1;
+      m->tlambda_            = (tlambda_scale * q4) >> 5;
+    }
   }
 }
 
@@ -254,21 +197,16 @@ static void SetupMatrices(VP8Encoder* enc) {
 
 // Very small filter-strength values have close to no visual effect. So we can
 // save a little decoding-CPU by turning filtering off for these.
-#define FSTRENGTH_CUTOFF 2
+#define FSTRENGTH_CUTOFF 3
 
 static void SetupFilterStrength(VP8Encoder* const enc) {
   int i;
-  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
-  const int level0 = 5 * enc->config_->filter_strength;
+  const int level0 = enc->config_->filter_strength;
   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
-    VP8SegmentInfo* const m = &enc->dqm_[i];
-    // We focus on the quantization of AC coeffs.
-    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
-    const int base_strength =
-        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
-    // Segments with lower complexity ('beta') will be less filtered.
-    const int f = base_strength * level0 / (256 + m->beta_);
-    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
+    // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
+    const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
+    const int f = level / (256 + enc->dqm_[i].beta_);
+    enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
   }
   // We record the initial strength (mainly for the case of 1-segment only).
   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
@@ -286,90 +224,28 @@ static void SetupFilterStrength(VP8Encoder* const enc) {
 // We want to emulate jpeg-like behaviour where the expected "good" quality
 // is around q=75. Internally, our "good" middle is around c=50. So we
 // map accordingly using linear piece-wise function
-static double QualityToCompression(double c) {
-  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
-  // The file size roughly scales as pow(quantizer, 3.). Actually, the
-  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
-  // in the mid-quant range. So we scale the compressibility inversely to
-  // this power-law: quant ~= compression ^ 1/3. This law holds well for
-  // low quant. Finer modeling for high-quant would make use of kAcTable[]
-  // more explicitly.
-  const double v = pow(linear_c, 1 / 3.);
-  return v;
-}
-
-static double QualityToJPEGCompression(double c, double alpha) {
-  // We map the complexity 'alpha' and quality setting 'c' to a compression
-  // exponent empirically matched to the compression curve of libjpeg6b.
-  // On average, the WebP output size will be roughly similar to that of a
-  // JPEG file compressed with same quality factor.
-  const double amin = 0.30;
-  const double amax = 0.85;
-  const double exp_min = 0.4;
-  const double exp_max = 0.9;
-  const double slope = (exp_min - exp_max) / (amax - amin);
-  // Linearly interpolate 'expn' from exp_min to exp_max
-  // in the [amin, amax] range.
-  const double expn = (alpha > amax) ? exp_min
-                    : (alpha < amin) ? exp_max
-                    : exp_max + slope * (alpha - amin);
-  const double v = pow(c, expn);
-  return v;
-}
-
-static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
-                                 const VP8SegmentInfo* const S2) {
-  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
-}
-
-static void SimplifySegments(VP8Encoder* const enc) {
-  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
-  const int num_segments = enc->segment_hdr_.num_segments_;
-  int num_final_segments = 1;
-  int s1, s2;
-  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
-    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
-    int found = 0;
-    // check if we already have similar segment
-    for (s2 = 0; s2 < num_final_segments; ++s2) {
-      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
-      if (SegmentsAreEquivalent(S1, S2)) {
-        found = 1;
-        break;
-      }
-    }
-    map[s1] = s2;
-    if (!found) {
-      if (num_final_segments != s1) {
-        enc->dqm_[num_final_segments] = enc->dqm_[s1];
-      }
-      ++num_final_segments;
-    }
-  }
-  if (num_final_segments < num_segments) {  // Remap
-    int i = enc->mb_w_ * enc->mb_h_;
-    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
-    enc->segment_hdr_.num_segments_ = num_final_segments;
-    // Replicate the trailing segment infos (it's mostly cosmetics)
-    for (i = num_final_segments; i < num_segments; ++i) {
-      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
-    }
-  }
+static double QualityToCompression(double q) {
+  const double c = q / 100.;
+  return (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
 }
 
 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
   int i;
   int dq_uv_ac, dq_uv_dc;
-  const int num_segments = enc->segment_hdr_.num_segments_;
+  const int num_segments = enc->config_->segments;
   const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
-  const double Q = quality / 100.;
-  const double c_base = enc->config_->emulate_jpeg_size ?
-      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
-      QualityToCompression(Q);
+  const double c_base = QualityToCompression(quality);
   for (i = 0; i < num_segments; ++i) {
-    // We modulate the base coefficient to accommodate for the quantization
-    // susceptibility and allow denser segments to be quantized more.
-    const double expn = 1. - amp * enc->dqm_[i].alpha_;
+    // The file size roughly scales as pow(quantizer, 3.). Actually, the
+    // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
+    // in the mid-quant range. So we scale the compressibility inversely to
+    // this power-law: quant ~= compression ^ 1/3. This law holds well for
+    // low quant. Finer modelling for high-quant would make use of kAcTable[]
+    // more explicitly.
+    // Additionally, we modulate the base exponent 1/3 to accommodate for the
+    // quantization susceptibility and allow denser segments to be quantized
+    // more.
+    const double expn = (1. - amp * enc->dqm_[i].alpha_) / 3.;
     const double c = pow(c_base, expn);
     const int q = (int)(127. * (1. - c));
     assert(expn > 0.);
@@ -405,11 +281,9 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
   enc->dq_uv_dc_ = dq_uv_dc;
   enc->dq_uv_ac_ = dq_uv_ac;
 
-  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
-
-  if (num_segments > 1) SimplifySegments(enc);
+  SetupMatrices(enc);
 
-  SetupMatrices(enc);         // finalize quantization matrices
+  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
 }
 
 //------------------------------------------------------------------------------
@@ -425,14 +299,16 @@ const int VP8I4ModeOffsets[NUM_BMODES] = {
 };
 
 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
-  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
-  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
   VP8EncPredLuma16(it->yuv_p_, left, top);
 }
 
 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
-  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
-  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
   VP8EncPredChroma8(it->yuv_p_, left, top);
 }
 
@@ -488,7 +364,6 @@ static void InitScore(VP8ModeScore* const rd) {
   rd->D  = 0;
   rd->SD = 0;
   rd->R  = 0;
-  rd->H  = 0;
   rd->nz = 0;
   rd->score = MAX_COST;
 }
@@ -497,7 +372,6 @@ static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   dst->D  = src->D;
   dst->SD = src->SD;
   dst->R  = src->R;
-  dst->H  = src->H;
   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
   dst->score = src->score;
 }
@@ -506,7 +380,6 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   dst->D  += src->D;
   dst->SD += src->SD;
   dst->R  += src->R;
-  dst->H  += src->H;
   dst->nz |= src->nz;     // here, new nz bits are accumulated.
   dst->score += src->score;
 }
@@ -535,7 +408,7 @@ typedef struct {
 
 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
   // TODO: incorporate the "* 256" in the tables?
-  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
+  rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
 }
 
 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
@@ -598,10 +471,11 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
     // note: it's important to take sign of the _original_ coeff,
     // so we don't have to consider level < 0 afterward.
     const int sign = (in[j] < 0);
-    const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
-    int level0 = QUANTDIV(coeff0, iQ, B);
-    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
+    int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    int level0;
+    if (coeff0 > 2047) coeff0 = 2047;
 
+    level0 = QUANTDIV(coeff0, iQ, B);
     // test all alternate level values around level0.
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       Node* const cur = &NODE(n, m);
@@ -613,7 +487,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
       cur->sign = sign;
       cur->level = level;
       cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
-      if (level > MAX_LEVEL || level < 0) {   // node is dead?
+      if (level >= 2048 || level < 0) {   // node is dead?
         cur->cost = MAX_COST;
         continue;
       }
@@ -706,10 +580,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
                               VP8ModeScore* const rd,
                               uint8_t* const yuv_out,
                               int mode) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
   const uint8_t* const src = it->yuv_in_ + Y_OFF;
-  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   int nz = 0;
   int n;
   int16_t tmp[16][16], dc_tmp[16];
@@ -718,7 +592,7 @@ static int ReconstructIntra16(VP8EncIterator* const it,
     VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
   }
   VP8FTransformWHT(tmp[0], dc_tmp);
-  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
+  nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;
 
   if (DO_TRELLIS_I16 && it->do_trellis_) {
     int x, y;
@@ -813,18 +687,7 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
 
 //------------------------------------------------------------------------------
 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
-// Pick the mode is lower RD-cost = Rate + lambda * Distortion.
-
-static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
-  // We look at the first three AC coefficients to determine what is the average
-  // delta between each sub-4x4 block.
-  const int v0 = abs(DCs[1]);
-  const int v1 = abs(DCs[4]);
-  const int v2 = abs(DCs[5]);
-  int max_v = (v0 > v1) ? v1 : v0;
-  max_v = (v2 > max_v) ? v2 : max_v;
-  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
-}
+// Pick the mode with lower RD-cost = Rate + lambda * Distortion.
 
 static void SwapPtr(uint8_t** a, uint8_t** b) {
   uint8_t* const tmp = *a;
@@ -836,23 +699,9 @@ static void SwapOut(VP8EncIterator* const it) {
   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 }
 
-static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
-  score_t score = 0;
-  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
-    int i;
-    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
-      score += (levels[i] != 0);
-      if (score > thresh) return 0;
-    }
-    levels += 16;
-  }
-  return 1;
-}
-
 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  const int kNumBlocks = 16;
-  VP8Encoder* const enc = it->enc_;
-  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const VP8Encoder* const enc = it->enc_;
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i16_;
   const int tlambda = dqm->tlambda_;
   const uint8_t* const src = it->yuv_in_ + Y_OFF;
@@ -860,7 +709,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
   int mode;
 
   rd->mode_i16 = -1;
-  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+  for (mode = 0; mode < 4; ++mode) {
     uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
     int nz;
 
@@ -871,13 +720,8 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
     rd16.D = VP8SSE16x16(src, tmp_dst);
     rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
             : 0;
-    rd16.H = VP8FixedCostsI16[mode];
     rd16.R = VP8GetCostLuma16(it, &rd16);
-    if (mode > 0 &&
-        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
-      // penalty to avoid flat area to be mispredicted by complex mode
-      rd16.R += FLATNESS_PENALTY * kNumBlocks;
-    }
+    rd16.R += VP8FixedCostsI16[mode];
 
     // Since we always examine Intra16 first, we can overwrite *rd directly.
     SetRDScore(lambda, &rd16);
@@ -892,13 +736,6 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
   }
   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
   VP8SetIntra16Mode(it, rd->mode_i16);
-
-  // we have a blocky macroblock (only DCs are non-zero) with fairly high
-  // distortion, record max delta so we can later adjust the minimal filtering
-  // strength needed to smooth these blocks out.
-  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
-    StoreMaxDelta(dqm, rd->y_dc_levels);
-  }
 }
 
 //------------------------------------------------------------------------------
@@ -928,11 +765,9 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
   }
 
   InitScore(&rd_best);
-  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
-  SetRDScore(dqm->lambda_mode_, &rd_best);
+  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
   VP8IteratorStartI4(it);
   do {
-    const int kNumBlocks = 1;
     VP8ModeScore rd_i4;
     int mode;
     int best_mode = -1;
@@ -956,11 +791,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
       rd_tmp.SD =
           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                   : 0;
-      rd_tmp.H = mode_costs[mode];
       rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
-      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
-        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
-      }
+      rd_tmp.R += mode_costs[mode];
 
       SetRDScore(lambda, &rd_tmp);
       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
@@ -972,17 +804,14 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
     }
     SetRDScore(dqm->lambda_mode_, &rd_i4);
     AddScore(&rd_best, &rd_i4);
-    if (rd_best.score >= rd->score) {
-      return 0;
-    }
-    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
-    if (total_header_bits > enc->max_i4_header_bits_) {
+    total_header_bits += mode_costs[best_mode];
+    if (rd_best.score >= rd->score ||
+        total_header_bits > enc->max_i4_header_bits_) {
       return 0;
     }
     // Copy selected samples if not in the right place already.
-    if (best_block != best_blocks + VP8Scan[it->i4_]) {
+    if (best_block != best_blocks + VP8Scan[it->i4_])
       VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
-    }
     rd->modes_i4[it->i4_] = best_mode;
     it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
   } while (VP8IteratorRotateI4(it, best_blocks));
@@ -998,7 +827,6 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------
 
 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  const int kNumBlocks = 8;
   const VP8Encoder* const enc = it->enc_;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_uv_;
@@ -1010,7 +838,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 
   rd->mode_uv = -1;
   InitScore(&rd_best);
-  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+  for (mode = 0; mode < 4; ++mode) {
     VP8ModeScore rd_uv;
 
     // Reconstruct
@@ -1019,11 +847,8 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
     // Compute RD-score
     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
     rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
-    rd_uv.H  = VP8FixedCostsUV[mode];
     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
-    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
-      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
-    }
+    rd_uv.R += VP8FixedCostsUV[mode];
 
     SetRDScore(lambda, &rd_uv);
     if (mode == 0 || rd_uv.score < rd_best.score) {
@@ -1042,10 +867,10 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 
 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
   const VP8Encoder* const enc = it->enc_;
-  const int is_i16 = (it->mb_->type_ == 1);
+  const int i16 = (it->mb_->type_ == 1);
   int nz = 0;
 
-  if (is_i16) {
+  if (i16) {
     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
   } else {
     VP8IteratorStartI4(it);
@@ -1064,66 +889,11 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
   rd->nz = nz;
 }
 
-// Refine intra16/intra4 sub-modes based on distortion only (not rate).
-static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
-  const int is_i16 = (it->mb_->type_ == 1);
-  score_t best_score = MAX_COST;
-
-  if (try_both_i4_i16 || is_i16) {
-    int mode;
-    int best_mode = -1;
-    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
-      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
-      const uint8_t* const src = it->yuv_in_ + Y_OFF;
-      const score_t score = VP8SSE16x16(src, ref);
-      if (score < best_score) {
-        best_mode = mode;
-        best_score = score;
-      }
-    }
-    VP8SetIntra16Mode(it, best_mode);
-  }
-  if (try_both_i4_i16 || !is_i16) {
-    uint8_t modes_i4[16];
-    // We don't evaluate the rate here, but just account for it through a
-    // constant penalty (i4 mode usually needs more bits compared to i16).
-    score_t score_i4 = (score_t)I4_PENALTY;
-
-    VP8IteratorStartI4(it);
-    do {
-      int mode;
-      int best_sub_mode = -1;
-      score_t best_sub_score = MAX_COST;
-      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
-
-      // TODO(skal): we don't really need the prediction pixels here,
-      // but just the distortion against 'src'.
-      VP8MakeIntra4Preds(it);
-      for (mode = 0; mode < NUM_BMODES; ++mode) {
-        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
-        const score_t score = VP8SSE4x4(src, ref);
-        if (score < best_sub_score) {
-          best_sub_mode = mode;
-          best_sub_score = score;
-        }
-      }
-      modes_i4[it->i4_] = best_sub_mode;
-      score_i4 += best_sub_score;
-      if (score_i4 >= best_score) break;
-    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
-    if (score_i4 < best_score) {
-      VP8SetIntra4Mode(it, modes_i4);
-    }
-  }
-}
-
 //------------------------------------------------------------------------------
 // Entry point
 
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
-                VP8RDLevel rd_opt) {
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {
   int is_skipped;
-  const int method = it->enc_->method_;
 
   InitScore(rd);
 
@@ -1132,21 +902,22 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
   VP8MakeLuma16Preds(it);
   VP8MakeChroma8Preds(it);
 
-  if (rd_opt > RD_OPT_NONE) {
-    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
+  // for rd_opt = 2, we perform trellis-quant on the final decision only.
+  // for rd_opt > 2, we use it for every scoring (=much slower).
+  if (rd_opt > 0) {
+    it->do_trellis_ = (rd_opt > 2);
     PickBestIntra16(it, rd);
-    if (method >= 2) {
+    if (it->enc_->method_ >= 2) {
       PickBestIntra4(it, rd);
     }
     PickBestUV(it, rd);
-    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
+    if (rd_opt == 2) {
       it->do_trellis_ = 1;
       SimpleQuantize(it, rd);
     }
   } else {
-    // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
-    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
-    DistoRefine(it, (method >= 2));
+    // TODO: for method_ == 2, pick the best intra4/intra16 based on SSE
+    it->do_trellis_ = (it->enc_->method_ == 2);
     SimpleQuantize(it, rd);
   }
   is_skipped = (rd->nz == 0);
@@ -1154,3 +925,6 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
   return is_skipped;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/syntax.c b/drivers/webp/enc/syntax.c
index 08cfe79ece..7c8c7b1a84 100644
--- a/drivers/webp/enc/syntax.c
+++ b/drivers/webp/enc/syntax.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Header syntax writing
@@ -13,20 +11,35 @@
 
 #include <assert.h>
 
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"  // RIFF constants
-#include "../webp/mux_types.h"         // ALPHA_FLAG
+#include "../webp/format_constants.h"
 #include "./vp8enci.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Helper functions
 
+// TODO(later): Move to webp/format_constants.h?
+static void PutLE24(uint8_t* const data, uint32_t val) {
+  data[0] = (val >>  0) & 0xff;
+  data[1] = (val >>  8) & 0xff;
+  data[2] = (val >> 16) & 0xff;
+}
+
+static void PutLE32(uint8_t* const data, uint32_t val) {
+  PutLE24(data, val);
+  data[3] = (val >> 24) & 0xff;
+}
+
 static int IsVP8XNeeded(const VP8Encoder* const enc) {
   return !!enc->has_alpha_;  // Currently the only case when VP8X is needed.
                              // This could change in the future.
 }
 
 static int PutPaddingByte(const WebPPicture* const pic) {
+
   const uint8_t pad_byte[1] = { 0 };
   return !!pic->writer(pad_byte, 1, pic);
 }
@@ -60,14 +73,14 @@ static WebPEncodingError PutVP8XHeader(const VP8Encoder* const enc) {
   assert(pic->width <= MAX_CANVAS_SIZE && pic->height <= MAX_CANVAS_SIZE);
 
   if (enc->has_alpha_) {
-    flags |= ALPHA_FLAG;
+    flags |= ALPHA_FLAG_BIT;
   }
 
   PutLE32(vp8x + TAG_SIZE,              VP8X_CHUNK_SIZE);
   PutLE32(vp8x + CHUNK_HEADER_SIZE,     flags);
   PutLE24(vp8x + CHUNK_HEADER_SIZE + 4, pic->width - 1);
   PutLE24(vp8x + CHUNK_HEADER_SIZE + 7, pic->height - 1);
-  if (!pic->writer(vp8x, sizeof(vp8x), pic)) {
+  if(!pic->writer(vp8x, sizeof(vp8x), pic)) {
     return VP8_ENC_ERROR_BAD_WRITE;
   }
   return VP8_ENC_OK;
@@ -314,9 +327,7 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {
 
   PutSegmentHeader(bw, enc);
   PutFilterHeader(bw, &enc->filter_hdr_);
-  VP8PutValue(bw, enc->num_parts_ == 8 ? 3 :
-                  enc->num_parts_ == 4 ? 2 :
-                  enc->num_parts_ == 2 ? 1 : 0, 2);
+  VP8PutValue(bw, enc->config_->partitions, 2);
   PutQuant(bw, enc);
   VP8PutBitUniform(bw, 0);   // no proba update
   VP8WriteProbas(bw, &enc->proba_);
@@ -421,3 +432,6 @@ int VP8EncWrite(VP8Encoder* const enc) {
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/token.c b/drivers/webp/enc/token.c
deleted file mode 100644
index e696642f16..0000000000
--- a/drivers/webp/enc/token.c
+++ /dev/null
@@ -1,273 +0,0 @@
-// Copyright 2011 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Paginated token buffer
-//
-//  A 'token' is a bit value associated with a probability, either fixed
-// or a later-to-be-determined after statistics have been collected.
-// For dynamic probability, we just record the slot id (idx) for the probability
-// value in the final probability array (uint8_t* probas in VP8EmitTokens).
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "./cost.h"
-#include "./vp8enci.h"
-
-#if !defined(DISABLE_TOKEN_BUFFER)
-
-// we use pages to reduce the number of memcpy()
-#define MAX_NUM_TOKEN 8192          // max number of token per page
-#define FIXED_PROBA_BIT (1u << 14)
-
-struct VP8Tokens {
-  uint16_t tokens_[MAX_NUM_TOKEN];  // bit#15: bit
-                                    // bit #14: constant proba or idx
-                                    // bits 0..13: slot or constant proba
-  VP8Tokens* next_;
-};
-
-//------------------------------------------------------------------------------
-
-void VP8TBufferInit(VP8TBuffer* const b) {
-  b->tokens_ = NULL;
-  b->pages_ = NULL;
-  b->last_page_ = &b->pages_;
-  b->left_ = 0;
-  b->error_ = 0;
-}
-
-void VP8TBufferClear(VP8TBuffer* const b) {
-  if (b != NULL) {
-    const VP8Tokens* p = b->pages_;
-    while (p != NULL) {
-      const VP8Tokens* const next = p->next_;
-      free((void*)p);
-      p = next;
-    }
-    VP8TBufferInit(b);
-  }
-}
-
-static int TBufferNewPage(VP8TBuffer* const b) {
-  VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
-  if (page == NULL) {
-    b->error_ = 1;
-    return 0;
-  }
-  *b->last_page_ = page;
-  b->last_page_ = &page->next_;
-  b->left_ = MAX_NUM_TOKEN;
-  b->tokens_ = page->tokens_;
-  page->next_ = NULL;
-  return 1;
-}
-
-//------------------------------------------------------------------------------
-
-#define TOKEN_ID(t, b, ctx, p) \
-    ((p) + NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
-
-static WEBP_INLINE int AddToken(VP8TBuffer* const b,
-                                int bit, uint32_t proba_idx) {
-  assert(proba_idx < FIXED_PROBA_BIT);
-  assert(bit == 0 || bit == 1);
-  if (b->left_ > 0 || TBufferNewPage(b)) {
-    const int slot = --b->left_;
-    b->tokens_[slot] = (bit << 15) | proba_idx;
-  }
-  return bit;
-}
-
-static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
-                                         int bit, int proba) {
-  assert(proba < 256);
-  assert(bit == 0 || bit == 1);
-  if (b->left_ > 0 || TBufferNewPage(b)) {
-    const int slot = --b->left_;
-    b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
-  }
-}
-
-int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
-                         const int16_t* const coeffs,
-                         VP8TBuffer* const tokens) {
-  int n = first;
-  uint32_t base_id = TOKEN_ID(coeff_type, n, ctx, 0);
-  if (!AddToken(tokens, last >= 0, base_id + 0)) {
-    return 0;
-  }
-
-  while (n < 16) {
-    const int c = coeffs[n++];
-    const int sign = c < 0;
-    int v = sign ? -c : c;
-    if (!AddToken(tokens, v != 0, base_id + 1)) {
-      ctx = 0;
-      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
-      continue;
-    }
-    if (!AddToken(tokens, v > 1, base_id + 2)) {
-      ctx = 1;
-    } else {
-      if (!AddToken(tokens, v > 4, base_id + 3)) {
-        if (AddToken(tokens, v != 2, base_id + 4))
-          AddToken(tokens, v == 4, base_id + 5);
-      } else if (!AddToken(tokens, v > 10, base_id + 6)) {
-        if (!AddToken(tokens, v > 6, base_id + 7)) {
-          AddConstantToken(tokens, v == 6, 159);
-        } else {
-          AddConstantToken(tokens, v >= 9, 165);
-          AddConstantToken(tokens, !(v & 1), 145);
-        }
-      } else {
-        int mask;
-        const uint8_t* tab;
-        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
-          AddToken(tokens, 0, base_id + 8);
-          AddToken(tokens, 0, base_id + 9);
-          v -= 3 + (8 << 0);
-          mask = 1 << 2;
-          tab = VP8Cat3;
-        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
-          AddToken(tokens, 0, base_id + 8);
-          AddToken(tokens, 1, base_id + 9);
-          v -= 3 + (8 << 1);
-          mask = 1 << 3;
-          tab = VP8Cat4;
-        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
-          AddToken(tokens, 1, base_id + 8);
-          AddToken(tokens, 0, base_id + 10);
-          v -= 3 + (8 << 2);
-          mask = 1 << 4;
-          tab = VP8Cat5;
-        } else {                         // VP8Cat6 (11b)
-          AddToken(tokens, 1, base_id + 8);
-          AddToken(tokens, 1, base_id + 10);
-          v -= 3 + (8 << 3);
-          mask = 1 << 10;
-          tab = VP8Cat6;
-        }
-        while (mask) {
-          AddConstantToken(tokens, !!(v & mask), *tab++);
-          mask >>= 1;
-        }
-      }
-      ctx = 2;
-    }
-    AddConstantToken(tokens, sign, 128);
-    base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
-    if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
-      return 1;   // EOB
-    }
-  }
-  return 1;
-}
-
-#undef TOKEN_ID
-
-//------------------------------------------------------------------------------
-// This function works, but isn't currently used. Saved for later.
-
-#if 0
-
-static void Record(int bit, proba_t* const stats) {
-  proba_t p = *stats;
-  if (p >= 0xffff0000u) {               // an overflow is inbound.
-    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
-  }
-  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
-  p += 0x00010000u + bit;
-  *stats = p;
-}
-
-void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
-  const VP8Tokens* p = b->pages_;
-  while (p != NULL) {
-    const int N = (p->next_ == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
-      if (!(token & FIXED_PROBA_BIT)) {
-        Record((token >> 15) & 1, stats + (token & 0x3fffu));
-      }
-    }
-    p = p->next_;
-  }
-}
-
-#endif   // 0
-
-//------------------------------------------------------------------------------
-// Final coding pass, with known probabilities
-
-int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
-                  const uint8_t* const probas, int final_pass) {
-  const VP8Tokens* p = b->pages_;
-  (void)final_pass;
-  if (b->error_) return 0;
-  while (p != NULL) {
-    const VP8Tokens* const next = p->next_;
-    const int N = (next == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
-      const int bit = (token >> 15) & 1;
-      if (token & FIXED_PROBA_BIT) {
-        VP8PutBit(bw, bit, token & 0xffu);  // constant proba
-      } else {
-        VP8PutBit(bw, bit, probas[token & 0x3fffu]);
-      }
-    }
-    if (final_pass) free((void*)p);
-    p = next;
-  }
-  if (final_pass) b->pages_ = NULL;
-  return 1;
-}
-
-// Size estimation
-size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
-  size_t size = 0;
-  const VP8Tokens* p = b->pages_;
-  if (b->error_) return 0;
-  while (p != NULL) {
-    const VP8Tokens* const next = p->next_;
-    const int N = (next == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
-      const int bit = token & (1 << 15);
-      if (token & FIXED_PROBA_BIT) {
-        size += VP8BitCost(bit, token & 0xffu);
-      } else {
-        size += VP8BitCost(bit, probas[token & 0x3fffu]);
-      }
-    }
-    p = next;
-  }
-  return size;
-}
-
-//------------------------------------------------------------------------------
-
-#else     // DISABLE_TOKEN_BUFFER
-
-void VP8TBufferInit(VP8TBuffer* const b) {
-  (void)b;
-}
-void VP8TBufferClear(VP8TBuffer* const b) {
-  (void)b;
-}
-
-#endif    // !DISABLE_TOKEN_BUFFER
-
diff --git a/drivers/webp/enc/tree.c b/drivers/webp/enc/tree.c
index e5d05e5221..8b25e5e488 100644
--- a/drivers/webp/enc/tree.c
+++ b/drivers/webp/enc/tree.c
@@ -1,24 +1,27 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-// Coding of token probabilities, intra modes and segments.
+// Token probabilities
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include "./vp8enci.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Default probabilities
 
 // Paragraph 13.5
 const uint8_t
   VP8CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  // generated using vp8_default_coef_probs() in entropy.c:129
   { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@@ -315,7 +318,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
   VP8EncIterator it;
   VP8IteratorInit(enc, &it);
   do {
-    const VP8MBInfo* const mb = it.mb_;
+    const VP8MBInfo* mb = it.mb_;
     const uint8_t* preds = it.preds_;
     if (enc->segment_hdr_.update_map_) {
       PutSegment(bw, mb->segment_, enc->proba_.segments_);
@@ -340,7 +343,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
       }
     }
     PutUVMode(bw, mb->uv_mode_);
-  } while (VP8IteratorNext(&it));
+  } while (VP8IteratorNext(&it, 0));
 }
 
 //------------------------------------------------------------------------------
@@ -502,3 +505,6 @@ void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas) {
   }
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/vp8enci.h b/drivers/webp/enc/vp8enci.h
index 71adf6c38a..a77778c0d8 100644
--- a/drivers/webp/enc/vp8enci.h
+++ b/drivers/webp/enc/vp8enci.h
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 //   WebP encoder: internal header.
@@ -18,9 +16,8 @@
 #include "../webp/encode.h"
 #include "../dsp/dsp.h"
 #include "../utils/bit_writer.h"
-#include "../utils/thread.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -29,9 +26,12 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 4
+#define ENC_MIN_VERSION 2
 #define ENC_REV_VERSION 0
 
+// size of histogram used by CollectHistogram.
+#define MAX_COEFF_THRESH   64
+
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
        B_TM_PRED = 1,
@@ -47,8 +47,7 @@ enum { B_DC_PRED = 0,   // 4x4 modes
 
        // Luma16 or UV modes
        DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
-       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
-       NUM_PRED_MODES = 4
+       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED
      };
 
 enum { NUM_MB_SEGMENTS = 4,
@@ -57,24 +56,16 @@ enum { NUM_MB_SEGMENTS = 4,
        NUM_BANDS = 8,
        NUM_CTX = 3,
        NUM_PROBAS = 11,
-       MAX_LF_LEVELS = 64,       // Maximum loop filter level
-       MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
-       MAX_LEVEL = 2047          // max level (note: max codable is 2047 + 67)
+       MAX_LF_LEVELS = 64,      // Maximum loop filter level
+       MAX_VARIABLE_LEVEL = 67  // last (inclusive) level with variable cost
      };
 
-typedef enum {   // Rate-distortion optimization levels
-  RD_OPT_NONE        = 0,  // no rd-opt
-  RD_OPT_BASIC       = 1,  // basic scoring (no trellis)
-  RD_OPT_TRELLIS     = 2,  // perform trellis-quant on the final decision only
-  RD_OPT_TRELLIS_ALL = 3   // trellis-quant for every scoring (much slower)
-} VP8RDLevel;
-
 // YUV-cache parameters. Cache is 16-pixels wide.
 // The original or reconstructed samples can be accessed using VP8Scan[]
 // The predicted blocks can be accessed using offsets to yuv_p_ and
 // the arrays VP8*ModeOffsets[];
 //         +----+      YUV Samples area. See VP8Scan[] for accessing the blocks.
-//  Y_OFF  |YYYY| <- original samples  ('yuv_in_')
+//  Y_OFF  |YYYY| <- original samples  (enc->yuv_in_)
 //         |YYYY|
 //         |YYYY|
 //         |YYYY|
@@ -169,17 +160,7 @@ typedef int64_t score_t;     // type used for scores, rate, distortion
 static WEBP_INLINE int QUANTDIV(int n, int iQ, int B) {
   return (n * iQ + B) >> QFIX;
 }
-
-// size of histogram used by CollectHistogram.
-#define MAX_COEFF_THRESH   31
-typedef struct VP8Histogram VP8Histogram;
-struct VP8Histogram {
-  // TODO(skal): we only need to store the max_value and last_non_zero actually.
-  int distribution[MAX_COEFF_THRESH + 1];
-};
-
-// Uncomment the following to remove token-buffer code:
-// #define DISABLE_TOKEN_BUFFER
+extern const uint8_t VP8Zigzag[16];
 
 //------------------------------------------------------------------------------
 // Headers
@@ -248,19 +229,16 @@ typedef struct {
   int beta_;       // filter-susceptibility, range [0,255].
   int quant_;      // final segment quantizer.
   int fstrength_;  // final in-loop filtering strength
-  int max_edge_;   // max edge delta (for filtering strength)
-  int min_disto_;  // minimum distortion required to trigger filtering record
   // reactivities
   int lambda_i16_, lambda_i4_, lambda_uv_;
   int lambda_mode_, lambda_trellis_, tlambda_;
   int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_;
 } VP8SegmentInfo;
 
-// Handy transient struct to accumulate score and info during RD-optimization
+// Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
 typedef struct {
-  score_t D, SD;              // Distortion, spectral distortion
-  score_t H, R, score;        // header bits, rate, score.
+  score_t D, SD, R, score;    // Distortion, spectral distortion, rate, score.
   int16_t y_dc_levels[16];    // Quantized levels for luma-DC, luma-AC, chroma.
   int16_t y_ac_levels[16][16];
   int16_t uv_levels[4 + 4][16];
@@ -274,11 +252,12 @@ typedef struct {
 // right neighbouring data (samples, predictions, contexts, ...)
 typedef struct {
   int x_, y_;                      // current macroblock
+  int y_offset_, uv_offset_;       // offset to the luma / chroma planes
   int y_stride_, uv_stride_;       // respective strides
-  uint8_t*      yuv_in_;           // input samples
-  uint8_t*      yuv_out_;          // output samples
-  uint8_t*      yuv_out2_;         // secondary buffer swapped with yuv_out_.
-  uint8_t*      yuv_p_;            // scratch buffer for prediction
+  uint8_t*      yuv_in_;           // borrowed from enc_ (for now)
+  uint8_t*      yuv_out_;          // ''
+  uint8_t*      yuv_out2_;         // ''
+  uint8_t*      yuv_p_;            // ''
   VP8Encoder*   enc_;              // back-pointer
   VP8MBInfo*    mb_;               // current macroblock
   VP8BitWriter* bw_;               // current bit-writer
@@ -294,43 +273,24 @@ typedef struct {
   uint64_t      uv_bits_;          // macroblock bit-cost for chroma
   LFStats*      lf_stats_;         // filter stats (borrowed from enc_)
   int           do_trellis_;       // if true, perform extra level optimisation
-  int           count_down_;       // number of mb still to be processed
-  int           count_down0_;      // starting counter value (for progress)
+  int           done_;             // true when scan is finished
   int           percent0_;         // saved initial progress percent
-
-  uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
-  uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
-  uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
-
-  uint8_t* y_top_;     // top luma samples at position 'x_'
-  uint8_t* uv_top_;    // top u/v samples at position 'x_', packed as 16 bytes
-
-  // memory for storing y/u/v_left_ and yuv_in_/out_*
-  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST];     // memory for *_left_
-  uint8_t yuv_mem_[3 * YUV_SIZE + PRED_SIZE + ALIGN_CST];  // memory for yuv_*
 } VP8EncIterator;
 
   // in iterator.c
-// must be called first
+// must be called first.
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
-// restart a scan
+// restart a scan.
 void VP8IteratorReset(VP8EncIterator* const it);
-// reset iterator position to row 'y'
-void VP8IteratorSetRow(VP8EncIterator* const it, int y);
-// set count down (=number of iterations to go)
-void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down);
-// return true if iteration is finished
-int VP8IteratorIsDone(const VP8EncIterator* const it);
-// Import uncompressed samples from source.
-// If tmp_32 is not NULL, import boundary samples too.
-// tmp_32 is a 32-bytes scratch buffer that must be aligned in memory.
-void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32);
+// import samples from source
+void VP8IteratorImport(const VP8EncIterator* const it);
 // export decimated samples
 void VP8IteratorExport(const VP8EncIterator* const it);
-// go to next macroblock. Returns false if not finished.
-int VP8IteratorNext(VP8EncIterator* const it);
-// save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
-void VP8IteratorSaveBoundary(VP8EncIterator* const it);
+// go to next macroblock. Returns !done_. If block_to_save is non-null, will
+// save the boundary values to top_/left_ arrays. block_to_save can be
+// it->yuv_out_ or it->yuv_in_.
+int VP8IteratorNext(VP8EncIterator* const it,
+                    const uint8_t* const block_to_save);
 // Report progression based on macroblock rows. Return 0 for user-abort request.
 int VP8IteratorProgress(const VP8EncIterator* const it,
                         int final_delta_percent);
@@ -354,40 +314,44 @@ void VP8SetSegment(const VP8EncIterator* const it, int segment);
 //------------------------------------------------------------------------------
 // Paginated token buffer
 
-typedef struct VP8Tokens VP8Tokens;  // struct details in token.c
-
-typedef struct {
-#if !defined(DISABLE_TOKEN_BUFFER)
-  VP8Tokens* pages_;        // first page
-  VP8Tokens** last_page_;   // last page
-  uint16_t* tokens_;        // set to (*last_page_)->tokens_
-  int left_;          // how many free tokens left before the page is full.
-#endif
-  int error_;         // true in case of malloc error
-} VP8TBuffer;
-
-void VP8TBufferInit(VP8TBuffer* const b);    // initialize an empty buffer
-void VP8TBufferClear(VP8TBuffer* const b);   // de-allocate pages memory
+// WIP: #define USE_TOKEN_BUFFER
 
-#if !defined(DISABLE_TOKEN_BUFFER)
+#ifdef USE_TOKEN_BUFFER
 
-// Finalizes bitstream when probabilities are known.
-// Deletes the allocated token memory if final_pass is true.
-int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
-                  const uint8_t* const probas, int final_pass);
+#define MAX_NUM_TOKEN 2048
 
-// record the coding of coefficients without knowing the probabilities yet
-int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
-                         const int16_t* const coeffs,
-                         VP8TBuffer* const tokens);
+typedef struct VP8Tokens VP8Tokens;
+struct VP8Tokens {
+  uint16_t tokens_[MAX_NUM_TOKEN];  // bit#15: bit, bits 0..14: slot
+  int left_;
+  VP8Tokens* next_;
+};
 
-// Estimate the final coded size given a set of 'probas'.
-size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas);
+typedef struct {
+  VP8Tokens* rows_;
+  uint16_t* tokens_;    // set to (*last_)->tokens_
+  VP8Tokens** last_;
+  int left_;
+  int error_;  // true in case of malloc error
+} VP8TBuffer;
 
-// unused for now
-void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats);
+void VP8TBufferInit(VP8TBuffer* const b);    // initialize an empty buffer
+int VP8TBufferNewPage(VP8TBuffer* const b);  // allocate a new page
+void VP8TBufferClear(VP8TBuffer* const b);   // de-allocate memory
+
+int VP8EmitTokens(const VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas);
+
+static WEBP_INLINE int VP8AddToken(VP8TBuffer* const b,
+                                   int bit, int proba_idx) {
+  if (b->left_ > 0 || VP8TBufferNewPage(b)) {
+    const int slot = --b->left_;
+    b->tokens_[slot] = (bit << 15) | proba_idx;
+  }
+  return bit;
+}
 
-#endif  // !DISABLE_TOKEN_BUFFER
+#endif  // USE_TOKEN_BUFFER
 
 //------------------------------------------------------------------------------
 // VP8Encoder
@@ -412,7 +376,6 @@ struct VP8Encoder {
   // per-partition boolean decoders.
   VP8BitWriter bw_;                         // part0
   VP8BitWriter parts_[MAX_NUM_PARTITIONS];  // token partitions
-  VP8TBuffer tokens_;                       // token buffer
 
   int percent_;                             // for progress
 
@@ -420,7 +383,6 @@ struct VP8Encoder {
   int has_alpha_;
   uint8_t* alpha_data_;       // non-NULL if transparency is present
   uint32_t alpha_data_size_;
-  WebPWorker alpha_worker_;
 
   // enhancement layer
   int use_layer_;
@@ -432,7 +394,6 @@ struct VP8Encoder {
   VP8SegmentInfo dqm_[NUM_MB_SEGMENTS];
   int base_quant_;                 // nominal quantizer value. Only used
                                    // for relative coding of segments' quant.
-  int alpha_;                      // global susceptibility (<=> complexity)
   int uv_alpha_;                   // U/V quantization susceptibility
   // global offset of quantizers, shared by all segments
   int dq_y1_dc_;
@@ -448,20 +409,25 @@ struct VP8Encoder {
   int      block_count_[3];
 
   // quality/speed settings
-  int method_;               // 0=fastest, 6=best/slowest.
-  VP8RDLevel rd_opt_level_;  // Deduced from method_.
-  int max_i4_header_bits_;   // partition #0 safeness factor
-  int thread_level_;         // derived from config->thread_level
-  int do_search_;            // derived from config->target_XXX
-  int use_tokens_;           // if true, use token buffer
+  int method_;              // 0=fastest, 6=best/slowest.
+  int rd_opt_level_;        // Deduced from method_.
+  int max_i4_header_bits_;  // partition #0 safeness factor
 
   // Memory
   VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
   uint8_t*   preds_;     // predictions modes: (4*mb_w+1) * (4*mb_h+1)
   uint32_t*  nz_;        // non-zero bit context: mb_w+1
+  uint8_t*   yuv_in_;    // input samples
+  uint8_t*   yuv_out_;   // output samples
+  uint8_t*   yuv_out2_;  // secondary scratch out-buffer. swapped with yuv_out_.
+  uint8_t*   yuv_p_;     // scratch buffer for prediction
   uint8_t   *y_top_;     // top luma samples.
   uint8_t   *uv_top_;    // top u/v samples.
-                         // U and V are packed into 16 bytes (8 U + 8 V)
+                         // U and V are packed into 16 pixels (8 U + 8 V)
+  uint8_t   *y_left_;    // left luma samples (addressable from index -1 to 15).
+  uint8_t   *u_left_;    // left u samples (addressable from index -1 to 7)
+  uint8_t   *v_left_;    // left v samples (addressable from index -1 to 7)
+
   LFStats   *lf_stats_;  // autofilter stats (if NULL, autofilter is off)
 };
 
@@ -489,11 +455,6 @@ void VP8EncFreeBitWriters(VP8Encoder* const enc);
 
   // in frame.c
 extern const uint8_t VP8EncBands[16 + 1];
-extern const uint8_t VP8Cat3[];
-extern const uint8_t VP8Cat4[];
-extern const uint8_t VP8Cat5[];
-extern const uint8_t VP8Cat6[];
-
 // Form all the four Intra16x16 predictions in the yuv_p_ cache
 void VP8MakeLuma16Preds(const VP8EncIterator* const it);
 // Form all the four Chroma8x8 predictions in the yuv_p_ cache
@@ -505,9 +466,9 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it);
 int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd);
 int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]);
 int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd);
-// Main coding calls
+// Main stat / coding passes
 int VP8EncLoop(VP8Encoder* const enc);
-int VP8EncTokenLoop(VP8Encoder* const enc);
+int VP8StatLoop(VP8Encoder* const enc);
 
   // in webpenc.c
 // Assign an error code to a picture. Return false for convenience.
@@ -524,14 +485,12 @@ int VP8EncAnalyze(VP8Encoder* const enc);
 // Sets up segment's quantization values, base_quant_ and filter strengths.
 void VP8SetSegmentParams(VP8Encoder* const enc, float quality);
 // Pick best modes and fills the levels. Returns true if skipped.
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
-                VP8RDLevel rd_opt);
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt);
 
   // in alpha.c
 void VP8EncInitAlpha(VP8Encoder* const enc);    // initialize alpha compression
-int VP8EncStartAlpha(VP8Encoder* const enc);    // start alpha coding process
 int VP8EncFinishAlpha(VP8Encoder* const enc);   // finalize compressed data
-int VP8EncDeleteAlpha(VP8Encoder* const enc);   // delete compressed data
+void VP8EncDeleteAlpha(VP8Encoder* const enc);  // delete compressed data
 
   // in layer.c
 void VP8EncInitLayer(VP8Encoder* const enc);     // init everything
@@ -557,13 +516,9 @@ void VP8InitFilter(VP8EncIterator* const it);
 void VP8StoreFilterStats(VP8EncIterator* const it);
 void VP8AdjustFilterStrength(VP8EncIterator* const it);
 
-// returns the approximate filtering strength needed to smooth a edge
-// step of 'delta', given a sharpness parameter 'sharpness'.
-int VP8FilterStrengthFromDelta(int sharpness, int delta);
-
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/enc/vp8l.c b/drivers/webp/enc/vp8l.c
index 15726318e2..9c202f8d36 100644
--- a/drivers/webp/enc/vp8l.c
+++ b/drivers/webp/enc/vp8l.c
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // main entry for the lossless encoder.
@@ -25,6 +23,10 @@
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
 #define MAX_HUFF_IMAGE_SIZE       (16 * 1024 * 1024)
 #define MAX_COLORS_FOR_GRAPH      64
@@ -35,8 +37,7 @@
 static int CompareColors(const void* p1, const void* p2) {
   const uint32_t a = *(const uint32_t*)p1;
   const uint32_t b = *(const uint32_t*)p2;
-  assert(a != b);
-  return (a < b) ? -1 : 1;
+  return (a < b) ? -1 : (a > b) ? 1 : 0;
 }
 
 // If number of colors in the image is less than or equal to MAX_PALETTE_SIZE,
@@ -84,7 +85,7 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
     argb += pic->argb_stride;
   }
 
-  // TODO(skal): could we reuse in_use[] to speed up EncodePalette()?
+  // TODO(skal): could we reuse in_use[] to speed up ApplyPalette()?
   num_colors = 0;
   for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); ++i) {
     if (in_use[i]) {
@@ -164,6 +165,9 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
       }
       if (pred_entropy < 0.95 * non_pred_entropy) {
         enc->use_predict_ = 1;
+        // TODO(vikasa): Observed some correlation of cross_color transform with
+        // predict. Need to investigate this further and add separate heuristic
+        // for setting use_cross_color flag.
         enc->use_cross_color_ = 1;
       }
     }
@@ -216,7 +220,7 @@ static int GetHuffBitLengthsAndCodes(
   }
 
   // Create Huffman trees.
-  for (i = 0; ok && (i < histogram_image_size); ++i) {
+  for (i = 0; i < histogram_image_size; ++i) {
     HuffmanTreeCode* const codes = &huffman_codes[5 * i];
     VP8LHistogram* const histo = histogram_image->histograms[i];
     ok = ok && VP8LCreateHuffmanTree(histo->literal_, 15, codes + 0);
@@ -227,11 +231,7 @@ static int GetHuffBitLengthsAndCodes(
   }
 
  End:
-  if (!ok) {
-    free(mem_buf);
-    // If one VP8LCreateHuffmanTree() above fails, we need to clean up behind.
-    memset(huffman_codes, 0, 5 * histogram_image_size * sizeof(*huffman_codes));
-  }
+  if (!ok) free(mem_buf);
   return ok;
 }
 
@@ -406,10 +406,9 @@ static int StoreHuffmanCode(VP8LBitWriter* const bw,
 }
 
 static void WriteHuffmanCode(VP8LBitWriter* const bw,
-                             const HuffmanTreeCode* const code,
-                             int code_index) {
-  const int depth = code->code_lengths[code_index];
-  const int symbol = code->codes[code_index];
+                             const HuffmanTreeCode* const code, int index) {
+  const int depth = code->code_lengths[index];
+  const int symbol = code->codes[index];
   VP8LWriteBits(bw, depth, symbol);
 }
 
@@ -444,12 +443,12 @@ static void StoreImageToBitMask(
       int bits, n_bits;
       int code, distance;
 
-      VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
+      PrefixEncode(v->len, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes, 256 + code);
       VP8LWriteBits(bw, n_bits, bits);
 
       distance = PixOrCopyDistance(v);
-      VP8LPrefixEncode(distance, &code, &n_bits, &bits);
+      PrefixEncode(distance, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes + 4, code);
       VP8LWriteBits(bw, n_bits, bits);
     }
@@ -530,12 +529,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
                                 sizeof(*histogram_symbols));
   assert(histogram_bits >= MIN_HUFFMAN_BITS);
   assert(histogram_bits <= MAX_HUFFMAN_BITS);
-
-  if (histogram_image == NULL || histogram_symbols == NULL) {
-    free(histogram_image);
-    free(histogram_symbols);
-    return 0;
-  }
+  if (histogram_image == NULL || histogram_symbols == NULL) goto Error;
 
   // Calculate backward references from ARGB image.
   if (!VP8LGetBackwardReferences(width, height, argb, quality, cache_bits,
@@ -558,9 +552,6 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
       !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
     goto Error;
   }
-  // Free combined histograms.
-  free(histogram_image);
-  histogram_image = NULL;
 
   // Color Cache parameters.
   VP8LWriteBits(bw, 1, use_color_cache);
@@ -580,10 +571,10 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
       uint32_t i;
       if (histogram_argb == NULL) goto Error;
       for (i = 0; i < histogram_image_xysize; ++i) {
-        const int symbol_index = histogram_symbols[i] & 0xffff;
-        histogram_argb[i] = 0xff000000 | (symbol_index << 8);
-        if (symbol_index >= max_index) {
-          max_index = symbol_index + 1;
+        const int index = histogram_symbols[i] & 0xffff;
+        histogram_argb[i] = 0xff000000 | (index << 8);
+        if (index >= max_index) {
+          max_index = index + 1;
         }
       }
       histogram_image_size = max_index;
@@ -607,6 +598,9 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
       ClearHuffmanTreeIfOnlyOneSymbol(codes);
     }
   }
+  // Free combined histograms.
+  free(histogram_image);
+  histogram_image = NULL;
 
   // Store actual literals.
   StoreImageToBitMask(bw, width, histogram_bits, &refs,
@@ -614,7 +608,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
   ok = 1;
 
  Error:
-  free(histogram_image);
+  if (!ok) free(histogram_image);
 
   VP8LClearBackwardRefs(&refs);
   if (huffman_codes != NULL) {
@@ -695,7 +689,7 @@ static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
   const int ccolor_transform_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
   const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
-  const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
+  const int step = (quality == 0) ? 32 : 8;
 
   VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step,
                           enc->argb_, enc->transform_data_);
@@ -712,6 +706,13 @@ static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
 
 // -----------------------------------------------------------------------------
 
+static void PutLE32(uint8_t* const data, uint32_t val) {
+  data[0] = (val >>  0) & 0xff;
+  data[1] = (val >>  8) & 0xff;
+  data[2] = (val >> 16) & 0xff;
+  data[3] = (val >> 24) & 0xff;
+}
+
 static WebPEncodingError WriteRiffHeader(const WebPPicture* const pic,
                                          size_t riff_size, size_t vp8l_size) {
   uint8_t riff[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE] = {
@@ -806,94 +807,61 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
   return err;
 }
 
-static void ApplyPalette(uint32_t* src, uint32_t* dst,
-                         uint32_t src_stride, uint32_t dst_stride,
-                         const uint32_t* palette, int palette_size,
-                         int width, int height, int xbits, uint8_t* row) {
-  int i, x, y;
-  int use_LUT = 1;
-  for (i = 0; i < palette_size; ++i) {
-    if ((palette[i] & 0xffff00ffu) != 0) {
-      use_LUT = 0;
-      break;
-    }
-  }
+// Bundles multiple (2, 4 or 8) pixels into a single pixel.
+// Writes the packed pixels to 'bundled_argb', using 'xs' as its row stride.
+static void BundleColorMap(const WebPPicture* const pic,
+                           int xbits, uint32_t* bundled_argb, int xs) {
+  int y;
+  const int bit_depth = 1 << (3 - xbits);
+  uint32_t code = 0;
+  const uint32_t* argb = pic->argb;
+  const int width = pic->width;
+  const int height = pic->height;
 
-  if (use_LUT) {
-    uint8_t inv_palette[MAX_PALETTE_SIZE] = { 0 };
-    for (i = 0; i < palette_size; ++i) {
-      const int color = (palette[i] >> 8) & 0xff;
-      inv_palette[color] = i;
-    }
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const int color = (src[x] >> 8) & 0xff;
-        row[x] = inv_palette[color];
-      }
-      VP8LBundleColorMap(row, width, xbits, dst);
-      src += src_stride;
-      dst += dst_stride;
-    }
-  } else {
-    // Use 1 pixel cache for ARGB pixels.
-    uint32_t last_pix = palette[0];
-    int last_idx = 0;
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const uint32_t pix = src[x];
-        if (pix != last_pix) {
-          for (i = 0; i < palette_size; ++i) {
-            if (pix == palette[i]) {
-              last_idx = i;
-              last_pix = pix;
-              break;
-            }
-          }
-        }
-        row[x] = last_idx;
+  for (y = 0; y < height; ++y) {
+    int x;
+    for (x = 0; x < width; ++x) {
+      const int mask = (1 << xbits) - 1;
+      const int xsub = x & mask;
+      if (xsub == 0) {
+        code = 0;
       }
-      VP8LBundleColorMap(row, width, xbits, dst);
-      src += src_stride;
-      dst += dst_stride;
+      // TODO(vikasa): simplify the bundling logic.
+      code |= (argb[x] & 0xff00) << (bit_depth * xsub);
+      bundled_argb[y * xs + (x >> xbits)] = 0xff000000 | code;
     }
+    argb += pic->argb_stride;
   }
 }
 
 // Note: Expects "enc->palette_" to be set properly.
 // Also, "enc->palette_" will be modified after this call and should not be used
 // later.
-static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
-                                       VP8LEncoder* const enc, int quality) {
+static WebPEncodingError ApplyPalette(VP8LBitWriter* const bw,
+                                      VP8LEncoder* const enc, int quality) {
   WebPEncodingError err = VP8_ENC_OK;
-  int i;
+  int i, x, y;
   const WebPPicture* const pic = enc->pic_;
-  uint32_t* src = pic->argb;
-  uint32_t* dst;
+  uint32_t* argb = pic->argb;
   const int width = pic->width;
   const int height = pic->height;
   uint32_t* const palette = enc->palette_;
   const int palette_size = enc->palette_size_;
-  uint8_t* row = NULL;
-  int xbits;
 
   // Replace each input pixel by corresponding palette index.
-  // This is done line by line.
-  if (palette_size <= 4) {
-    xbits = (palette_size <= 2) ? 3 : 2;
-  } else {
-    xbits = (palette_size <= 16) ? 1 : 0;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      const uint32_t pix = argb[x];
+      for (i = 0; i < palette_size; ++i) {
+        if (pix == palette[i]) {
+          argb[x] = 0xff000000u | (i << 8);
+          break;
+        }
+      }
+    }
+    argb += pic->argb_stride;
   }
 
-  err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
-  if (err != VP8_ENC_OK) goto Error;
-  dst = enc->argb_;
-
-  row = (uint8_t*)WebPSafeMalloc((uint64_t)width, sizeof(*row));
-  if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
-
-  ApplyPalette(src, dst, pic->argb_stride, enc->current_width_,
-               palette, palette_size, width, height, xbits, row);
-
   // Save palette to bitstream.
   VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
   VP8LWriteBits(bw, 2, COLOR_INDEXING_TRANSFORM);
@@ -907,21 +875,36 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
     goto Error;
   }
 
+  if (palette_size <= 16) {
+    // Image can be packed (multiple pixels per uint32_t).
+    int xbits = 1;
+    if (palette_size <= 2) {
+      xbits = 3;
+    } else if (palette_size <= 4) {
+      xbits = 2;
+    }
+    err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
+    if (err != VP8_ENC_OK) goto Error;
+    BundleColorMap(pic, xbits, enc->argb_, enc->current_width_);
+  }
+
  Error:
-  free(row);
   return err;
 }
 
 // -----------------------------------------------------------------------------
 
-static int GetHistoBits(int method, int use_palette, int width, int height) {
-  const uint64_t hist_size = sizeof(VP8LHistogram);
+static int GetHistoBits(const WebPConfig* const config,
+                        const WebPPicture* const pic) {
+  const int width = pic->width;
+  const int height = pic->height;
+  const size_t hist_size = sizeof(VP8LHistogram);
   // Make tile size a function of encoding method (Range: 0 to 6).
-  int histo_bits = (use_palette ? 9 : 7) - method;
+  int histo_bits = 7 - config->method;
   while (1) {
-    const uint64_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
-                                     VP8LSubSampleSize(height, histo_bits) *
-                                     hist_size;
+    const size_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+                                   VP8LSubSampleSize(height, histo_bits) *
+                                   hist_size;
     if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
     ++histo_bits;
   }
@@ -929,14 +912,13 @@ static int GetHistoBits(int method, int use_palette, int width, int height) {
          (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
 }
 
-static void FinishEncParams(VP8LEncoder* const enc) {
+static void InitEncParams(VP8LEncoder* const enc) {
   const WebPConfig* const config = enc->config_;
-  const WebPPicture* const pic = enc->pic_;
+  const WebPPicture* const picture = enc->pic_;
   const int method = config->method;
   const float quality = config->quality;
-  const int use_palette = enc->use_palette_;
   enc->transform_bits_ = (method < 4) ? 5 : (method > 4) ? 3 : 4;
-  enc->histo_bits_ = GetHistoBits(method, use_palette, pic->width, pic->height);
+  enc->histo_bits_ = GetHistoBits(config, picture);
   enc->cache_bits_ = (quality <= 25.f) ? 0 : 7;
 }
 
@@ -952,9 +934,6 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
   }
   enc->config_ = config;
   enc->pic_ = picture;
-
-  VP8LDspInit();
-
   return enc;
 }
 
@@ -981,6 +960,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     goto Error;
   }
 
+  InitEncParams(enc);
+
   // ---------------------------------------------------------------------------
   // Analyze image (entropy, num_palettes etc)
 
@@ -989,10 +970,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     goto Error;
   }
 
-  FinishEncParams(enc);
-
   if (enc->use_palette_) {
-    err = EncodePalette(bw, enc, quality);
+    err = ApplyPalette(bw, enc, quality);
     if (err != VP8_ENC_OK) goto Error;
     // Color cache is disabled for palette.
     enc->cache_bits_ = 0;
@@ -1166,3 +1145,6 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
 //------------------------------------------------------------------------------
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/drivers/webp/enc/vp8li.h b/drivers/webp/enc/vp8li.h
index 96d6faed64..eae90dd61f 100644
--- a/drivers/webp/enc/vp8li.h
+++ b/drivers/webp/enc/vp8li.h
@@ -1,10 +1,8 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Lossless encoder: internal header.
@@ -19,7 +17,7 @@
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
@@ -63,7 +61,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
 
 //------------------------------------------------------------------------------
 
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
diff --git a/drivers/webp/enc/webpenc.c b/drivers/webp/enc/webpenc.c
index 207cce6beb..3c275589fc 100644
--- a/drivers/webp/enc/webpenc.c
+++ b/drivers/webp/enc/webpenc.c
@@ -1,10 +1,8 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // WebP encoder: main entry point
@@ -22,6 +20,10 @@
 
 // #define PRINT_MEMORY_INFO
 
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #ifdef PRINT_MEMORY_INFO
 #include <stdio.h>
 #endif
@@ -91,53 +93,34 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
   enc->nz_[-1] = 0;   // constant
 }
 
-// Mapping from config->method_ to coding tools used.
-//-------------------+---+---+---+---+---+---+---+
-//   Method          | 0 | 1 | 2 | 3 |(4)| 5 | 6 |
-//-------------------+---+---+---+---+---+---+---+
-// fast probe        | x |   |   | x |   |   |   |
-//-------------------+---+---+---+---+---+---+---+
-// dynamic proba     | ~ | x | x | x | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// fast mode analysis|   |   |   |   | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// basic rd-opt      |   |   |   | x | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// disto-score i4/16 |   |   | x |   |   |   |   |
-//-------------------+---+---+---+---+---+---+---+
-// rd-opt i4/16      |   |   | ~ | x | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// token buffer (opt)|   |   |   | x | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
-// Trellis           |   |   |   |   |   | x |Ful|
-//-------------------+---+---+---+---+---+---+---+
-// full-SNS          |   |   |   |   | x | x | x |
-//-------------------+---+---+---+---+---+---+---+
+// Map configured quality level to coding tools used.
+//-------------+---+---+---+---+---+---+
+//   Quality   | 0 | 1 | 2 | 3 | 4 | 5 |
+//-------------+---+---+---+---+---+---+
+// dynamic prob| ~ | x | x | x | x | x |
+//-------------+---+---+---+---+---+---+
+// rd-opt modes|   |   | x | x | x | x |
+//-------------+---+---+---+---+---+---+
+// fast i4/i16 | x | x |   |   |   |   |
+//-------------+---+---+---+---+---+---+
+// rd-opt i4/16|   |   | x | x | x | x |
+//-------------+---+---+---+---+---+---+
+// Trellis     |   | x |   |   | x | x |
+//-------------+---+---+---+---+---+---+
+// full-SNS    |   |   |   |   |   | x |
+//-------------+---+---+---+---+---+---+
 
 static void MapConfigToTools(VP8Encoder* const enc) {
-  const WebPConfig* const config = enc->config_;
-  const int method = config->method;
-  const int limit = 100 - config->partition_limit;
+  const int method = enc->config_->method;
+  const int limit = 100 - enc->config_->partition_limit;
   enc->method_ = method;
-  enc->rd_opt_level_ = (method >= 6) ? RD_OPT_TRELLIS_ALL
-                     : (method >= 5) ? RD_OPT_TRELLIS
-                     : (method >= 3) ? RD_OPT_BASIC
-                     : RD_OPT_NONE;
+  enc->rd_opt_level_ = (method >= 6) ? 3
+                     : (method >= 5) ? 2
+                     : (method >= 3) ? 1
+                     : 0;
   enc->max_i4_header_bits_ =
       256 * 16 * 16 *                 // upper bound: up to 16bit per 4x4 block
       (limit * limit) / (100 * 100);  // ... modulated with a quadratic curve.
-
-  enc->thread_level_ = config->thread_level;
-
-  enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);
-  if (!config->low_memory) {
-#if !defined(DISABLE_TOKEN_BUFFER)
-    enc->use_tokens_ = (enc->rd_opt_level_ >= RD_OPT_BASIC);  // need rd stats
-#endif
-    if (enc->use_tokens_) {
-      enc->num_parts_ = 1;   // doesn't work with multi-partition
-    }
-  }
 }
 
 // Memory scaling with dimensions:
@@ -153,7 +136,7 @@ static void MapConfigToTools(VP8Encoder* const enc) {
 //             non-zero: 196
 //             lf-stats: 2048
 //                total: 68635
-// Transient object sizes:
+// Transient object sizes:
 //       VP8EncIterator: 352
 //         VP8ModeScore: 912
 //       VP8SegmentInfo: 532
@@ -171,16 +154,20 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   const int preds_h = 4 * mb_h + 1;
   const size_t preds_size = preds_w * preds_h * sizeof(uint8_t);
   const int top_stride = mb_w * 16;
-  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + ALIGN_CST;
+  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t);
+  const size_t cache_size = (3 * YUV_SIZE + PRED_SIZE) * sizeof(uint8_t);
   const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo);
-  const size_t samples_size = 2 * top_stride * sizeof(uint8_t)  // top-luma/u/v
-                            + ALIGN_CST;                        // align all
+  const size_t samples_size = (2 * top_stride +         // top-luma/u/v
+                               16 + 16 + 16 + 8 + 1 +   // left y/u/v
+                               2 * ALIGN_CST)           // align all
+                               * sizeof(uint8_t);
   const size_t lf_stats_size =
       config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
   VP8Encoder* enc;
   uint8_t* mem;
   const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
                       + ALIGN_CST                      // cache alignment
+                      + cache_size                     // working caches
                       + info_size                      // modes info
                       + preds_size                     // prediction modes
                       + samples_size                   // top/left samples
@@ -191,15 +178,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   printf("===================================\n");
   printf("Memory used:\n"
          "             encoder: %ld\n"
+         "         block cache: %ld\n"
          "                info: %ld\n"
          "               preds: %ld\n"
          "         top samples: %ld\n"
          "            non-zero: %ld\n"
          "            lf-stats: %ld\n"
          "               total: %ld\n",
-         sizeof(VP8Encoder) + ALIGN_CST, info_size,
+         sizeof(VP8Encoder) + ALIGN_CST, cache_size, info_size,
          preds_size, samples_size, nz_size, lf_stats_size, size);
-  printf("Transient object sizes:\n"
+  printf("Transcient object sizes:\n"
          "      VP8EncIterator: %ld\n"
          "        VP8ModeScore: %ld\n"
          "      VP8SegmentInfo: %ld\n"
@@ -224,11 +212,19 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->mb_w_ = mb_w;
   enc->mb_h_ = mb_h;
   enc->preds_w_ = preds_w;
+  enc->yuv_in_ = (uint8_t*)mem;
+  mem += YUV_SIZE;
+  enc->yuv_out_ = (uint8_t*)mem;
+  mem += YUV_SIZE;
+  enc->yuv_out2_ = (uint8_t*)mem;
+  mem += YUV_SIZE;
+  enc->yuv_p_ = (uint8_t*)mem;
+  mem += PRED_SIZE;
   enc->mb_info_ = (VP8MBInfo*)mem;
   mem += info_size;
   enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;
   mem += preds_w * preds_h * sizeof(uint8_t);
-  enc->nz_ = 1 + (uint32_t*)DO_ALIGN(mem);
+  enc->nz_ = 1 + (uint32_t*)mem;
   mem += nz_size;
   enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL;
   mem += lf_stats_size;
@@ -238,7 +234,13 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->y_top_ = (uint8_t*)mem;
   enc->uv_top_ = enc->y_top_ + top_stride;
   mem += 2 * top_stride;
-  assert(mem <= (uint8_t*)enc + size);
+  mem = (uint8_t*)DO_ALIGN(mem + 1);
+  enc->y_left_ = (uint8_t*)mem;
+  mem += 16 + 16;
+  enc->u_left_ = (uint8_t*)mem;
+  mem += 16;
+  enc->v_left_ = (uint8_t*)mem;
+  mem += 8;
 
   enc->config_ = config;
   enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
@@ -257,27 +259,23 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   VP8EncInitLayer(enc);
 #endif
 
-  VP8TBufferInit(&enc->tokens_);
   return enc;
 }
 
-static int DeleteVP8Encoder(VP8Encoder* enc) {
-  int ok = 1;
+static void DeleteVP8Encoder(VP8Encoder* enc) {
   if (enc != NULL) {
-    ok = VP8EncDeleteAlpha(enc);
+    VP8EncDeleteAlpha(enc);
 #ifdef WEBP_EXPERIMENTAL_FEATURES
     VP8EncDeleteLayer(enc);
 #endif
-    VP8TBufferClear(&enc->tokens_);
     free(enc);
   }
-  return ok;
 }
 
 //------------------------------------------------------------------------------
 
 static double GetPSNR(uint64_t err, uint64_t size) {
-  return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
+  return err ? 10. * log10(255. * 255. * size / err) : 99.;
 }
 
 static void FinalizePSNR(const VP8Encoder* const enc) {
@@ -334,7 +332,7 @@ int WebPReportProgress(const WebPPicture* const pic,
 //------------------------------------------------------------------------------
 
 int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
-  int ok = 0;
+  int ok;
 
   if (pic == NULL)
     return 0;
@@ -353,48 +351,32 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   if (!config->lossless) {
     VP8Encoder* enc = NULL;
     if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
-      // Make sure we have YUVA samples.
-      float dithering = 0.f;
-      if (config->preprocessing & 2) {
-        const float x = config->quality / 100.f;
-        const float x2 = x * x;
-        // slowly decreasing from max dithering at low quality (q->0)
-        // to 0.5 dithering amplitude at high quality (q->100)
-        dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
-      }
-      if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
-        return 0;
+      if (pic->argb != NULL) {
+        if (!WebPPictureARGBToYUVA(pic, WEBP_YUV420)) return 0;
+      } else {
+        return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
       }
     }
 
     enc = InitVP8Encoder(config, pic);
     if (enc == NULL) return 0;  // pic->error is already set.
     // Note: each of the tasks below account for 20% in the progress report.
-    ok = VP8EncAnalyze(enc);
-
-    // Analysis is done, proceed to actual coding.
-    ok = ok && VP8EncStartAlpha(enc);   // possibly done in parallel
-    if (!enc->use_tokens_) {
-      ok = ok && VP8EncLoop(enc);
-    } else {
-      ok = ok && VP8EncTokenLoop(enc);
-    }
-    ok = ok && VP8EncFinishAlpha(enc);
+    ok = VP8EncAnalyze(enc)
+      && VP8StatLoop(enc)
+      && VP8EncLoop(enc)
+      && VP8EncFinishAlpha(enc)
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-    ok = ok && VP8EncFinishLayer(enc);
+      && VP8EncFinishLayer(enc)
 #endif
-
-    ok = ok && VP8EncWrite(enc);
+      && VP8EncWrite(enc);
     StoreStats(enc);
     if (!ok) {
       VP8EncFreeBitWriters(enc);
     }
-    ok &= DeleteVP8Encoder(enc);  // must always be called, even if !ok
+    DeleteVP8Encoder(enc);
   } else {
-    // Make sure we have ARGB samples.
-    if (pic->argb == NULL && !WebPPictureYUVAToARGB(pic)) {
-      return 0;
-    }
+    if (pic->argb == NULL)
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
 
     ok = VP8LEncodeImage(config, pic);  // Sets pic->error in case of problem.
   }
@@ -402,3 +384,6 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   return ok;
 }
 
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif